In [71]:
from scipy.io import arff
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer

In [72]:
# loading the ARFF data to a Pandas DataFrame
data = arff.loadarff('/home/jovyan/data/Dataset.arff')
heart_data = pd.DataFrame(data[0])

In [73]:
# print first 5 rows of the dataset
heart_data.head()

Unnamed: 0,sbp,tobacco,ldl,adiposity,famhist,type,obesity,alcohol,age,chd
0,160.0,12.0,5.73,23.11,1.0,49.0,25.3,97.2,52.0,2.0
1,144.0,0.01,4.41,28.61,2.0,55.0,28.87,2.06,63.0,2.0
2,118.0,0.08,3.48,32.28,1.0,52.0,29.14,3.81,46.0,1.0
3,170.0,7.5,6.41,38.03,1.0,51.0,31.99,24.26,58.0,2.0
4,134.0,13.6,3.5,27.78,1.0,60.0,25.99,57.34,49.0,2.0


In [74]:
# print last 5 rows of the dataset
heart_data.tail()

Unnamed: 0,sbp,tobacco,ldl,adiposity,famhist,type,obesity,alcohol,age,chd
457,214.0,0.4,5.98,31.72,2.0,64.0,28.45,0.0,58.0,1.0
458,182.0,4.2,4.41,32.1,2.0,52.0,28.61,18.72,52.0,2.0
459,108.0,3.0,1.59,15.23,2.0,40.0,20.09,26.64,55.0,1.0
460,118.0,5.4,11.61,30.79,2.0,64.0,27.35,23.97,40.0,1.0
461,132.0,0.0,4.82,33.41,1.0,62.0,14.7,0.0,46.0,2.0


In [75]:
# number of rows and columns in the dataset
heart_data.shape

(462, 10)

In [76]:
# getting some info about the data
heart_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 462 entries, 0 to 461
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   sbp        462 non-null    float64
 1   tobacco    462 non-null    float64
 2   ldl        462 non-null    float64
 3   adiposity  462 non-null    float64
 4   famhist    462 non-null    float64
 5   type       462 non-null    float64
 6   obesity    462 non-null    float64
 7   alcohol    462 non-null    float64
 8   age        462 non-null    float64
 9   chd        462 non-null    float64
dtypes: float64(10)
memory usage: 36.2 KB


In [77]:
# checking for missing values
heart_data.isnull().sum()

sbp          0
tobacco      0
ldl          0
adiposity    0
famhist      0
type         0
obesity      0
alcohol      0
age          0
chd          0
dtype: int64

In [78]:
# statistical measures about the data
heart_data.describe()

Unnamed: 0,sbp,tobacco,ldl,adiposity,famhist,type,obesity,alcohol,age,chd
count,462.0,462.0,462.0,462.0,462.0,462.0,462.0,462.0,462.0,462.0
mean,138.32684,3.635649,4.740325,25.406732,1.584416,53.103896,26.044113,17.044394,42.816017,1.34632
std,20.496317,4.593024,2.070909,7.780699,0.493357,9.817534,4.21368,24.481059,14.608956,0.476313
min,101.0,0.0,0.98,6.74,1.0,13.0,14.7,0.0,15.0,1.0
25%,124.0,0.0525,3.2825,19.775,1.0,47.0,22.985,0.51,31.0,1.0
50%,134.0,2.0,4.34,26.115,2.0,53.0,25.805,7.51,45.0,1.0
75%,148.0,5.5,5.79,31.2275,2.0,60.0,28.4975,23.8925,55.0,2.0
max,218.0,31.2,15.33,42.49,2.0,78.0,46.58,147.19,64.0,2.0


In [79]:
# checking the distribution of Target Variable
heart_data['chd'].value_counts()

1.0    302
2.0    160
Name: chd, dtype: int64

In [80]:
X = heart_data.drop(columns='chd', axis=1)
Y = heart_data['chd']

In [81]:
print(X)

       sbp  tobacco    ldl  adiposity  famhist  type  obesity  alcohol   age
0    160.0    12.00   5.73      23.11      1.0  49.0    25.30    97.20  52.0
1    144.0     0.01   4.41      28.61      2.0  55.0    28.87     2.06  63.0
2    118.0     0.08   3.48      32.28      1.0  52.0    29.14     3.81  46.0
3    170.0     7.50   6.41      38.03      1.0  51.0    31.99    24.26  58.0
4    134.0    13.60   3.50      27.78      1.0  60.0    25.99    57.34  49.0
..     ...      ...    ...        ...      ...   ...      ...      ...   ...
457  214.0     0.40   5.98      31.72      2.0  64.0    28.45     0.00  58.0
458  182.0     4.20   4.41      32.10      2.0  52.0    28.61    18.72  52.0
459  108.0     3.00   1.59      15.23      2.0  40.0    20.09    26.64  55.0
460  118.0     5.40  11.61      30.79      2.0  64.0    27.35    23.97  40.0
461  132.0     0.00   4.82      33.41      1.0  62.0    14.70     0.00  46.0

[462 rows x 9 columns]


In [82]:
print(Y)

0      2.0
1      2.0
2      1.0
3      2.0
4      2.0
      ... 
457    1.0
458    2.0
459    1.0
460    1.0
461    2.0
Name: chd, Length: 462, dtype: float64


In [83]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.9, stratify=Y, random_state=2)

In [84]:
print(X.shape, X_train.shape, X_test.shape)

(462, 9) (46, 9) (416, 9)


In [85]:
model = LogisticRegression()

In [86]:
# training the LogisticRegression model with Training data
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [87]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [88]:
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.8043478260869565


In [89]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [90]:
print('Accuracy on Test data : ', test_data_accuracy)

Accuracy on Test data :  0.6875
