## Importing the libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

## Importing the dataset

In [2]:
dataset = pd.read_csv('logreg_train_test.csv')

In [3]:
dataset

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.10,85.0,85.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3806,0,52,2.0,1,20.0,0.0,0,0,0,410.0,105.0,67.5,27.33,75.0,90.0,0
3807,0,43,4.0,0,0.0,0.0,0,0,0,223.0,100.0,70.0,22.73,63.0,68.0,0
3808,1,65,1.0,1,6.0,0.0,0,1,0,238.0,146.0,86.0,29.47,75.0,66.0,0
3809,1,50,3.0,0,0.0,0.0,0,1,1,232.0,148.5,94.0,25.78,80.0,88.0,1


## Handling the missing values

In [4]:
dataset.isna().any().any()

True

In [5]:
print(dataset.isnull().sum())

male                 0
age                  0
education           96
currentSmoker        0
cigsPerDay          26
BPMeds              48
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             46
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            357
TenYearCHD           0
dtype: int64


In [6]:
strategies = {'education': 'mean',
              'cigsPerDay': 'mean',
              'BPMeds': 'most_frequent',
              'totChol': 'mean',
              'BMI': 'mean',
              'heartRate': 'mean',
              'glucose': 'mean'}


imputers = {}
for col, strat in strategies.items():
    if strat == 'constant':
        imputers[col] = SimpleImputer(strategy=strat, fill_value=0)
    else:
        imputers[col] = SimpleImputer(strategy=strat)
        

                
for col, imputer in imputers.items():
    dataset[[col]] = imputer.fit_transform(dataset[[col]])

  mode = stats.mode(array)


In [7]:
dataset.isna().any().any()

False

In [8]:
print(dataset.isnull().sum())

male               0
age                0
education          0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
TenYearCHD         0
dtype: int64


## Splitting the dataset into the Training set and Test set

In [9]:
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [10]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=1)

In [11]:
len(x)

3811

In [12]:
len(x_train)

3048

In [13]:
len(x_test)

763

## Training the Logistic Regression model on the Training set

In [14]:
classifier = LogisticRegression(random_state = 0)
classifier.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(random_state=0)

## Model evaluation

In [15]:
y_pred = classifier.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.86


## Predicting results with validation set

In [16]:
valset = pd.read_csv('logreg_val.csv')

In [17]:
valset

Unnamed: 0,id,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose
0,1,1,35,2.0,1,20.0,0.0,0,0,0,223.0,128.0,82.0,19.99,80,67.0
1,2,0,61,1.0,1,9.0,0.0,0,0,0,252.0,119.0,77.0,23.20,65,65.0
2,3,0,44,3.0,0,0.0,0.0,0,0,0,257.0,129.0,93.0,27.56,75,76.0
3,4,1,41,4.0,1,30.0,0.0,0,0,0,289.0,109.0,74.0,25.80,70,86.0
4,5,1,60,1.0,0,0.0,0.0,0,0,0,266.0,115.5,82.5,23.68,82,83.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
422,423,1,50,1.0,1,1.0,0.0,0,1,0,313.0,179.0,92.0,25.97,66,86.0
423,424,1,51,3.0,1,43.0,0.0,0,0,0,207.0,126.5,80.0,19.71,65,68.0
424,425,0,48,2.0,1,20.0,,0,0,0,248.0,131.0,72.0,22.00,84,86.0
425,426,0,44,1.0,1,15.0,0.0,0,0,0,210.0,126.5,87.0,19.16,86,


In [18]:
print(valset.isnull().sum())

id                  0
male                0
age                 0
education           9
currentSmoker       0
cigsPerDay          3
BPMeds              5
prevalentStroke     0
prevalentHyp        0
diabetes            0
totChol             4
sysBP               0
diaBP               0
BMI                 0
heartRate           0
glucose            31
dtype: int64


In [19]:
strategies = {'education': 'mean',
              'cigsPerDay': 'mean',
              'BPMeds': 'most_frequent',
              'totChol': 'mean',
              'BMI': 'mean',
              'heartRate': 'mean',
              'glucose': 'mean'}

imputers = {}
for col, strat in strategies.items():
    if strat == 'constant':
        imputers[col] = SimpleImputer(strategy=strat, fill_value=0)
    else:
        imputers[col] = SimpleImputer(strategy=strat)

        
for col, imputer in imputers.items():
    valset[[col]] = imputer.fit_transform(valset[[col]])

  mode = stats.mode(array)


In [20]:
print(valset.isnull().sum())

id                 0
male               0
age                0
education          0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
dtype: int64


In [21]:
x_val = valset.iloc[:, 1:].values

In [22]:
y_vpred = classifier.predict(x_val)
y_vpred=y_vpred.reshape(len(x_val),1)
print(y_vpred)

[[0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]


In [23]:
np.savetxt("Jackson-Iterators-exercise1.csv", y_vpred, delimiter=",")

In [24]:
pd.DataFrame(y_vpred).to_csv("Jackson-Iterators-exercise1.csv")