# REFINING LOGISTIC REGRESSION

In [1]:
import pandas as pd

In [2]:
import numpy as np

In [26]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

In [61]:
data = pd.read_csv("pima-indians-diabetes.data.csv", names= ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'])

In [62]:
data.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [63]:
# Mark zero values as missing or NaN
data[['preg', 'plas', 'pres', 'skin', 'test']] = data[['preg', 'plas', 'pres', 'skin', 'test']].replace(0, np.NaN)
# Count the number of NaN values in each column
print(data.isnull().sum())

preg     111
plas       5
pres      35
skin     227
test     374
mass       0
pedi       0
age        0
class      0
dtype: int64


In [64]:
# Fill missing values with mean column values
data.fillna(data.mean(), inplace=True)
# Count the number of NaN values in each column
print(data.isnull().sum())

preg     0
plas     0
pres     0
skin     0
test     0
mass     0
pedi     0
age      0
class    0
dtype: int64


In [67]:
data.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6.0,148.0,72.0,35.0,155.548223,33.6,0.627,50,1
1,1.0,85.0,66.0,29.0,155.548223,26.6,0.351,31,0
2,8.0,183.0,64.0,29.15342,155.548223,23.3,0.672,32,1
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,4.494673,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [74]:
# split values into inpits and outputs
values = data.values
X = values[:,0:8]
y = values[:,8]

In [77]:
# Initiate the LR model with random hyperparameters
lr = LogisticRegression(penalty='l1',dual=False,max_iter=110)

In [81]:
lr.fit(X,y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=110,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [83]:
lr.score(X,y)

0.7786458333333334

In [89]:
# You will need the following dependencies for applying Cross-validation and evaluating the cross-validated score

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [92]:
# Build the k-fold cross-validator
kfold = KFold(n_splits=3, random_state=7)

In [93]:
result = cross_val_score(lr, X, y, cv=kfold, scoring="accuracy")



In [94]:
print(result.mean())

0.7708333333333334


In [95]:
from sklearn.model_selection import GridSearchCV

In [111]:
dual = [True, False]
max_iter = [100, 110, 120, 130, 140, 150]

In [112]:
param_grid = dict(dual=dual, max_iter=max_iter)

In [113]:
import time

In [114]:
lr = LogisticRegression(penalty='l2')

In [119]:
grid = GridSearchCV(estimator=lr, param_grid=param_grid, cv=3, n_jobs=-1)
start_time = time.time()
grid_result = grid.fit(X,y)
#summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
print(time.time() - start_time)

Best: 0.755208 using {'dual': False, 'max_iter': 100}
0.29881954193115234




In [126]:
dual=[True,False]
max_iter=[100,110,120,130,140]
C = [1.0,1.5,2.0,2.5]
tol = [1e-10, 1e-4, 1e-3, 1e-2, 1e-1]
param_grid = dict(dual=dual,max_iter=max_iter,C=C, tol=tol)

In [127]:
lr = LogisticRegression(penalty='l2')
grid = GridSearchCV(estimator=lr, param_grid=param_grid, cv = 3, n_jobs=-1)

start_time = time.time()
grid_result = grid.fit(X, y)
# Summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
print("Execution time: " + str((time.time() - start_time)) + ' ms')

Best: 0.772135 using {'C': 2.5, 'dual': False, 'max_iter': 100, 'tol': 1e-10}
Execution time: 2.270602226257324 ms




In [122]:
from sklearn.model_selection import RandomizedSearchCV

In [123]:
random = RandomizedSearchCV(estimator=lr, param_distributions=param_grid, cv = 3, n_jobs=-1)

start_time = time.time()
random_result = random.fit(X, y)
# Summarize results
print("Best: %f using %s" % (random_result.best_score_, random_result.best_params_))
print("Execution time: " + str((time.time() - start_time)) + ' ms')

Best: 0.772135 using {'max_iter': 100, 'dual': False, 'C': 2.5}
Execution time: 0.20087480545043945 ms




# REFINING DECISION TREE ALGORITHM

In [2]:
# In the case of a random forest, hyperparameters include the number of decision trees in the forest and
# the number of features considered by each tree when splitting a node.
# For hyperparameter tuning, we perform many iterations of the entire K-Fold CV process, 
# each time using different model settings. We then compare all of the models, select the best one, 
# train it on the full training set, and then evaluate on the testing set.

In [3]:
# Import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier

# Set seed to 1 for reproducibility
SEED = 1

# Instantiate a DecisionTreeClassifier 'dt'
dt = DecisionTreeClassifier(random_state=SEED)

# Print out 'dt's hyperparameters
print(dt.get_params())

{'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'presort': False, 'random_state': 1, 'splitter': 'best'}


In [4]:
# Import GridSearchCV
from sklearn.model_selection import GridSearchCV

# Define the grid of hyperparameters 'params_dt'
params_dt = {
'max_depth': [3, 4,5, 6],
'min_samples_leaf': [0.04, 0.06, 0.08],
'max_features': [0.2, 0.4,0.6, 0.8]
}

# Instantiate a 10-fold CV grid search object 'grid_dt'
grid_dt = GridSearchCV(estimator=dt,
param_grid=params_dt,
scoring=
'accuracy'
,
cv=10,
n_jobs=-1)
# Fit 'grid_dt' to the training data
grid_dt.fit(X_train, y_train)

NameError: name 'X_train' is not defined