Name: Murtaza Khalid

Email: khal3470@mylaurier.ca

Project Title: Predicting the Possibility of Cardiovascular Disease in Patients Under 70 




# I - Main Project

## 1 - Import Dependencies and Dataset

Dependencies

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix,classification_report, roc_auc_score, RocCurveDisplay
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate
from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn import svm, metrics
from sklearn.model_selection import GridSearchCV

Dataset

In [None]:
orig_data = pd.read_csv("/content/heart.csv")
# making a mutable copy of the dataset
ds = orig_data.copy()

FileNotFoundError: ignored

## 2 - Visualizing Dataset

In [None]:
# testing if imported correctly - print first 5 rows
ds.head()

In [None]:
# check the number of rows/columns in ds
ds.shape

In [None]:
# check the types of the columns and if there is any null values
ds.info()

In [None]:
# null-value count
ds.isnull().sum()

In [None]:
# stat measures of ds
ds.describe()

In [None]:
HD = ds["HeartDisease"].value_counts()
print(HD)
keys = ["Heart Disease", "Normal"]
count = [HD[1], HD[0]]
plt.bar(keys, count, width=0.5)

In [None]:
# Male/Female HeartDisease distribution
def O1_to_YS(s):
  """Translate a 0/1 flag into the label "N"/"Y"; any other value gives None."""
  flag_labels = {0: "N", 1: "Y"}
  return flag_labels.get(s)

# Build the plotting frame as a derived copy that carries the extra Y/N label
# column. `ds` itself is never modified, so there is nothing to drop afterwards
# and a failure while plotting cannot leave a stray column behind in `ds`.
plot_ds = ds.assign(HeartDiseaseYN=ds['HeartDisease'].apply(O1_to_YS))
sns.histplot(data=plot_ds, x="HeartDiseaseYN", hue="Sex", multiple="dodge");

In [None]:
#Age distribution
sns.histplot(data=ds, x="Age", stat="count")

In [None]:
# Number of rows per chest-pain type (non-null counts of every column, grouped).
data = ds.groupby('ChestPainType').count()
# Take the wedge labels from the grouped index instead of a hand-typed list so
# each label is guaranteed to sit on the wedge it describes.
plt.pie(data["Age"], labels=data.index, autopct='%.1f%%')
plt.show()

In [None]:
sns.histplot(data=ds, x="ChestPainType", hue="HeartDisease", multiple="dodge")

In [None]:
sns.histplot(data=ds, x="Cholesterol", hue="HeartDisease", stat="count", multiple="stack")

## 3 - Pre-Processing

Turn strings -> integer form

In [None]:
# ChestPainType -> Int
def CPT_to_number(s):
  """Map a chest-pain code (TA, ATA, NAP, ASY) to 0-3; unknown codes give None."""
  chest_pain_codes = {"TA": 0, "ATA": 1, "NAP": 2, "ASY": 3}
  return chest_pain_codes.get(s)

# M (Male) or F (Female) -> Int
def MF_to_number(s):
  """Map "M" to 0 and "F" to 1; any other value gives None."""
  sex_codes = {"M": 0, "F": 1}
  return sex_codes.get(s)

# Y (Yes) or N (No) -> Int
def YN_to_number(s):
  """Map "N" to 0 and "Y" to 1; any other value gives None."""
  answer_codes = {"N": 0, "Y": 1}
  return answer_codes.get(s)

# Normal or ST or LVH -> Int
def NS_to_number(s):
  """Map a resting-ECG label (Normal, ST, LVH) to 0-2; unknown labels give None."""
  ecg_codes = {"Normal": 0, "ST": 1, "LVH": 2}
  return ecg_codes.get(s)

# Up or Flat or Down -> Int
def UFD_to_number(s):
  """Map an ST-slope label (Up, Flat, Down) to 0-2; unknown labels give None."""
  slope_codes = {"Up": 0, "Flat": 1, "Down": 2}
  return slope_codes.get(s)
  

In [None]:
# apply the string to integer functions
# Each string column is encoded into a new integer column named "<column>Pos".
# The dict order fixes the order in which the new columns are appended.
string_encoders = {
  'ChestPainType': CPT_to_number,
  'Sex': MF_to_number,
  'ExerciseAngina': YN_to_number,
  'RestingECG': NS_to_number,
  'ST_Slope': UFD_to_number,
}
for col_name, to_number in string_encoders.items():
  ds[col_name + 'Pos'] = ds[col_name].apply(to_number)

# Drop all string columns in one call now that their encoded versions exist
ds = ds.drop(columns = list(string_encoders))

In [None]:
plt.figure(figsize = (15,6))
colors = ['#ea230c','#ffe599', '#b2ff03']
sns.heatmap(ds.corr(),cmap = colors,annot = True);

In [None]:
corr = ds.corrwith(ds['HeartDisease']).sort_values(ascending = False).to_frame()
corr.columns = ['Correlations']
plt.subplots(figsize = (5,5))
sns.heatmap(corr,annot = True,cmap = colors,linewidths = 0.4,linecolor = 'black');
plt.title('Correlation with HeartDisease');

Remove low-correlation features identified during visualization

In [None]:
# RestingECGPos has 0.061 correlation to HeartDisease so it will be dropped
ds = ds.drop(columns = "RestingECGPos")

Split Features (X) and Label (y)

In [None]:
X = ds.drop(columns="HeartDisease")
y = ds["HeartDisease"]

In [None]:
X.head()

In [None]:
y.head()

## 4 - Modelling

In [None]:
models_used = []
test_acc_results = []
train_acc_results = []

Feature Scaling

In [None]:
# Standardize every feature column of X; the result replaces X as a NumPy array.
# NOTE(review): the scaler is fitted on the full X before any train/test split
# is made, so the rows later used for testing contribute to the scaling
# statistics - consider fitting the scaler on the training split only.
sc = StandardScaler()
X = sc.fit_transform(X)

Functions Used for Modelling

In [None]:
# Learning Curve
def lcurve(mod_class, X, y):
  """Plot mean training and cross-validation accuracy against training-set size.

  Parameters:
    mod_class: estimator handed to sklearn's learning_curve.
    X, y: features and labels; evaluated with 10-fold cross-validation at ten
      training-set fractions evenly spaced from 10% to 100%.

  Returns nothing; the figure is shown with plt.show().
  """
  sizes, fit_scores, cv_scores = learning_curve(
    mod_class, X, y,
    train_sizes=np.linspace(.1, 1.0, 10),
    cv=10,
    scoring='accuracy',
  )

  # Scores hold one row per training size and one column per fold, so the
  # row-wise mean is the average accuracy at each size.
  plt.plot(sizes, fit_scores.mean(axis=1), '--',  label="Training score")
  plt.plot(sizes, cv_scores.mean(axis=1),  label="Cross-validation score")

  # Titles, axis labels, legend and fixed tick marks from 0.75 up to 1.0
  plt.title("Learning Curve")
  plt.xlabel("Training Set Size")
  plt.ylabel("Accuracy Score")
  plt.legend(loc="best")
  plt.yticks(np.arange(0.75, 1.0, 0.025))
  plt.tight_layout()
  plt.show()

In [None]:
# Validation Curve
def vcurve(mod_class, X, y, param_name="C", param_range=None):
  """Plot mean training and cross-validation accuracy while sweeping one hyper-parameter.

  Parameters:
    mod_class: estimator handed to sklearn's validation_curve; it must expose
      the hyper-parameter named by param_name.
    X, y: features and labels, evaluated with 5-fold cross-validation.
    param_name: name of the hyper-parameter to sweep. Defaults to "C", which
      was previously the only supported choice.
    param_range: values to try. Defaults to 0.1 up to (but excluding) 10 in
      steps of 0.1, the range that used to be hard-coded.

  Returns nothing; the figure is shown with plt.show().
  """
  # Fall back to the historical sweep when the caller gives no range
  if param_range is None:
    param_range = np.arange(0.1,10,0.1)

  # Accuracy on the training and validation folds for every parameter value
  train_scores, test_scores = validation_curve(mod_class, X, y, param_name=param_name, param_range=param_range, cv=5, scoring="accuracy")

  # Average over the folds (one column per fold)
  train_mean = np.mean(train_scores, axis=1)
  test_mean = np.mean(test_scores, axis=1)

  # Plot validation curve lines (mean of training and test scores)
  plt.plot(param_range, train_mean, '--',label="Training score")
  plt.plot(param_range, test_mean, label="Cross-validation score")

  # Keep the original axis label for the default parameter, otherwise name
  # the parameter that was actually swept
  if param_name == "C":
    x_label = "Value of regularization term"
  else:
    x_label = "Value of " + str(param_name)

  # Add title and labels and show the plot
  plt.title("Validation Curve")
  plt.ylim([0.75, 1.0])
  plt.xlabel(x_label)
  plt.ylabel("Accuracy Score")
  plt.tight_layout()
  plt.show()

In [None]:
# Cross-Validation
def cv_measure(mod_class, X, y, k=10): # default k = 10
  """Run k-fold cross-validation with accuracy scoring and print the results.

  Parameters:
    mod_class: estimator to evaluate. It is NOT fitted here: cross_validate
      fits its own clones, so the caller's fitted state is left untouched
      (the earlier version refitted the estimator on all of X, y first).
    X, y: features and labels to cross-validate on.
    k: value passed to cross_validate as cv (default 10).

  Returns nothing; the per-fold scores and their mean (+/- two standard
  deviations) are printed.
  """
  # Report the fold setting actually used instead of a fixed "10"
  print("K={} Fold Cross Validation:".format(k))
  # Perform cross-validation with cv=k and "accuracy" as performance measure
  cv_results = cross_validate(mod_class, X, y, cv=k, scoring ='accuracy')

  # Store results
  cv_scores = cv_results['test_score']

  # Print cross-validation results
  print("Cross-validation score for each of the folds: ", [float('{:.3f}'.format(x)) for x in cv_scores])
  print("Mean cross-validation score (or cross-validation score): %0.3f (+/- %0.3f)" % (cv_scores.mean(), cv_scores.std() * 2))



In [None]:
def model_measure(mod_class):
  """Fit a classifier on the current split and print its evaluation report.

  Reads the notebook-level variables X_train, X_test, y_train, y_test (the
  active split) and X, y (the full data, used for cross-validation).

  Prints log loss, cross-validation scores and accuracy, and draws the
  test-set confusion matrix as a heatmap.

  Parameters:
    mod_class: estimator to fit; it is fitted in place on X_train, y_train.

  Returns:
    (train_accuracy, test_accuracy) as a tuple.
  """
  # Train(fit) model
  mod_class.fit(X_train,y_train)
  y_train_predict = mod_class.predict(X_train)
  y_test_predict = mod_class.predict(X_test)

  # Log loss is defined on predicted probabilities, so use them whenever the
  # estimator provides them; hard labels are only a fallback for estimators
  # without predict_proba. This is computed before cv_measure runs so it
  # always reflects the fit on the training split.
  if hasattr(mod_class, "predict_proba"):
    train_scores = mod_class.predict_proba(X_train)
    test_scores = mod_class.predict_proba(X_test)
  else:
    train_scores, test_scores = y_train_predict, y_test_predict

  # Accuracy is needed for both the printout and the return value
  train_accuracy = accuracy_score(y_train,y_train_predict)
  test_accuracy = accuracy_score(y_test,y_test_predict)

  print("The performance of the model:")
  print("（づ￣3￣）づ╭❤️～(⓿_⓿)")
  print("The Log Loss of the model:")
  print('Log loss of the model for training set: %.3f' % log_loss(y_train,train_scores,labels=mod_class.classes_))
  print('Log loss of the model for test set: %.3f' % log_loss(y_test,test_scores,labels=mod_class.classes_))
  print("--------------------------------------")

  cv_measure(mod_class, X, y)
  print("--------------------------------------")

  print("Accuracy:")
  # Share of correct predictions on each part of the split
  print('Accuracy of the model for training set: %.3f' % train_accuracy)
  print('Accuracy of the model for test set: %.3f' % test_accuracy)

  print("--------------------------------------")

  print("Confusion Matrix:")

  # Confusion matrix of the test-set predictions
  sns.heatmap(confusion_matrix(y_test, y_test_predict),annot=True, fmt = 'g')

  return train_accuracy, test_accuracy

### A - Logistic Regression

In [None]:
#split training and test data: 80/20: 80% for training and 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 89)

In [None]:
# Build a logistic regression object
LogReg = LogisticRegression(solver = 'newton-cg')

In [None]:
models_used.append('Logistic Regression')

In [None]:
train_acc, test_acc = model_measure(LogReg)

In [None]:
train_acc_results.append(round(train_acc, 3))
test_acc_results.append(round(test_acc, 3))

In [None]:
# Learning Curve
lcurve(LogReg, X_train, y_train)
  
# Validation Curve 
vcurve(LogReg, X_train, y_train)

### B - KNN Model

In [None]:
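# Trial-and-error search over n_neighbors = 1..99, keeping the value with the best test-set accuracy - result 18 (kept below as a disabled string block)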
"""def model_measure_trials(mod_class):
  # Train(fit) model
  mod_class.fit(X_train,y_train)
  #print(mod_class.kneighbors)
  y_train_predict = mod_class.predict(X_train)
  y_test_predict = mod_class.predict(X_test)
  
  return accuracy_score(y_test,y_test_predict)

best = 0
j = 1
num = 1
while j != 100:
  knn_test = KNeighborsClassifier(n_neighbors = j)
  n = model_measure_trials(knn_test)
  if n > best:
    best = n
    num = j
  j += 1
print(best)
print(num)"""

In [None]:
knn = KNeighborsClassifier(n_neighbors = 18)

In [None]:
models_used.append('KNN')

Training and testing KNN Model using data split

In [None]:
#split training and test data: 85/15: 85% for training and 15% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = 89)

In [None]:
train_acc, test_acc = model_measure(knn)

In [None]:
train_acc_results.append(round(train_acc, 3))
test_acc_results.append(round(test_acc, 3))

In [None]:
# Learning Curve
lcurve(knn, X_train, y_train)

### C - SVM Model

In [None]:
#split training and test data: 80/20: 80% for training and 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 89)

In [None]:
for kernel in ['linear', 'poly', 'rbf']:
  classifier = svm.SVC(gamma=0.001, kernel=kernel)
  classifier.fit(X_train, y_train)
  y_train_predict = classifier.predict(X_train)
  y_test_predict = classifier.predict(X_test)
  print('For kerenl {} the f1 score is: {}'.format(kernel, metrics.f1_score(y_test, y_test_predict, average='micro')))

  print('Accuracy of the model for training set: %.3f' % accuracy_score(y_train_predict, y_train))
  print('Accuracy of the model for test set: %.3f' % accuracy_score(y_test, y_test_predict))

Hyper-Parameter Tuning for SVM

In [None]:
# Hyper-parameters || poly is left out because its accuracy was significantly lower than the other kernels
param_grid = {'C': [0.1, 1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['linear', 'rbf']}

In [None]:
grid = GridSearchCV(estimator=svm.SVC(), param_grid=param_grid)

In [None]:
grid.fit(X_train, y_train)

In [None]:
print(grid.best_estimator_)
print(grid.score(X_test, y_test))

Optimal SVM model

In [None]:
classifier = svm.SVC(gamma=0.001, kernel='linear', C=10)

In [None]:
models_used.append('SVM')

In [None]:
# Fit the chosen SVM on the training split and predict both parts of the split
classifier.fit(X_train, y_train)
y_train_predict = classifier.predict(X_train)
y_test_predict = classifier.predict(X_test)

# Report the kernel of the classifier that was actually fitted. The loose
# variable `kernel` is a leftover from the earlier kernel-comparison loop and
# still holds that loop's last value, not this model's kernel.
print('For kernel {} the f1 score is: {}'.format(classifier.kernel, metrics.f1_score(y_test, y_test_predict, average='micro')))
print('Accuracy of the model for training set: %.3f' % accuracy_score(y_train, y_train_predict))
print('Accuracy of the model for test set: %.3f' % accuracy_score(y_test, y_test_predict))

In [None]:
train_acc_results.append(round(accuracy_score(y_train_predict, y_train), 3))
test_acc_results.append(round(accuracy_score(y_test, y_test_predict),3))

In [None]:
# Learning Curve
lcurve(classifier, X_train, y_train)

### D - Random Forest Model

In [None]:
#split training and test data: 80/20: 80% for training and 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 89)

In [None]:
# RF model with all 10 features
rf = RandomForestClassifier(max_features=10, n_estimators=100)

In [None]:
rf.fit(X_train, y_train)

In [None]:
y_train_predict = rf.predict(X_train)
y_test_predict = rf.predict(X_test)

print('Accuracy of the model for training set: %.3f' % accuracy_score(y_train, y_train_predict))
print('Accuracy of the model for test set: %.3f' % accuracy_score(y_test, y_test_predict))

Hyper-Parameter Tuning for Random Forest

In [None]:
max_features_range = np.arange(1,11,1)
n_estimators_range = np.arange(10,210,10)
param_grid = {'max_features': max_features_range, 'n_estimators': n_estimators_range}

grid = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=5)

In [None]:
grid.fit(X_train, y_train)

In [None]:
print(grid.best_estimator_)
print(grid.score(X_test, y_test))

Optimal Random Forest model

In [None]:
rf = RandomForestClassifier(max_features=1, n_estimators=170)

In [None]:
models_used.append('Random Forest')

In [None]:
rf.fit(X_train, y_train)

y_train_predict = rf.predict(X_train)
y_test_predict = rf.predict(X_test)

print('Accuracy of the model for training set: %.3f' % accuracy_score(y_train, y_train_predict))
print('Accuracy of the model for test set: %.3f' % accuracy_score(y_test, y_test_predict))

In [None]:
train_acc_results.append(round(accuracy_score(y_train_predict, y_train), 3))
test_acc_results.append(round(accuracy_score(y_test, y_test_predict), 3))

In [None]:
# Learning Curve
lcurve(rf, X_train, y_train)

### E - Model Accuracy Graph

In [None]:
mods = np.array(models_used)
train = np.array(train_acc_results)
test = np.array(test_acc_results)
accu_ds = pd.DataFrame({'Classifiers': mods, 'Train Accuracy': train, 'Test Accuracy': test}, columns=['Classifiers', 'Train Accuracy', 'Test Accuracy'])

In [None]:
accu_ds

In [None]:
accx = accu_ds['Classifiers']
accy = accu_ds['Train Accuracy']

# One coloured bar per classifier
plt.bar(accx, accy, color=['red', 'green', 'blue', 'cyan'])

# Write each accuracy value at the top of its bar
for bar_pos, bar_height in enumerate(accy):
  plt.text(bar_pos, bar_height, bar_height, ha="center")

plt.title('Model Training Accuracy')
plt.xlabel('Model')
plt.ylabel('Training Accuracy')


In [None]:
accx = accu_ds['Classifiers']
accy = accu_ds['Test Accuracy']
# One coloured bar per classifier
plt.bar(accx, accy, color=['red', 'green', 'blue', 'cyan'])
# Write each accuracy value at the top of its bar
for bar_pos, bar_height in enumerate(accy):
  plt.text(bar_pos, bar_height, bar_height, ha="center")

plt.title('Model Testing Accuracy')
plt.xlabel('Model')
plt.ylabel('Testing Accuracy')

# II - Using Object Only Features to Predict the Label

In [None]:
orig_data = pd.read_csv("/content/heart.csv")
# making a mutable copy of the dataset
ds2 = orig_data.copy()

In [None]:
models_used = []
test_acc_results = []
train_acc_results = []

In [None]:
# Drop all integer/float based features
ds2 = ds2.drop(columns = ["Age", "RestingBP", "Cholesterol", "FastingBS", "MaxHR", "Oldpeak"])
# NOTE(review): RestingECG is NOT dropped here, although Part I removes its
# encoded column for low correlation - confirm whether it should be dropped too
ds2.head()

In [None]:
X = ds2.drop(columns="HeartDisease")
y = ds2["HeartDisease"]

Convert the string features to numeric form with one-hot encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder
# Learn the categories of every column and encode X in a single step
encoder = OneHotEncoder()
X = encoder.fit_transform(X)

## Logistic Regression

In [None]:
#split training and test data: 80/20: 80% for training and 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 89)

In [None]:
# Build a logistic regression object
LogReg = LogisticRegression(solver = 'newton-cg')

In [None]:
models_used.append('Logistic Regression')

In [None]:
train_acc, test_acc = model_measure(LogReg)

In [None]:
train_acc_results.append(round(train_acc, 3))
test_acc_results.append(round(test_acc, 3))

In [None]:
# Learning Curve
lcurve(LogReg, X_train, y_train)
  
# Validation Curve 
vcurve(LogReg, X_train, y_train)

## KNN Model

In [None]:
#split training and test data: 85/15: 85% for training and 15% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = 89)

In [None]:
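# Trial-and-error search over n_neighbors = 1..99, keeping the value with the best test-set accuracy - result 17 (kept below as a disabled string block)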
"""def model_measure_trials(mod_class):
  # Train(fit) model
  mod_class.fit(X_train,y_train)
  #print(mod_class.kneighbors)
  y_train_predict = mod_class.predict(X_train)
  y_test_predict = mod_class.predict(X_test)
  
  return accuracy_score(y_test,y_test_predict)

best = 0
j = 1
num = 1
while j != 100:
  knn_test = KNeighborsClassifier(n_neighbors = j)
  n = model_measure_trials(knn_test)
  if n > best:
    best = n
    num = j
  j += 1
print(best)
print(num)"""

In [None]:
knn = KNeighborsClassifier(n_neighbors = 17)

In [None]:
models_used.append('KNN')

In [None]:
train_acc, test_acc = model_measure(knn)

In [None]:
train_acc_results.append(round(train_acc, 3))
test_acc_results.append(round(test_acc, 3))

## SVM Model

In [None]:
#split training and test data: 80/20: 80% for training and 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 89)

In [None]:
for kernel in ['linear', 'poly', 'rbf']:
  classifier = svm.SVC(gamma=0.001, kernel=kernel)
  classifier.fit(X_train, y_train)
  y_train_predict = classifier.predict(X_train)
  y_test_predict = classifier.predict(X_test)
  print('For kerenl {} the f1 score is: {}'.format(kernel, metrics.f1_score(y_test, y_test_predict, average='micro')))

  print('Accuracy of the model for training set: %.3f' % accuracy_score(y_train_predict, y_train))
  print('Accuracy of the model for test set: %.3f' % accuracy_score(y_test, y_test_predict))

In [None]:
# Hyper-parameters || poly is left out because its accuracy was significantly lower than the other kernels
param_grid = {'C': [0.1, 1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['linear', 'rbf']}

In [None]:
grid = GridSearchCV(estimator=svm.SVC(), param_grid=param_grid)

In [None]:
grid.fit(X_train, y_train)

In [None]:
print(grid.best_estimator_)
print(grid.score(X_test, y_test))

In [None]:
classifier = svm.SVC(gamma=0.001, kernel='linear', C=10)

In [None]:
models_used.append('SVM')

In [None]:
# Fit the chosen SVM on the training split and predict both parts of the split
classifier.fit(X_train, y_train)
y_train_predict = classifier.predict(X_train)
y_test_predict = classifier.predict(X_test)

# Report the kernel of the classifier that was actually fitted. The loose
# variable `kernel` is a leftover from the earlier kernel-comparison loop and
# still holds that loop's last value, not this model's kernel.
print('For kernel {} the f1 score is: {}'.format(classifier.kernel, metrics.f1_score(y_test, y_test_predict, average='micro')))
print('Accuracy of the model for training set: %.3f' % accuracy_score(y_train, y_train_predict))
print('Accuracy of the model for test set: %.3f' % accuracy_score(y_test, y_test_predict))

In [None]:
train_acc_results.append(round(accuracy_score(y_train_predict, y_train), 3))
test_acc_results.append(round(accuracy_score(y_test, y_test_predict),3))

## Random Forest Model

In [None]:
#split training and test data: 80/20: 80% for training and 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 89)

In [None]:
# RF baseline that considers up to 10 features at each split
rf = RandomForestClassifier(max_features=10, n_estimators=100)

In [None]:
rf.fit(X_train, y_train)

In [None]:
y_train_predict = rf.predict(X_train)
y_test_predict = rf.predict(X_test)

print('Accuracy of the model for training set: %.3f' % accuracy_score(y_train, y_train_predict))
print('Accuracy of the model for test set: %.3f' % accuracy_score(y_test, y_test_predict))

In [None]:
max_features_range = np.arange(1,11,1)
n_estimators_range = np.arange(10,210,10)
param_grid = {'max_features': max_features_range, 'n_estimators': n_estimators_range}

grid = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=5)

In [None]:
grid.fit(X_train, y_train)

In [None]:
print(grid.best_estimator_)
print(grid.score(X_test, y_test))

In [None]:
rf = RandomForestClassifier(max_features=8, n_estimators=120)

In [None]:
models_used.append('Random Forest')

In [None]:
rf.fit(X_train, y_train)

y_train_predict = rf.predict(X_train)
y_test_predict = rf.predict(X_test)

print('Accuracy of the model for training set: %.3f' % accuracy_score(y_train, y_train_predict))
print('Accuracy of the model for test set: %.3f' % accuracy_score(y_test, y_test_predict))



In [None]:
train_acc_results.append(round(accuracy_score(y_train_predict, y_train), 3))
test_acc_results.append(round(accuracy_score(y_test, y_test_predict), 3))

## Model Accuracy Graph

In [None]:
mods = np.array(models_used)
train = np.array(train_acc_results)
test = np.array(test_acc_results)
accu_ds = pd.DataFrame({'Classifiers': mods, 'Train Accuracy': train, 'Test Accuracy': test}, columns=['Classifiers', 'Train Accuracy', 'Test Accuracy'])

In [None]:
accu_ds

In [None]:
accx = accu_ds['Classifiers']
accy = accu_ds['Train Accuracy']

# One coloured bar per classifier
plt.bar(accx, accy, color=['red', 'green', 'blue', 'cyan'])

# Write each accuracy value at the top of its bar
for bar_pos, bar_height in enumerate(accy):
  plt.text(bar_pos, bar_height, bar_height, ha="center")

plt.title('Model Training Accuracy')
plt.xlabel('Model')
plt.ylabel('Training Accuracy')


In [None]:
accx = accu_ds['Classifiers']
accy = accu_ds['Test Accuracy']
# One coloured bar per classifier
plt.bar(accx, accy, color=['red', 'green', 'blue', 'cyan'])
# Write each accuracy value at the top of its bar
for bar_pos, bar_height in enumerate(accy):
  plt.text(bar_pos, bar_height, bar_height, ha="center")

plt.title('Model Testing Accuracy')
plt.xlabel('Model')
plt.ylabel('Testing Accuracy')

# III - Using Numeric Only Features to Predict the Label

In [None]:
orig_data = pd.read_csv("/content/heart.csv")
# making a mutable copy of the dataset
ds3 = orig_data.copy()

In [None]:
models_used = []
test_acc_results = []
train_acc_results = []

In [None]:
# Drop all string columns
# Remove every string-valued column in one call, leaving only numeric features and the label
string_columns = ["ExerciseAngina", "Sex", "ChestPainType", "RestingECG", "ST_Slope"]
ds3 = ds3.drop(columns = string_columns)

In [None]:
X = ds3.drop(columns="HeartDisease")
y = ds3["HeartDisease"]

In [None]:
# Standardize every feature column of X; the result replaces X as a NumPy array.
# NOTE(review): the scaler is fitted on the full X before any train/test split
# is made, so the rows later used for testing contribute to the scaling
# statistics - consider fitting the scaler on the training split only.
sc = StandardScaler()
X = sc.fit_transform(X)

## Logistic Regression

In [None]:
#split training and test data: 80/20: 80% for training and 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 89)

In [None]:
# Build a logistic regression object
LogReg = LogisticRegression(solver = 'newton-cg')

In [None]:
models_used.append('Logistic Regression')

In [None]:
train_acc, test_acc = model_measure(LogReg)

In [None]:
train_acc_results.append(round(train_acc, 3))
test_acc_results.append(round(test_acc, 3))

In [None]:
# Learning Curve
lcurve(LogReg, X_train, y_train)
  
# Validation Curve 
vcurve(LogReg, X_train, y_train)

## KNN Model

In [None]:
#split training and test data: 85/15: 85% for training and 15% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = 89)

In [None]:
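# Trial-and-error search over n_neighbors = 1..99, keeping the value with the best test-set accuracy - result 18 (kept below as a disabled string block)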
"""def model_measure_trials(mod_class):
  # Train(fit) model
  mod_class.fit(X_train,y_train)
  #print(mod_class.kneighbors)
  y_train_predict = mod_class.predict(X_train)
  y_test_predict = mod_class.predict(X_test)
  
  return accuracy_score(y_test,y_test_predict)

best = 0
j = 1
num = 1
while j != 100:
  knn_test = KNeighborsClassifier(n_neighbors = j)
  n = model_measure_trials(knn_test)
  if n > best:
    best = n
    num = j
  j += 1
print(best)
print(num)"""

In [None]:
knn = KNeighborsClassifier(n_neighbors = 18)

In [None]:
models_used.append('KNN')

In [None]:
train_acc, test_acc = model_measure(knn)

In [None]:
train_acc_results.append(round(train_acc, 3))
test_acc_results.append(round(test_acc, 3))

In [None]:
# Learning Curve
lcurve(knn, X_train, y_train)

## SVM Model

In [None]:
#split training and test data: 80/20: 80% for training and 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 89)

In [None]:
for kernel in ['linear', 'poly', 'rbf']:
  classifier = svm.SVC(gamma=0.001, kernel=kernel)
  classifier.fit(X_train, y_train)
  y_train_predict = classifier.predict(X_train)
  y_test_predict = classifier.predict(X_test)
  print('For kerenl {} the f1 score is: {}'.format(kernel, metrics.f1_score(y_test, y_test_predict, average='micro')))

  print('Accuracy of the model for training set: %.3f' % accuracy_score(y_train_predict, y_train))
  print('Accuracy of the model for test set: %.3f' % accuracy_score(y_test, y_test_predict))

In [None]:
# Hyper-parameters || poly is left out because its accuracy was significantly lower than the other kernels
param_grid = {'C': [0.1, 1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['linear', 'rbf']}

In [None]:
grid = GridSearchCV(estimator=svm.SVC(), param_grid=param_grid)

In [None]:
grid.fit(X_train, y_train)

In [None]:
print(grid.best_estimator_)
print(grid.score(X_test, y_test))

In [None]:
classifier = svm.SVC(gamma=0.001, kernel='linear', C=10)

In [None]:
models_used.append('SVM')

In [None]:
# Fit the chosen SVM on the training split and predict both parts of the split
classifier.fit(X_train, y_train)
y_train_predict = classifier.predict(X_train)
y_test_predict = classifier.predict(X_test)

# Report the kernel of the classifier that was actually fitted. The loose
# variable `kernel` is a leftover from the earlier kernel-comparison loop and
# still holds that loop's last value, not this model's kernel.
print('For kernel {} the f1 score is: {}'.format(classifier.kernel, metrics.f1_score(y_test, y_test_predict, average='micro')))
print('Accuracy of the model for training set: %.3f' % accuracy_score(y_train, y_train_predict))
print('Accuracy of the model for test set: %.3f' % accuracy_score(y_test, y_test_predict))

In [None]:
train_acc_results.append(round(accuracy_score(y_train_predict, y_train), 3))
test_acc_results.append(round(accuracy_score(y_test, y_test_predict),3))

## Random Forest Model

In [None]:
#split training and test data: 80/20: 80% for training and 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 89)

In [None]:
# RF baseline that considers up to 5 features at each split
rf = RandomForestClassifier(max_features=5, n_estimators=100)

In [None]:
rf.fit(X_train, y_train)

In [None]:
y_train_predict = rf.predict(X_train)
y_test_predict = rf.predict(X_test)

print('Accuracy of the model for training set: %.3f' % accuracy_score(y_train, y_train_predict))
print('Accuracy of the model for test set: %.3f' % accuracy_score(y_test, y_test_predict))

In [None]:
# Try every max_features value from 1 up to the number of feature columns in
# the training data. The earlier fixed 1..10 range went past the six numeric
# features kept in this part, so the upper candidates were not meaningful.
max_features_range = np.arange(1, X_train.shape[1] + 1)
# Forest sizes from 10 to 200 trees in steps of 10
n_estimators_range = np.arange(10,210,10)
param_grid = {'max_features': max_features_range, 'n_estimators': n_estimators_range}

# Exhaustive search over the grid with 5-fold cross-validation
grid = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=5)

In [None]:
grid.fit(X_train, y_train)

In [None]:
print(grid.best_estimator_)
print(grid.score(X_test, y_test))

In [None]:
rf = RandomForestClassifier(max_features=1, n_estimators=130)

In [None]:
models_used.append('Random Forest')

In [None]:
rf.fit(X_train, y_train)

y_train_predict = rf.predict(X_train)
y_test_predict = rf.predict(X_test)

print('Accuracy of the model for training set: %.3f' % accuracy_score(y_train, y_train_predict))
print('Accuracy of the model for test set: %.3f' % accuracy_score(y_test, y_test_predict))



In [None]:
train_acc_results.append(round(accuracy_score(y_train_predict, y_train), 3))
test_acc_results.append(round(accuracy_score(y_test, y_test_predict), 3))

## Model Accuracy Graph

In [None]:
mods = np.array(models_used)
train = np.array(train_acc_results)
test = np.array(test_acc_results)
accu_ds = pd.DataFrame({'Classifiers': mods, 'Train Accuracy': train, 'Test Accuracy': test}, columns=['Classifiers', 'Train Accuracy', 'Test Accuracy'])

In [None]:
accu_ds

In [None]:
accx = accu_ds['Classifiers']
accy = accu_ds['Train Accuracy']

# One coloured bar per classifier
plt.bar(accx, accy, color=['red', 'green', 'blue', 'cyan'])

# Write each accuracy value at the top of its bar
for bar_pos, bar_height in enumerate(accy):
  plt.text(bar_pos, bar_height, bar_height, ha="center")

plt.title('Model Training Accuracy')
plt.xlabel('Model')
plt.ylabel('Training Accuracy')


In [None]:
accx = accu_ds['Classifiers']
accy = accu_ds['Test Accuracy']
# One coloured bar per classifier
plt.bar(accx, accy, color=['red', 'green', 'blue', 'cyan'])
# Write each accuracy value at the top of its bar
for bar_pos, bar_height in enumerate(accy):
  plt.text(bar_pos, bar_height, bar_height, ha="center")

plt.title('Model Testing Accuracy')
plt.xlabel('Model')
plt.ylabel('Testing Accuracy')