# Models Work

In [1]:
! rm -r project_chd/

In [2]:
! git clone https://github.com/JulianKrese/project_chd/

Cloning into 'project_chd'...
remote: Enumerating objects: 79, done.[K
remote: Counting objects: 100% (50/50), done.[K
remote: Compressing objects: 100% (48/48), done.[K
remote: Total 79 (delta 26), reused 2 (delta 2), pack-reused 29[K
Receiving objects: 100% (79/79), 2.05 MiB | 6.49 MiB/s, done.
Resolving deltas: 100% (35/35), done.


In [3]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeClassifier

In [4]:
# getting training sets
X_train = pd.read_csv("/content/project_chd/X_train_cleaned.csv")
y_train = pd.read_csv("/content/project_chd/y_train_cleaned.csv")

# getting test sets
X_test = pd.read_csv("/content/project_chd/X_test_cleaned.csv")
y_test = pd.read_csv("/content/project_chd/y_test_cleaned.csv")

In [5]:
# keep track of all R-squared values found
R_squared_vals = {}

In [6]:
# Create kNN model - k of 1 gives the highest R-squared
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train.values.ravel()) # .values.ravel() suppresses data conversion warning - converts column vector to 1d array
y_pred = knn.predict(X_test)

r2 = r2_score(y_test, y_pred)
print("kNN R^2:", abs(r2))
R_squared_vals["kNN"] = abs(r2)

kNN R^2: 0.8478334119549835


In [7]:
# Create decision tree - depth of 24 gives the highest R-squared
dt = DecisionTreeClassifier(max_depth=24, random_state=42)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)

r2 = r2_score(y_test, y_pred)
print("Decision tree R^2:", abs(r2))
R_squared_vals["Decision tree"] = abs(r2)

Decision tree R^2: 0.8742310321257691


In [8]:
# trying linear regression with all variables
stock_reg = LinearRegression().fit(X_train, y_train)
r2 = abs(stock_reg.score(X_test, y_test))
print("R^2 (all variables) = ", r2)
R_squared_vals["Linear regression"] = r2

# trying linear regression with min-max normalizing
scaler = MinMaxScaler()
X_train_norm = scaler.fit_transform(X_train)
X_test_norm = scaler.transform(X_test)
norm_reg = LinearRegression().fit(X_train_norm, y_train)
r2 = abs(norm_reg.score(X_test_norm, y_test))
print("R^2 (min-max normalized): ", r2)
R_squared_vals["Max-min normalized"] = r2

R^2 (all variables) =  0.09069686607234828
R^2 (min-max normalized):  0.09069686607234895


In [9]:
# trying linear regression with different correlated variables (finds col w/ highest R-squared)
corr_matrix = X_train.corr()
best_r2 = 0
best_col = ""
for col in corr_matrix:
  high_corr_vars = corr_matrix.index[abs(corr_matrix[col]) > 0.5]
  X_train_corr = X_train[high_corr_vars]
  X_test_corr = X_test[high_corr_vars]
  corr_reg = LinearRegression().fit(X_train_corr, y_train)
  r2 = abs(corr_reg.score(X_test_corr, y_test))
  if r2 > best_r2:
    best_r2 = r2
    best_col = col
print(f"R^2 (correlated vars on {best_col}): {best_r2}")
R_squared_vals[f"Correlated vars on {best_col}"] = best_r2

R^2 (correlated vars on prevalentHyp): 0.06374168477694897


In [10]:
# trying linear regression with polynomial expanders (tests multiple degrees to find best R-squared)
best_r2 = 0
best_degree = 0
for d in range(4):
  poly = PolynomialFeatures(degree=d)
  X_train_poly = poly.fit_transform(X_train)
  X_test_poly = poly.transform(X_test)
  poly_reg = LinearRegression().fit(X_train_poly, y_train)
  r2 = abs(poly_reg.score(X_test_poly, y_test))
  if (r2 > best_r2) and (r2 <= 1):
    best_r2 = r2
    best_degree = d
print(f"R^2 (polynomial features, degree={best_degree}): ", best_r2)
R_squared_vals["Polynomial expanders"] = best_r2

R^2 (polynomial features, degree=1):  0.09069686607234839


In [11]:
# print all R-squared vals found
print(R_squared_vals)

{'kNN': 0.8478334119549835, 'Decision tree': 0.8742310321257691, 'Linear regression': 0.09069686607234828, 'Max-min normalized': 0.09069686607234895, 'Correlated vars on prevalentHyp': 0.06374168477694897, 'Polynomial expanders': 0.09069686607234839}


In [12]:
# find max
max(R_squared_vals.items(), key=lambda k: k[1])

('Decision tree', 0.8742310321257691)

It appears that the most effective form of predictive algorithm is the Decision Tree Classifier, with the R^2 value coming out to be 0.87 after training at a max depth of 24.