In [1]:
# visualization
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
%matplotlib inline

# machine learning
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_absolute_error

In [2]:
# Silence every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides any warnings that
# scikit-learn or pandas raise in the cells below.
import warnings
warnings.simplefilter('ignore')

In [4]:
# Location of the challenge training CSV (file name: train_data_imputed.csv).
dataset_url = (
    'https://raw.githubusercontent.com/KHSDTC/Hackathon_Autumn2020_Challenge/'
    'master/challenge_dataset/train_data_imputed.csv'
)

# index_col=0: the first CSV column becomes the DataFrame index rather than a feature.
dataset = pd.read_csv(dataset_url, index_col=0)

In [5]:
# Separate the predictor columns from the label column.
target = ['mortstat']
features = [column for column in dataset.columns if column not in target]

X = dataset[features]
# Select the label as a 1-D Series rather than a one-column DataFrame:
# scikit-learn estimators expect y with shape (n_samples,) and raise a
# DataConversionWarning when they are handed a column vector instead.
y = dataset[target[0]]

# Splitting data: hold out 20% of the rows for validation.
# The fixed random_state makes the split identical on every run.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=42)

In [7]:
# Logistic Regression (default settings): train on the training split,
# then measure accuracy on the validation split.

logreg = LogisticRegression().fit(X_train, y_train)  # fit() returns the estimator itself
Y_pred = logreg.predict(X_val)

acc_log = accuracy_score(y_true=y_val, y_pred=Y_pred)
acc_log

0.7358490566037735

In [8]:
# Support Vector Machines (SVC with default settings)

svc = SVC()
# Fit on the training split and predict the validation split in one chain;
# fit() returns the fitted estimator, so `svc` itself ends up fitted.
Y_pred = svc.fit(X_train, y_train).predict(X_val)
acc_svc = accuracy_score(y_true=y_val, y_pred=Y_pred)
acc_svc

0.6855345911949685

In [9]:
# Nearest Neighbors: each prediction is based on the 3 closest training samples

knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
Y_pred = knn.predict(X_val)

# Share of validation rows whose predicted label matches the true label
acc_knn = accuracy_score(y_true=y_val, y_pred=Y_pred)
acc_knn

0.5974842767295597

In [20]:
# Gaussian Naive Bayes (default settings)

gaussian = GaussianNB()
# Train on the training split, then predict the held-out validation rows
Y_pred = gaussian.fit(X_train, y_train).predict(X_val)
acc_gaussian = accuracy_score(y_true=y_val, y_pred=Y_pred)
acc_gaussian

0.6352201257861635

In [19]:
# Perceptron (default settings)

perceptron = Perceptron().fit(X_train, y_train)  # fit() returns the estimator itself
Y_pred = perceptron.predict(X_val)

# Validation accuracy of the fitted perceptron
acc_perceptron = accuracy_score(y_true=y_val, y_pred=Y_pred)
acc_perceptron

0.7044025157232704

In [18]:
# Linear SVC
# random_state is pinned (same seed as the train/validation split) so that
# any data shuffling done by the solver is repeatable and the reported
# accuracy is the same on every "Restart & Run All".

linear_svc = LinearSVC(random_state=42)
linear_svc.fit(X_train, y_train)
Y_pred = linear_svc.predict(X_val)

# Validation accuracy of the fitted model
acc_linear_svc = accuracy_score(y_val, Y_pred)
acc_linear_svc

0.5974842767295597

In [23]:
# Stochastic Gradient Descent
# SGD shuffles the training data between epochs; without a fixed seed the
# fitted model, and therefore the accuracy below, changes from run to run.
# The seed matches the one used for the train/validation split.

sgd = SGDClassifier(random_state=42)
sgd.fit(X_train, y_train)
Y_pred = sgd.predict(X_val)

# Validation accuracy of the fitted model
acc_sgd = accuracy_score(y_val, Y_pred)
acc_sgd

0.5345911949685535

In [24]:
# Decision Tree
# The tree builder permutes features when searching for splits, so ties can be
# broken differently on each run; a fixed seed (same as the train/validation
# split) keeps the fitted tree and its accuracy reproducible.

decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, y_train)
Y_pred = decision_tree.predict(X_val)

# Validation accuracy of the fitted tree
acc_decision_tree = accuracy_score(y_val, Y_pred)
acc_decision_tree

0.6666666666666666

In [25]:
# Random Forest (50 trees)
# The forest bootstraps rows and samples features at random; a fixed seed
# (same as the train/validation split) makes the reported accuracy reproducible.

random_forest = RandomForestClassifier(n_estimators=50, random_state=42)
random_forest.fit(X_train, y_train)
Y_pred = random_forest.predict(X_val)

# The training-set score that used to be computed here was neither stored nor
# displayed (only a cell's last expression is shown), so it has been dropped.
acc_random_forest = accuracy_score(y_val, Y_pred)
acc_random_forest

0.7169811320754716

In [28]:
# Summary table: validation accuracy of every model trained above, best first.
# Each label sits next to its own score so the two cannot drift out of
# alignment, which is easy to do with two separate parallel lists.
model_scores = [
    ('Support Vector Machines', acc_svc),
    ('KNN', acc_knn),
    ('Logistic Regression', acc_log),
    ('Random Forest', acc_random_forest),
    ('Naive Bayes', acc_gaussian),
    ('Perceptron', acc_perceptron),
    ('Stochastic Gradient Descent', acc_sgd),  # label spelling fixed ("Decent" -> "Descent")
    ('Linear SVC', acc_linear_svc),
    ('Decision Tree', acc_decision_tree),
]
models = pd.DataFrame(model_scores, columns=['Model', 'Accuracy'])

# Display only: sort_values returns a new frame, `models` keeps insertion order.
models.sort_values(by='Accuracy', ascending=False)

Unnamed: 0,Model,Accuracy
2,Logistic Regression,0.735849
3,Random Forest,0.716981
5,Perceptron,0.704403
0,Support Vector Machines,0.685535
8,Decision Tree,0.666667
4,Naive Bayes,0.63522
1,KNN,0.597484
7,Linear SVC,0.597484
6,Stochastic Gradient Decent,0.534591
