# In [8]:

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore")

# Load data
data = pd.read_csv('CausesOfDeath_France_2001-2008.csv')

# Preprocessing
# The "Value" column is read as text: some entries are ":" (presumably a
# marker for unavailable values - confirm against the data source) and
# the numbers may contain space characters.
# Map ":" to "0" so every entry can be parsed as an integer; the resulting
# zero rows are removed a few lines below.
# A single .loc assignment is used instead of chained indexing
# (data.Value[mask] = ...), which may write to a temporary copy and leave
# `data` unchanged.
data.loc[data["Value"] == ":", "Value"] = "0"
# Remove the space characters and convert the strings to integers.
data["Value"] = data["Value"].str.replace(" ", "", regex=False).astype(int)
# Drop every row whose value is zero (this includes the former ":" rows).
data = data.loc[data["Value"] != 0]

# Delete the columns that are not used further.
data = data.drop(columns=["GEO", "UNIT", "AGE", "Flag and Footnotes"])

# One-hot encode the remaining non-numeric columns; drop_first=True omits
# the first level of each encoded column.
data = pd.get_dummies(data=data, drop_first=True)
# Move the SEX_Males indicator to the end of the frame under the name "SEX",
# so that the label is the last column. `jensiat` keeps a reference to it.
jensiat = data.pop("SEX_Males")
data["SEX"] = jensiat

# Models to compare.
# The tree and the forest are given a fixed random_state so that repeated
# runs produce the same scores; KNeighborsClassifier takes no seed.
classifiers = [
    KNeighborsClassifier(n_neighbors=3),
    DecisionTreeClassifier(random_state=0),
    RandomForestClassifier(random_state=0),
]

# Train/test split by row position: the first 889 rows form the training
# set (2001 to 2007 according to the original note) and the remaining rows
# form the test set (2008).
# NOTE(review): this assumes the rows are ordered by year and that row 889
# is the first 2008 row after preprocessing - confirm against the CSV.
split_at = 889
# The last column is the label; every other column is a feature.
features = data.iloc[:, :-1]
labels = data.iloc[:, -1]
x_train, x_test = features.iloc[:split_at], features.iloc[split_at:]
y_train, y_test = labels.iloc[:split_at], labels.iloc[split_at:]

# Fit every model on the training rows and print its accuracy on the
# test rows, one separator line before each result.
separator = '---------------------------------------'
for clf in classifiers:
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    # Share of test rows whose predicted label equals the true label.
    score = accuracy_score(y_test, y_pred)
    print(separator)
    print(f"{clf.__class__.__name__} test Accuracy Rate is: {score:f}")



# Output recorded for the cell above:
# ---------------------------------------
# KNeighborsClassifier test Accuracy Rate is: 0.598425
# ---------------------------------------
# DecisionTreeClassifier test Accuracy Rate is: 0.818898
# ---------------------------------------
# RandomForestClassifier test Accuracy Rate is: 0.905512
