In [121]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import pickle
import flask
import os

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [62]:
train_df = pd.read_csv('train_longevity.csv')
test_df = pd.read_csv('test_longevity.csv')
combine = [train_df, test_df]

In [11]:
# Вывод названий столбцов в обучающем датасете
print("train = ", train_df.columns.values)
print("test = ", test_df.columns.values)

train =  ['Id' 'Longevity' 'Education' 'Sex' 'Age' 'Pet' 'Children' 'Region'
 'Activity' 'MedExam' 'Sport']
test =  ['Id' 'Education' 'Sex' 'Age' 'Pet' 'Children' 'Region' 'Activity'
 'MedExam' 'Sport' 'Longevity']


In [12]:
# Удалим из таблиц MedExam, Region, Id
train_df = train_df.drop(columns=['Id', 'Region', 'MedExam'])
test_df = test_df.drop(columns=['Region', 'MedExam', 'Longevity'])

In [13]:
# Вывод названий столбцов в обучающем датасете
print("train = ", train_df.columns.values)
print("test = ", test_df.columns.values)

train =  ['Longevity' 'Education' 'Sex' 'Age' 'Pet' 'Children' 'Activity' 'Sport']
test =  ['Id' 'Education' 'Sex' 'Age' 'Pet' 'Children' 'Activity' 'Sport']


In [14]:
# Выведем Age 
print (train_df['Age'].value_counts(dropna=False))
print ("--------------------------------------")
print (test_df['Age'].value_counts(dropna=False))

NaN       177
73        162
74        154
72         93
75         90
76         63
71         37
77         30
70         24
78         19
79          6
80          1
#ЗНАЧ!      1
Name: Age, dtype: int64
--------------------------------------
73        117
NaN        93
74         66
75         46
72         41
76         36
77         17
71         13
78         13
70          6
#ЗНАЧ!      3
80          1
Name: Age, dtype: int64


In [15]:
# Удалим некорректные значения ('#ЗНАЧ!')
idmax = train_df['Age'].value_counts().mean()

train_df['Age'].fillna(idmax, inplace=True)
test_df['Age'].fillna(idmax, inplace=True)

train_df['Age'] = train_df['Age'].map(lambda v: idmax if (v == '#ЗНАЧ!') else v).astype(int)
test_df['Age'] = train_df['Age'].map(lambda v: idmax if (v == '#ЗНАЧ!') else v).astype(int)


print(train_df['Age'])
print('----------------------------')
print(test_df['Age'])

0      73
1      75
2      73
3      74
4      74
       ..
852    71
853    72
854    76
855    72
856    76
Name: Age, Length: 857, dtype: int32
----------------------------
0      73
1      75
2      73
3      74
4      74
       ..
447    74
448    71
449    77
450    75
451    56
Name: Age, Length: 452, dtype: int32


In [16]:
# Провека данных Sport
print (train_df['Sport'].value_counts(dropna=False))

NaN    621
+      161
-       75
Name: Sport, dtype: int64


In [17]:
def check(v):
    if v == '+':
        return 2;
    else:
        return 1;
    
    
#  train_df['Sport'].fillna(1, inplace=True)
# test_df['Sport'].fillna(1, inplace=True)

train_df['Sport'] = train_df['Sport'].map(lambda v: check(v)).astype(int)
test_df['Sport'] = test_df['Sport'].map(lambda v: check(v)).astype(int)

In [18]:
print (train_df['Sport'].value_counts(dropna=False)) # Дополнили 
print (test_df['Sport'].value_counts(dropna=False)) # Дополнили

1    696
2    161
Name: Sport, dtype: int64
1    343
2    109
Name: Sport, dtype: int64


In [19]:
# Дополнили в Sport 
idmax = train_df['Sport'].value_counts().idxmax()
train_df['Sport'] = train_df['Sport'].map(lambda v: idmax if (v != 1 and v != 2) else v).astype(int)
test_df['Sport'] = test_df['Sport'].map(lambda v: idmax if (v != 1 and v != 2) else v).astype(int)

In [20]:
train_df['Sex'] = train_df['Sex'].map(lambda v: 0 if v == 'male' else 1).astype(int)
test_df['Sex'] = test_df['Sex'].map(lambda v:  0 if v == 'male' else 1).astype(int)

print (train_df['Sex'].value_counts(dropna=False)) # Дополнили 
print (test_df['Sex'].value_counts(dropna=False)) # Дополнили

1    557
0    300
Name: Sex, dtype: int64
1    286
0    166
Name: Sex, dtype: int64


In [21]:
 def activity_convert(act):
        if 0 < act < 1000:
            return 0;
        elif 5500 > act >= 1000:
            return 1;
        else:
            return 2;
        
train_df['Activity'] = train_df['Activity'].map(lambda v: activity_convert(v)).astype(int)
test_df['Activity'] = test_df['Activity'].map(lambda v: activity_convert(v)).astype(int)

In [22]:
def superFunctionYOU_ALWAYS_WELCOME(age):
    if age <= 70:
        return 0;
    elif 70 < age <= 72:
        return 1;
    elif 72 < age <= 74:
        return 2;
    elif 74 < age <= 76:
        return 3;
    elif 76 < age <= 78:
        return 4;
    elif 78 < age <= 80:
        return 5;
    elif 80 < age:
        return 6;
    
criteria = [(superFunctionYOU_ALWAYS_WELCOME(test_df.at[i, 'Age'])) for i in range (test_df['Age'].count())]
test_df['Age'] = pd.Series(criteria)
criteria = [(superFunctionYOU_ALWAYS_WELCOME(train_df.at[i, 'Age'])) for i in range (train_df['Age'].count())]
train_df['Age'] = pd.Series(criteria)

In [23]:
# Создадим новые колонки Family 
family = [(test_df.at[i, 'Children'] + test_df.at[i, 'Pet']) for i in range(test_df['Pet'].count())]
# test_df['Family'] = family
isalone = [(0 if family[i] == 0 else 1) for i in range(len(family))]
test_df['IsAlone'] = isalone
status = [(test_df.at[i, 'Education']*test_df.at[i, 'Age']) for i in range(test_df['Age'].count())]
test_df['Age*Education'] = pd.Series(status)

In [24]:
print (test_df['Age'].value_counts(dropna=False))
print('-----------------------------------')
print (test_df['Age*Education'].value_counts(dropna=False))
print('-----------------------------------')
print (test_df['IsAlone'].value_counts(dropna=False))
# print('-----------------------------------')
# print (test_df['Family'].value_counts(dropna=False))


2    174
0    108
3     75
1     70
4     23
5      2
Name: Age, dtype: int64
-----------------------------------
0     108
6     102
2      65
3      50
9      47
4      45
1      17
12     13
8       3
15      1
5       1
Name: Age*Education, dtype: int64
-----------------------------------
0    275
1    177
Name: IsAlone, dtype: int64


In [25]:
family = [(train_df.at[i, 'Children'] + train_df.at[i, 'Pet']) for i in range(train_df['Pet'].count())]
# train_df['Family'] = family
isalone = [(0 if family[i] == 0 else 1) for i in range(len(family))]
train_df['IsAlone'] = isalone
status = [(train_df.at[i, 'Education']*train_df.at[i, 'Age']) for i in range(train_df['Age'].count())]
train_df['Age*Education'] = pd.Series(status)

In [26]:
print (train_df['Age'].value_counts(dropna=False))
print('-----------------------------------')
print (train_df['Age*Education'].value_counts(dropna=False))
print('-----------------------------------')
print (train_df['IsAlone'].value_counts(dropna=False))
# print('-----------------------------------')
# print (train_df['Family'].value_counts(dropna=False))

2    316
0    202
3    153
1    130
4     49
5      7
Name: Age, dtype: int64
-----------------------------------
6     212
0     202
3     152
4     112
2      83
9      53
1      18
8      13
12      5
5       4
15      2
10      1
Name: Age*Education, dtype: int64
-----------------------------------
0    515
1    342
Name: IsAlone, dtype: int64


In [27]:
train_df = train_df.drop(columns=['Pet', 'Children'])
test_df = test_df.drop(columns=['Pet', 'Children'])

In [28]:
train_df.head()  

Unnamed: 0,Longevity,Education,Sex,Age,Activity,Sport,IsAlone,Age*Education
0,0,3,1,2,0,1,1,6
1,1,1,0,3,2,2,1,3
2,1,3,0,2,0,1,0,6
3,1,1,0,2,1,1,1,2
4,0,3,1,2,0,1,0,6


In [29]:
test_df.head()  

Unnamed: 0,Id,Education,Sex,Age,Activity,Sport,IsAlone,Age*Education
0,858,1,1,2,1,1,0,2
1,859,3,0,3,1,2,1,9
2,860,3,1,2,0,2,0,6
3,861,3,1,2,1,1,1,6
4,862,2,1,2,1,1,1,4


In [34]:
# DAY 3 
feature_names = train_df.columns.tolist()
for column in feature_names:
    print(column)
    print(train_df[column].value_counts(dropna=False))

Longevity
0    528
1    329
Name: Longevity, dtype: int64
Education
3    473
1    208
2    176
Name: Education, dtype: int64
Sex
1    557
0    300
Name: Sex, dtype: int64
Age
2    316
0    202
3    153
1    130
4     49
5      7
Name: Age, dtype: int64
Activity
1    394
0    309
2    154
Name: Activity, dtype: int64
Sport
1    696
2    161
Name: Sport, dtype: int64
IsAlone
0    515
1    342
Name: IsAlone, dtype: int64
Age*Education
6     212
0     202
3     152
4     112
2      83
9      53
1      18
8      13
12      5
5       4
15      2
10      1
Name: Age*Education, dtype: int64


In [40]:
logreg = LogisticRegression(solver='liblinear')

Y_train = train_df['Longevity']
copy_train_df = train_df.drop(columns=['Longevity'])
X_train = copy_train_df

X_test = test_df.drop(columns=['Id'])

logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X_train, Y_train) * 100, 2)
acc_log

79.0

In [41]:
coeff_df = pd.DataFrame(train_df.columns.delete(0))
coeff_df.columns = ['Feature']
coeff_df["Correlation"] = pd.Series(logreg.coef_[0])
coeff_df.sort_values(by='Correlation', ascending=False)

Unnamed: 0,Feature,Correlation
4,Sport,0.479057
3,Activity,0.116641
5,IsAlone,0.049168
2,Age,-0.014486
6,Age*Education,-0.079251
0,Education,-0.696698
1,Sex,-2.441211


In [94]:
svmdata = SVC(kernel="linear")
svmdata.fit(X_train, Y_train)
Y_pred = svmdata.predict(X_test)
acc_svc = round(svmdata.score(X_train, Y_train) * 100, 2)
acc_svc

78.65

In [95]:
coeff_df = pd.DataFrame(train_df.columns.delete(0))
coeff_df.columns = ['Feature']
coeff_df["Correlation"] = pd.Series(svmdata.coef_[0])
coeff_df.sort_values(by='Correlation', ascending=False)

Unnamed: 0,Feature,Correlation
6,Region,0.000125
3,Age,2.4e-05
5,Children,-7e-06
4,Pet,-2.9e-05
0,Longevity,-8.1e-05
2,Sex,-0.000119
1,Education,-1.999798
7,Activity,
8,MedExam,
9,Sport,


In [96]:
KNC = KNeighborsClassifier()
KNC.fit(X_train, Y_train)
Y_pred = KNC.predict(X_test)
acc_knn = round(KNC.score(X_train, Y_train) * 100, 2)
acc_knn

82.26

In [97]:
gb = GaussianNB()
gb.fit(X_train, Y_train)
Y_pred = gb.predict(X_test)
acc_gaussian = round(gb.score(X_train, Y_train) * 100, 2)
acc_gaussian

76.66

In [98]:
pc = Perceptron()
pc.fit(X_train, Y_train)
Y_pred = pc.predict(X_test)
acc_perceptron = round(pc.score(X_train, Y_train) * 100, 2)
acc_perceptron

77.13

In [99]:
sg = SGDClassifier()
sg.fit(X_train, Y_train)
Y_pred = sg.predict(X_test)
acc_sgd = round(sg.score(X_train, Y_train) * 100, 2)
acc_sgd

79.23

In [100]:
tr = DecisionTreeClassifier()
tr.fit(X_train, Y_train)
Y_pred = tr.predict(X_test)
acc_decision_tree = round(tr.score(X_train, Y_train) * 100, 2)
acc_decision_tree

84.48

In [104]:
ens = RandomForestClassifier()
ens.fit(X_train, Y_train)
Y_pred = ens.predict(X_test)
acc_random_forest = round(ens.score(X_train, Y_train) * 100, 2)
acc_random_forest

84.48

In [105]:
models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression', 
              'Random Forest', 'Naive Bayes', 'Perceptron', 
              'Stochastic Gradient Decent', 
              'Decision Tree'],
    'Score': [acc_svc, acc_knn, acc_log, 
              acc_random_forest, acc_gaussian, acc_perceptron, 
              acc_sgd, acc_decision_tree]})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
2,Logistic Regression,84.48
3,Random Forest,84.48
7,Decision Tree,84.48
1,KNN,82.26
6,Stochastic Gradient Decent,79.23
0,Support Vector Machines,78.65
5,Perceptron,77.13
4,Naive Bayes,76.66


In [124]:
# Для сервера