# ПОСТРОЕНИЕ ПРЕДСКАЗАТЕЛЬНОЙ МОДЕЛИ ВОЗНИКНОВЕНИЯ ИНСУЛЬТА

В этой лабораторной работе вам потребуется выбрать наилучший классификатор с оптимальными параметрами для задачи про пассажиров "Титаника".
В данной работе будет построена предсказательная модель возникновения инсульта на основе данных о [пациентах](https://www.kaggle.com/fedesoriano/stroke-prediction-dataset) с Kaggle.

1. Загрузка данных и библиотек

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.preprocessing import PolynomialFeatures, StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import sklearn.model_selection as model_selection

%matplotlib inline

# Load the training and test tables from CSV files given by relative path.
# The trailing "lab5_input" markers are kept verbatim.
train = pd.read_csv("train.csv") # lab5_input
test = pd.read_csv("test.csv") # lab5_input

2. Предобработка данных

In [None]:
def process_age(df, cut_points, label_names):
    """Fill missing ages and bucket ``Age`` into labelled categories.

    Missing ``Age`` values are replaced with the sentinel ``-0.5``; the column
    is then binned with ``pd.cut`` into a new ``Age_categories`` column.

    Args:
        df: DataFrame with an ``Age`` column. It is modified in place.
        cut_points: Bin edges passed to ``pd.cut``.
        label_names: Labels for the bins, passed to ``pd.cut``.

    Returns:
        The same DataFrame object, with ``Age`` filled and
        ``Age_categories`` added.
    """
    # fillna avoids the `np.NaN` alias, which was removed in NumPy 2.0.
    df['Age'] = df['Age'].fillna(-0.5)
    df['Age_categories'] = pd.cut(df['Age'], bins=cut_points, labels=label_names)
    return df

# Labels for the age bins, followed by the bin edges handed to process_age.
label_names = ["Missing", "Infant", "Child", "Teenager", "Young_Adult", "Adult", "Senior"]
cut_points = [-1, 0, 5, 12, 18, 35, 60, 100]

# Apply the same binning to the training frame first, then the test frame.
train, test = (
    process_age(frame, cut_points, label_names) for frame in (train, test)
)

In [None]:
def create_dummies(df, column_name):
    """Return ``df`` with one-hot indicator columns for ``column_name`` appended.

    Args:
        df: Source DataFrame; it is not modified.
        column_name: Name of the column to one-hot encode.

    Returns:
        A new DataFrame holding the original columns followed by the
        indicator columns, named ``<column_name>_<value>``.
    """
    # The prefix must be a plain string: with a Series input, a list prefix is
    # formatted verbatim and yields names such as "['Sex']_male".
    dummies = pd.get_dummies(df[column_name], prefix=column_name)
    return pd.concat([df, dummies], axis=1)

# One-hot encode each categorical column in both frames, training frame first.
for column in ("Pclass", "Sex", "Age_categories"):
    train = create_dummies(train, column)
    test = create_dummies(test, column)

#train = create_dummies(train, "Embarked")
#test = create_dummies(test, "Embarked")

__Задание 3.__  
Примените масштабирование признаков (`StandardScaler`, `MinMaxScaler`).

In [None]:
# Columns removed from both frames before modelling. An earlier, shorter list
# was assigned here and immediately overwritten; only this list ever took effect.
drop_cols = ['PassengerId', 'Pclass', 'Name', 'Sex', 'Age',
             'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Age_categories']
# `columns=` already selects the axis, so no separate `axis` argument is needed.
train = train.drop(columns=drop_cols)
test = test.drop(columns=drop_cols)
#print(train[train.isnull().any(axis=1)])

#features = ['Parch', 'SibSp', 'Fare']
#train[features]= MinMaxScaler().fit_transform(train[features])
#test[features]= MinMaxScaler().fit_transform(test[features])

__Задание 4.__  
Примените различные преобразования признаков (`PolynomialFeatures`).

In [None]:

# Expand the feature columns only: the 'Survived' target must not be fed into
# the polynomial expansion, otherwise the label leaks into the features.
poly_input = train.drop(columns=['Survived'])

# Degree-3 expansion restricted to interaction terms.
tpoly = PolynomialFeatures(degree=3, interaction_only=True)
tpoly3 = tpoly.fit_transform(poly_input)

# Degree-3 expansion without the interaction-only restriction.
fpoly = PolynomialFeatures(degree=3, interaction_only=False)
fpoly3 = fpoly.fit_transform(poly_input)

__Задание 5.__  
Обучите несколько классификаторов, в том числе:  
1. Логистическую регрессию (`LogisticRegression`).
1. Метод опорных векторов (`SVC`).
1. Метод *k* ближайших соседей (`KNeighborsClassifier`).
1. Наивный байесовский классификатор (`MultinomialNB`).
1. Деревья решений (`DecisionTreeClassifier`).
1. Случайный лес (`RandomForestClassifier`).
1. AdaBoost (`AdaBoostClassifier`).
1. Градиентный бустинг (`GradientBoostingClassifier`).

Для обучения и проверки качества можно использовать функцию `train_test_split()`.

In [None]:
# Separate the 'Survived' target from the remaining feature columns.
y = train['Survived']
df = train.drop(columns=['Survived'])

# 80% of the rows go to training, the rest to the hold-out split.
x_train, x_test, y_train, y_test = train_test_split(
    df, y, train_size=0.8, random_state=42
)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(712, 12) (179, 12) (712,) (179,)


In [None]:
# Logistic regression: fit on the training split, score on the hold-out split.
lr = LogisticRegression(random_state=42).fit(x_train, y_train)
y_pred = lr.predict(x_test)
print(lr.score(x_test, y_test))

0.8044692737430168


In [None]:
# Support vector machine (SVC): fit on the training split, score on the hold-out split.
svc = SVC(random_state=42).fit(x_train, y_train)
print(svc.score(x_test, y_test))

0.8156424581005587


In [None]:
# k-nearest neighbours (KNeighborsClassifier) with default settings.
knn = KNeighborsClassifier().fit(x_train, y_train)
print(knn.score(x_test, y_test))

0.8100558659217877


In [None]:
# Naive Bayes classifier. Despite the variable name, the model is MultinomialNB.
gaussian = MultinomialNB().fit(x_train, y_train)
print(gaussian.score(x_test, y_test))

0.770949720670391


In [None]:
# Decision tree (DecisionTreeClassifier) with a fixed random seed.
decision_tree = DecisionTreeClassifier(random_state=42).fit(x_train, y_train)
print(decision_tree.score(x_test, y_test))

0.8044692737430168


In [None]:
# Random forest (RandomForestClassifier) with a fixed random seed.
random_forest = RandomForestClassifier(random_state=42).fit(x_train, y_train)
print(random_forest.score(x_test, y_test))

0.8044692737430168


In [None]:
# AdaBoost (AdaBoostClassifier) with a fixed random seed.
ada = AdaBoostClassifier(random_state=42).fit(x_train, y_train)
print(ada.score(x_test, y_test))

0.8044692737430168


In [None]:
# Gradient boosting (GradientBoostingClassifier) with a fixed random seed.
gbc = GradientBoostingClassifier(random_state=42)
gbc.fit(x_train, y_train)
print(gbc.score(x_test, y_test))

0.8100558659217877


__Задание 6.__  
При помощи `Pipeline` и `GridSearchCV` выберите оптимальную архитектуру:
1. Метод масштабирования.
1. Степень полинома в `PolynomialFeatures`.
1. Параметры классификаторов (в том числе, параметры регуляризации).

Заносите в таблицу Excel результаты тестирования (варианты параметров, оценки качества).

In [None]:

# Candidate estimators. Each one is paired with the parameter grid at the same
# position in `parameters` below.
classifiers = [
    LogisticRegression(),
    SVC(),
    KNeighborsClassifier(),
    MultinomialNB(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
]

# Grid keys use the pipeline step names as prefixes: "poly__" and "clf__".
# NOTE(review): "deviance", max_features="auto" and "SAMME.R" appear to be
# deprecated or removed in recent scikit-learn releases -- confirm against the
# installed version before running.

# LogisticRegression()
params1 = {"poly__degree": range(1, 4, 1),
           "clf__solver": ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
           "clf__max_iter": range(700, 1000, 100)}  # range(start, stop, step)
# SVC
params2 = {"poly__degree": range(1, 4, 1),
           "clf__kernel": ["rbf", "poly", "linear", "sigmoid"],
           "clf__gamma": ["auto", "scale"],
           "clf__degree": range(1, 6, 1)}
# KNeighborsClassifier()
params3 = {"poly__degree": range(1, 4, 1),
           "clf__n_neighbors": range(5, 10, 2),
           "clf__weights": ["uniform", "distance"],
           "clf__algorithm": ['ball_tree', 'kd_tree']}
# MultinomialNB()
# fit_prior takes real booleans; the strings 'True'/'False' are both truthy,
# so the "False" setting was never actually evaluated.
params4 = {"poly__degree": range(1, 4, 1),
           "clf__fit_prior": [True, False],
           "clf__alpha": [0.5, 0.3, 0.1]}
# DecisionTreeClassifier()
params5 = {"poly__degree": range(1, 4, 1),
           "clf__criterion": ["gini", "entropy"],
           "clf__max_depth": range(2, 30, 3),
           "clf__min_samples_split": [2, 3],
           "clf__min_samples_leaf": range(1, 10, 3)}
# RandomForestClassifier()
params6 = {"poly__degree": range(1, 4, 1),
           "clf__criterion": ["gini", "entropy"],
           "clf__class_weight": ["balanced", "balanced_subsample"],
           "clf__max_depth": range(4, 10, 2),
           "clf__min_samples_leaf": [1, 2, 5],
           "clf__n_estimators": range(50, 150, 50)}
# AdaBoostClassifier()
params7 = {"poly__degree": range(3, 6, 1),
           "clf__learning_rate": np.arange(0.3, 0.7, 0.1),
           "clf__n_estimators": range(20, 60, 20),
           "clf__algorithm": ["SAMME", "SAMME.R"]}
# GradientBoostingClassifier()
params8 = {"poly__degree": range(3, 5, 1),
           "clf__loss": ["deviance", "exponential"],
           "clf__learning_rate": [0.08, 0.07, 0.075],
           "clf__n_estimators": range(60, 120, 20),
           "clf__min_samples_leaf": range(1, 4, 1),
           "clf__max_depth": range(1, 4, 1),
           "clf__max_features": ["auto", "sqrt", "log2"],
           "clf__validation_fraction": np.arange(0.01, 0.2, 0.05)}

parameters = [params1, params2, params3, params4, params5, params6, params7, params8]

# Grid-search every (estimator, grid) pair on the training split, then report
# the best cross-validated score and the score on the hold-out split.
for classifier, param_grid in zip(classifiers, parameters):
    print(classifier)
    pipe = Pipeline(steps=[('scaler', MinMaxScaler()),  # MinMaxScaler() or StandardScaler()
                           ('poly', PolynomialFeatures()),
                           ('clf', classifier)])
    grid_cv = model_selection.GridSearchCV(pipe, param_grid, scoring='accuracy')
    grid_cv.fit(x_train, y_train)
    print("model best score: ", grid_cv.best_score_)
    print(grid_cv.best_params_)
    model = grid_cv.best_estimator_
    print("model test score: ", model.score(x_test, y_test))

__Задание 7.__  
1. Выберите несколько лучших классификаторов (от 3 до 10).
1. Обучите выбранные классификаторы на всех доступных размеченных данных.
1. Получите результаты предсказания для тестовых данных.
1. Отправьте результаты на сервер [Kaggle](https://www.kaggle.com/c/titanic).

In [None]:
# Final fit uses every labelled row: split off the 'Survived' target.
y = train['Survived']
df = train.drop(columns=['Survived'])
# Re-read the raw test file to get the "PassengerId" column for the submission.
test2 = pd.read_csv("test.csv")
test_ids = test2["PassengerId"]

# Search grid for the random-forest pipeline (RandomForestClassifier).
rfc = dict(
    poly__degree=range(1, 4),
    clf__criterion=["gini", "entropy"],
    clf__class_weight=["balanced", "balanced_subsample"],
    clf__max_depth=range(2, 5),
    clf__min_samples_leaf=[1, 2, 3],
    clf__n_estimators=range(50, 150, 50),
)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures()),
    ('clf', RandomForestClassifier()),
])
grid_cv = model_selection.GridSearchCV(pipe, rfc, scoring='accuracy')
grid_cv.fit(df, y)
print("model best score: ", grid_cv.best_score_)
print(grid_cv.best_params_)
model = grid_cv.best_estimator_

# Predict with the best pipeline and write the submission file.
test_predictions = model.predict(test)

submission_df = {"PassengerId": test_ids, "Survived": test_predictions}
submission = pd.DataFrame(submission_df)
submission.to_csv('titanic_submission_rf.csv', index=False)

model best score:  0.8080911430544223
{'clf__class_weight': 'balanced_subsample', 'clf__criterion': 'entropy', 'clf__max_depth': 3, 'clf__min_samples_leaf': 2, 'clf__n_estimators': 50, 'poly__degree': 2}


In [None]:
# Search grid for the AdaBoost pipeline (AdaBoostClassifier).
# Note: this rebinds the name `ada` to the grid dictionary.
ada = dict(
    poly__degree=range(1, 5),
    clf__learning_rate=np.arange(0.3, 0.7, 0.1),
    clf__n_estimators=range(10, 50, 10),
    clf__algorithm=["SAMME", "SAMME.R"],
)
pipe = Pipeline([
    ('scaler', MinMaxScaler()),
    ('poly', PolynomialFeatures()),
    ('clf', AdaBoostClassifier()),
])
grid_cv = model_selection.GridSearchCV(pipe, ada, scoring='accuracy')
grid_cv.fit(df, y)
print("model best score: ", grid_cv.best_score_)
print(grid_cv.best_params_)
model = grid_cv.best_estimator_

# Predict with the best pipeline and write the submission file.
test_predictions = model.predict(test)

submission_df = {"PassengerId": test_ids, "Survived": test_predictions}
submission = pd.DataFrame(submission_df)
submission.to_csv('titanic_submission_ada.csv', index=False)

model best score:  0.8002761910740066
{'clf__algorithm': 'SAMME.R', 'clf__learning_rate': 0.5, 'clf__n_estimators': 20, 'poly__degree': 3}


In [None]:
# Search grid for the gradient-boosting pipeline (GradientBoostingClassifier).
# NOTE(review): the "deviance" loss name may not be accepted by recent
# scikit-learn releases -- confirm against the installed version.
gb = dict(
    poly__degree=[2, 3],
    clf__loss=["deviance", "exponential"],
    clf__learning_rate=[0.05, 0.06],
    clf__n_estimators=[20, 30],
    clf__min_samples_leaf=[1, 2],
    clf__max_depth=[3, 4],
)

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures()),
    ('clf', GradientBoostingClassifier()),
])
grid_cv = model_selection.GridSearchCV(pipe, gb, scoring='accuracy')
grid_cv.fit(df, y)
print("model best score: ", grid_cv.best_score_)
print(grid_cv.best_params_)
model = grid_cv.best_estimator_

# Predict with the best pipeline and write the submission file.
test_predictions = model.predict(test)

submission_df = {"PassengerId": test_ids, "Survived": test_predictions}
submission = pd.DataFrame(submission_df)
submission.to_csv('titanic_submission_gb.csv', index=False)

model best score:  0.8024982738057874
{'clf__learning_rate': 0.05, 'clf__loss': 'exponential', 'clf__max_depth': 3, 'clf__min_samples_leaf': 2, 'clf__n_estimators': 20, 'poly__degree': 3}
