# Data scaling

### Importing modules, loading the dataset and splitting it into X and y

In [139]:
# NumPy
import numpy as np

# Dataframe operations
import pandas as pd

# Data visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Scalers
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Models
from sklearn.linear_model import LogisticRegression #logistic regression
from sklearn import svm #support vector Machine
from sklearn.ensemble import RandomForestClassifier #Random Forest
from sklearn.neighbors import KNeighborsClassifier #KNN
from sklearn.naive_bayes import GaussianNB #Naive bayes
from sklearn.tree import DecisionTreeClassifier #Decision Tree
from sklearn.model_selection import train_test_split #training and testing data split
from sklearn import metrics #accuracy measure
from sklearn.metrics import confusion_matrix #for confusion matrix
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier

# Cross-validation
from sklearn.model_selection import KFold #for K-fold cross validation
from sklearn.model_selection import cross_val_score #score evaluation
from sklearn.model_selection import cross_val_predict #prediction

# GridSearchCV
from sklearn.model_selection import GridSearchCV

%matplotlib inline

# Load the training and test tables from CSV files (paths are relative to the
# working directory). Both frames are used by the cells below.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ('train_df.csv', 'test_df.csv')
)

## 0. Загрузка чистых X и y 

In [140]:
# Split the training table into the feature matrix X and the target vector y.
# `columns=` replaces the positional axis argument of `drop('Survived', 1)`:
# pandas 2.0 made every argument after `labels` keyword-only, so the positional
# form raises a TypeError there.
X = train_df.drop(columns='Survived')
y = train_df['Survived']

## 1. Шкалирование в [0, 1]

In [141]:
# Rescale every feature column into the [0, 1] range.
# NOTE: X is overwritten with the scaled result, so re-running this cell
# operates on already-scaled data.
minmax_scaler = MinMaxScaler(feature_range=(0, 1))
X = minmax_scaler.fit_transform(X)

## 2. Стандартизация

In [121]:
# Standardize ALL feature columns with StandardScaler (default settings).
# NOTE: X is overwritten with the standardized result.
standard_scaler = StandardScaler()
X = standard_scaler.fit_transform(X)

# Disabled alternative, kept for reference: standardize everything except the
# categorical features.
# Pattern :: dfTest[['A', 'B']] = scaler.fit_transform(dfTest[['A', 'B']])
# temp = train_df
# temp[['Pclass','Age','Family_Size']] = StandardScaler().fit_transform(train_df[['Pclass','Age','Family_Size']])
# X = temp.drop('Survived', 1)
# y = temp['Survived']

## Оценка на группе различных моделей

In [122]:
# Compare several baseline classifiers with 10-fold cross-validated accuracy.
#
# `random_state` is intentionally not passed to KFold: with the default
# shuffle=False it has no effect, and scikit-learn >= 0.24 raises a ValueError
# for that combination. The folds remain the same contiguous, unshuffled
# splits as before.
kfold = KFold(n_splits=10)  # k=10, split the data into 10 equal parts

classifiers = ['Linear Svm', 'Radial Svm', 'Logistic Regression', 'KNN',
               'Decision Tree', 'Naive Bayes', 'Random Forest']
models = [svm.SVC(kernel='linear'), svm.SVC(kernel='rbf'), LogisticRegression(),
          KNeighborsClassifier(n_neighbors=8), DecisionTreeClassifier(),
          GaussianNB(), RandomForestClassifier(n_estimators=100)]

xyz = []       # mean CV accuracy of each model, in `models` order
std = []       # standard deviation of the fold accuracies of each model
accuracy = []  # raw per-fold accuracy arrays of each model
for model in models:
    cv_result = cross_val_score(model, X, y, cv=kfold, scoring="accuracy")
    xyz.append(cv_result.mean())
    std.append(cv_result.std())
    accuracy.append(cv_result)

# One row per classifier; displayed as the cell output.
new_models_dataframe = pd.DataFrame({'CV Mean': xyz, 'Std': std}, index=classifiers)
new_models_dataframe

Unnamed: 0,CV Mean,Std
Linear Svm,0.832747,0.028307
Radial Svm,0.851835,0.033319
Logistic Regression,0.840624,0.025045
KNN,0.847391,0.029208
Decision Tree,0.830499,0.023974
Naive Bayes,0.824931,0.028845
Random Forest,0.835031,0.017285


## Оценка на ансамблях

In [65]:
# Soft-voting ensemble of seven base classifiers. Both SVC members are created
# with probability=True so they can take part in the soft (probability-averaging) vote.
voting_members = [
    ('KNN', KNeighborsClassifier(n_neighbors=8)),
    ('RBF', svm.SVC(probability=True, kernel='rbf', C=0.5, gamma=0.1)),
    ('RFor', RandomForestClassifier(n_estimators=500, random_state=0)),
    ('LR', LogisticRegression(C=0.05)),
    ('DT', DecisionTreeClassifier(random_state=0)),
    ('NB', GaussianNB()),
    ('svm', svm.SVC(kernel='linear', probability=True)),
]
ensemble_lin_rbf = VotingClassifier(estimators=voting_members, voting='soft')
# fit() returns the estimator itself, so the name stays bound to the fitted ensemble.
ensemble_lin_rbf = ensemble_lin_rbf.fit(X, y)

# 10-fold cross-validated accuracy of the ensemble.
cross = cross_val_score(ensemble_lin_rbf, X, y, scoring="accuracy", cv=10)
print('Ensamble CV score score =', cross.mean())

Ensamble CV score score = 0.849597094541


In [66]:
# AdaBoost (200 estimators, learning rate 0.1, fixed seed) scored with
# 10-fold cross-validated accuracy.
ada = AdaBoostClassifier(
    n_estimators=200,
    learning_rate=0.1,
    random_state=0,
)
result = cross_val_score(ada, X, y, scoring='accuracy', cv=10)
print('AdaBoost CV score =', result.mean())

AdaBoost CV score = 0.844066791511


### KNN
Для стандартизированных данных лучшие параметры KNN:<br>
**0.858585858586**<br>
KNeighborsClassifier(algorithm='auto', leaf_size=6, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=6, p=2,
           weights='uniform')

In [110]:
# Grid search over KNN hyper-parameters with 10-fold cross-validation.
n_neighbors = list(range(1, 30))   # k = 1 .. 29
weights = ['uniform', 'distance']

# The two lists below are defined but NOT part of the grid searched in this
# cell. The original notes mark them as already tried ("best seems to be auto").
algo = ['auto', 'ball_tree', 'kd_tree']
leaf_size = list(range(1, 50, 5))

hyperparams = dict(n_neighbors=n_neighbors, weights=weights)
gd = GridSearchCV(KNeighborsClassifier(), hyperparams, cv=10, verbose=True)
gd.fit(X, y)

# Best mean CV score and the estimator configuration that achieved it.
print(gd.best_score_)
print(gd.best_estimator_)

Fitting 10 folds for each of 58 candidates, totalling 580 fits
0.855218855219
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=27, p=2,
           weights='uniform')


[Parallel(n_jobs=1)]: Done 580 out of 580 | elapsed:   12.2s finished


### LogisticRegression

In [143]:
# Grid search over LogisticRegression solver, tolerance and C with 10-fold
# cross-validation.
solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
C = [0.1, 0.3, 0.5, 0.7, 0.9, 1.0, 1.1, 1.3, 1.5]  # tentative best: C=0.3?
# tentative best: tol=0.1? (the original note on this line read "C=0.1?")
tol = [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1]

hyperparams = dict(solver=solver, tol=tol, C=C)
gd = GridSearchCV(LogisticRegression(), hyperparams, cv=10, verbose=True)
gd.fit(X, y)

# Best mean CV score and the estimator configuration that achieved it.
print(gd.best_score_)
print(gd.best_estimator_)

Fitting 10 folds for each of 270 candidates, totalling 2700 fits
0.846240179574
LogisticRegression(C=0.3, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='sag', tol=0.1,
          verbose=0, warm_start=False)


[Parallel(n_jobs=1)]: Done 2700 out of 2700 | elapsed:   21.1s finished


<br>
<br>
<br>
## Применение модели и запись данных

In [128]:
# Train the final KNN model and write its test-set predictions to
# submission.csv.
#
# The scaler is fitted on the TRAINING features only, and that same fitted
# scaler is applied to the test features. Fitting a second StandardScaler on
# the test set (as this cell did before) standardizes it with different
# means/variances than the data the model was trained on.
#
# The training features are rebuilt from train_df here so the cell does not
# depend on which of the scaling cells above last overwrote X.
X_train_raw = train_df.drop(columns='Survived')
y_train = train_df['Survived']

scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
# Select the test columns by name so they line up with the training features
# (raises KeyError if test_df is missing one of them).
X_test = scaler.transform(test_df[X_train_raw.columns])

# Hyper-parameters reported above as the best KNN configuration for
# standardized data.
best_model = KNeighborsClassifier(algorithm='auto', leaf_size=6, metric='minkowski',
                                  metric_params=None, n_jobs=1, n_neighbors=6, p=2,
                                  weights='uniform')

best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)

# PassengerId is read from test.csv; predictions are paired with it by
# position, so test.csv is assumed to have the same row order as test_df
# (TODO confirm).
test_csv_df = pd.read_csv("test.csv")
submission = pd.DataFrame({
        "PassengerId": test_csv_df["PassengerId"],
        "Survived": y_pred})
submission.to_csv('submission.csv', index=False)