# Data scaling

### Importing modules, loading dataset and breaking it into X and y

In [64]:
# NumPy
import numpy as np

# Dataframe operations
import pandas as pd

# Data visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Scalers
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Models
from sklearn.linear_model import LogisticRegression #logistic regression
from sklearn import svm #support vector Machine
from sklearn.ensemble import RandomForestClassifier #Random Forest
from sklearn.neighbors import KNeighborsClassifier #KNN
from sklearn.naive_bayes import GaussianNB #Naive bayes
from sklearn.tree import DecisionTreeClassifier #Decision Tree
from sklearn.model_selection import train_test_split #training and testing data split
from sklearn import metrics #accuracy measure
from sklearn.metrics import confusion_matrix #for confusion matrix
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier

# Cross-validation
from sklearn.model_selection import KFold #for K-fold cross validation
from sklearn.model_selection import cross_val_score #score evaluation
from sklearn.model_selection import cross_val_predict #prediction

# GridSearchCV
from sklearn.model_selection import GridSearchCV

%matplotlib inline

# Loading data:
train_df = pd.read_csv('train_df.csv')
test_df = pd.read_csv('test_df.csv')

## 0. Загрузка чистых X и y 

In [59]:
X = train_df.drop('Survived', 1)
y = train_df['Survived']

## 1. Шкалирование в [0, 1]

In [54]:
X = MinMaxScaler(feature_range=(0, 1)).fit_transform(X)

## 2. Стандартизация

In [57]:
X = StandardScaler().fit_transform(X)

## Оценка на группе различных моделей

In [58]:
kfold = KFold(n_splits=10, random_state=22) # k=10, split the data into 10 equal parts
xyz=[]
accuracy=[]
std=[]
classifiers=['Linear Svm','Radial Svm','Logistic Regression','KNN','Decision Tree','Naive Bayes','Random Forest']
models=[svm.SVC(kernel='linear'),svm.SVC(kernel='rbf'),LogisticRegression(),KNeighborsClassifier(n_neighbors=8),
        DecisionTreeClassifier(),GaussianNB(),RandomForestClassifier(n_estimators=100)]
for i in models:
    model = i
    cv_result = cross_val_score(model, X, y, cv = kfold, scoring = "accuracy")
    cv_result = cv_result
    xyz.append(cv_result.mean())
    std.append(cv_result.std())
    accuracy.append(cv_result)
new_models_dataframe = pd.DataFrame({'CV Mean':xyz,'Std':std},index=classifiers)       
new_models_dataframe

Unnamed: 0,CV Mean,Std
Linear Svm,0.832747,0.028307
Radial Svm,0.851835,0.033319
Logistic Regression,0.840624,0.025045
KNN,0.848514,0.029222
Decision Tree,0.828277,0.018877
Naive Bayes,0.824931,0.028845
Random Forest,0.832821,0.024925


## Оценка на ансамблях

In [47]:
ensemble_lin_rbf=VotingClassifier(estimators=[('KNN',KNeighborsClassifier(n_neighbors=8)),
                                              ('RBF',svm.SVC(probability=True,kernel='rbf',C=0.5,gamma=0.1)),
                                              ('RFor',RandomForestClassifier(n_estimators=500,random_state=0)),
                                              ('LR',LogisticRegression(C=0.05)),
                                              ('DT',DecisionTreeClassifier(random_state=0)),
                                              ('NB',GaussianNB()),
                                              ('svm',svm.SVC(kernel='linear',probability=True))
                                             ], 
                       voting='soft').fit(X, y)
cross=cross_val_score(ensemble_lin_rbf, X, y, cv = 10,scoring = "accuracy")
print('Ensamble CV score score =',cross.mean())

Ensamble mean CV score score = 0.849622347066


In [63]:
ada=AdaBoostClassifier(n_estimators=200,random_state=0,learning_rate=0.1)
result=cross_val_score(ada,X,y,cv=10,scoring='accuracy')
print('AdaBoost CV score =',result.mean())

AdaBoost CV score = 0.844066791511


In [67]:
n_neighbors = list(range(1,30,1))
hyperparams = {'n_neighbors': n_neighbors}
gd=GridSearchCV(estimator = KNeighborsClassifier(), param_grid = hyperparams, verbose=True, cv=10)
gd.fit(X, y)
print(gd.best_score_)
print(gd.best_estimator_)

Fitting 10 folds for each of 29 candidates, totalling 290 fits
0.800224466891
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')


[Parallel(n_jobs=1)]: Done 290 out of 290 | elapsed:    2.2s finished
