According to my OneNote notes.

## 0. Loading dataframes

In [9]:
# NumPy
import numpy as np

# Dataframe operations
import pandas as pd

# Data visualization
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

## 1. Deleting "PassengerID", "Embarked" and "Cabin"

In [11]:
train_df.drop(['PassengerId', 'Embarked', 'Cabin'], 1, inplace=True)
test_df.drop(['PassengerId', 'Embarked', 'Cabin'], 1, inplace=True)

## 2. Adding "Majority" feature and removing "Ticket"

In [12]:
# CREATING NEW FEATURE.

# 1. Creating temporary dataframe
duplicates = train_df['Ticket'].duplicated(keep=False)
duplicates.name = 'Duplicate'
temporary_df = pd.concat([train_df['Survived'], train_df['Ticket'], duplicates], axis=1)
temporary_df['Majority'] = 0


# 2. Constructing Majority feature for TRAIN_DF:
majority_list = []
for index, row in temporary_df.iterrows():
    z = temporary_df.loc[temporary_df['Ticket'] == row['Ticket']]
    if row['Duplicate'] == False:
        majority_list.append(0)
        continue
    else:
        survived_guys = z['Survived'].sum()
        dead_guys = len(z) - survived_guys
        if survived_guys >= dead_guys:
            majority_list.append(1)
        else:
            majority_list.append(-1)
temporary_df['Majority'] = majority_list

# 3. Adding Majority feature to TRAIN_DF:
train_df['Majority'] = majority_list

# 4. Constructing Majority feature for TEST_DF:
majority_list = []
for index, row in test_df.iterrows():
    # a. Ticket is not encountered in TRAIN dataframe: no info. Set majority = 0.
    if row['Ticket'] not in train_df['Ticket'].values:
        majority_list.append(0)
        continue
    # Amounts of occurences of a particular Ticket value in TRAIN and TEST dataframes:
    occurences_in_train_df = train_df[train_df['Ticket'] == row['Ticket']].shape[0]
    occurences_in_test_df = test_df[test_df['Ticket'] == row['Ticket']].shape[0]
    
    # b. Ticket is encountered more than once in TRAIN dataframe: set majority accordinlgy. 
    if occurences_in_train_df > 1:
        majority = train_df.loc[train_df['Ticket'] == row['Ticket']]['Majority'].iloc[0]
        majority_list.append(majority)
    # c. Ticket is encountered once in TRAIN dataframe and more than once in TEST dataframe:
    elif occurences_in_train_df == 1 and occurences_in_test_df > 1:
        majority_list.append(0)
    # d. Ticket is encountered once in TRAIN dataframe and once in TEST dataframe:
    elif occurences_in_train_df == 1 and occurences_in_test_df == 1:
        survived = train_df.loc[train_df['Ticket'] == row['Ticket']]['Survived'].iloc[0]
        if survived:
            majority_list.append(1)
        else:
            majority_list.append(-1)          

# 4. Adding Majority feature to TEST_DF:
test_df['Majority'] = majority_list

# 5. Works is done, removing Ticket feature from both dataframes:
train_df.drop('Ticket', 1, inplace=True)
test_df.drop('Ticket', 1, inplace=True)

## 3. Deleting "Fare"

In [13]:
train_df.drop('Fare', 1, inplace=True)
test_df.drop('Fare', 1, inplace=True)

## 4. "Parch" - ??? {{stored place}}

## 5. Adding "NoSibSp" and "SomeSibSp" features and deleting "SibSp" feature

In [14]:
# def construct_sib_features(dataframe):
#     #dataframe['NoSibSp'] = 0
#     #dataframe['SomeSibSp'] = 0
#     noSibSp_list = []
#     someSibSp_list = []
#     for index, row in dataframe.iterrows():
#         # SibSp = 0?
#         if row['SibSp'] == 0:
#             noSibSp_list.append(1)
#             someSibSp_list.append(0)
#         else:
#             noSibSp_list.append(0)
#             someSibSp_list.append(row['SibSp'])
#     dataframe['NoSibSp'] = noSibSp_list
#     dataframe['SomeSibSp'] = someSibSp_list
    
# construct_sib_features(train_df)
# train_df

# sns.barplot(x='SomeSibSp', y='Survived', data=train_df)
#pd.crosstab([train_df.SibSp],train_df.Survived).style.background_gradient(cmap='summer_r')
    


In [15]:
train_df['Family_Size']=0
train_df['Family_Size']=train_df['Parch']+train_df['SibSp']#family size
train_df['Alone']=0
train_df.loc[train_df.Family_Size==0,'Alone']=1#Alone

# f,ax=plt.subplots(1,2,figsize=(18,6))
# sns.factorplot('Family_Size','Survived',data=train_df,ax=ax[0])
# ax[0].set_title('Family_Size vs Survived')
# sns.factorplot('Alone','Survived',data=train_df,ax=ax[1])
# ax[1].set_title('Alone vs Survived')
# plt.close(2)
# plt.close(3)
# plt.show()

# ---
test_df['Family_Size']=0
test_df['Family_Size']=test_df['Parch']+test_df['SibSp']#family size
test_df['Alone']=0
test_df.loc[test_df.Family_Size==0,'Alone']=1#Alone
# ---

# Dropping SibSp and Parch features
train_df.drop(['SibSp', 'Parch'], 1, inplace=True)
test_df.drop(['SibSp', 'Parch'], 1, inplace=True)


## 6. Imputing Age
Note here that it is better to **use the entire dataset** for mean/median/mode calculation, otherwise we will miss out useful information. 

In [16]:
# NEWER

# 1. Obtaining the entire data:
data = pd.read_csv('train.csv')[['Name', 'Age']].append(pd.read_csv('test.csv')[['Name', 'Age']])
data.shape

# 2. Constructing Title feature dataframe
title_feature = pd.DataFrame()
title_feature['Title'] = data['Name']
# Cleaning Name feature (deleting quote marks etc., keeping only the letters)
for name_string in data['Name']:
    title_feature['Title']=data.Name.str.extract('([A-Za-z]+)\.', expand=True)
# Creating relevant mapping:
mapping = {'Mlle': 'Miss', 'Major': 'Mr', 'Col': 'Mr', 'Sir': 'Mr', 'Don': 'Mr', 'Mme': 'Miss',
          'Jonkheer': 'Mr', 'Lady': 'Mrs', 'Capt': 'Mr', 'Countess': 'Mrs', 'Ms': 'Miss', 'Dona': 'Mrs'}
title_feature.replace({'Title': mapping}, inplace=True)
title_feature['Age'] = data['Age']
title_feature.groupby('Title')['Age'].median()

Title
Dr        49.0
Master     4.0
Miss      22.0
Mr        30.0
Mrs       36.0
Rev       41.5
Name: Age, dtype: float64

In [17]:
# Median imputation
title_feature.loc[(title_feature['Age'].isnull()) & (title_feature['Title']=='Dr'),'Age']     = 49.0
title_feature.loc[(title_feature['Age'].isnull()) & (title_feature['Title']=='Master'),'Age'] = 4.0
title_feature.loc[(title_feature['Age'].isnull()) & (title_feature['Title']=='Miss'),'Age']   = 22.0
title_feature.loc[(title_feature['Age'].isnull()) & (title_feature['Title']=='Mr'),'Age']     = 30.0
title_feature.loc[(title_feature['Age'].isnull()) & (title_feature['Title']=='Mrs'),'Age']    = 36.0
title_feature.loc[(title_feature['Age'].isnull()) & (title_feature['Title']=='Rev'),'Age']    = 41.5
"""
title_feature.loc[(title_feature['Age'].isnull()) & (title_feature['Title']=='Dr'),'Age']     = 49.0
title_feature.loc[(title_feature['Age'].isnull()) & (title_feature['Title']=='Master'),'Age'] = 4.0
title_feature.loc[(title_feature['Age'].isnull()) & (title_feature['Title']=='Miss'),'Age']   = 22.0
title_feature.loc[(title_feature['Age'].isnull()) & (title_feature['Title']=='Mr'),'Age']     = 30.0
title_feature.loc[(title_feature['Age'].isnull()) & (title_feature['Title']=='Mrs'),'Age']    = 36.0
title_feature.loc[(title_feature['Age'].isnull()) & (title_feature['Title']=='Rev'),'Age']    = 41.5
"""

# Substituting Age values in TRAIN_DF and TEST_DF:
train_df['Age'] = title_feature['Age'][:891]
test_df['Age'] = title_feature['Age'][891:]
# print("AGE NaN count:", train_df['Age'].isnull().sum())
# print("AGE NaN count:", test_df['Age'].isnull().sum())

In [8]:
# =============OLDER VERSION ===============
# =============OLDER VERSION ===============
# =============OLDER VERSION ===============
# =============OLDER VERSION ===============
# =============OLDER VERSION ===============
# =============OLDER VERSION ===============
# =============OLDER VERSION ===============
# =============OLDER VERSION ===============
# =============OLDER VERSION ===============
# =============OLDER VERSION ===============
# =============OLDER VERSION ===============
# =============OLDER VERSION ===============

last_names = train_df['Name'].apply(lambda name: name.split(',')[0])
import operator

last_names = pd.DataFrame(last_names)
name_counts = dict(last_names['Name'].value_counts())
name_counts = dict(sorted(name_counts.items(), key=operator.itemgetter(1),reverse=True))

substitution_dict = dict()
new_index = 1
for k, v in name_counts.items():
    if v >= 4:
        substitution_dict.update({k: new_index})
        new_index += 1
    else:
        substitution_dict.update({k: 0})

family_groups = last_names.replace({'Name': substitution_dict})
title_feature = pd.DataFrame()
title_feature['Title'] = train_df['Name']
for name_string in train_df['Name']:
    title_feature['Title']=train_df.Name.str.extract('([A-Za-z]+)\.', expand=True)

mapping = {'Mlle': 'Miss', 'Major': 'Mr', 'Col': 'Mr', 'Sir': 'Mr', 'Don': 'Mr', 'Mme': 'Miss',
          'Jonkheer': 'Mr', 'Lady': 'Mrs', 'Capt': 'Mr', 'Countess': 'Mrs', 'Ms': 'Miss'}
title_feature.replace({'Title': mapping}, inplace=True)

title_feature['Survived'] = train_df['Survived']
#sns.boxplot(x='Title', y='Survived', data=title_feature)
title_feature.drop('Survived', axis=1, inplace=True)
title_feature['Age'] = train_df['Age']

In [47]:
temp = pd.DataFrame()
temp['Title'] = title_feature['Title']
temp['Age'] = train_df['Age'] 

# Forming titles DFs
titles = list(set(temp['Title'].tolist())) # List of all our chosen titles
l = []
for t in titles:
    l.append(temp.loc[(temp.Age.notnull()) & (temp['Title']==t)])

# Note that we perform imputation on TRAIN_DF now!
train_df['Title'] = title_feature['Title']
train_df.loc[(train_df['Age'].isnull()) & (train_df['Title']=='Dr'),'Age']     = 46.5
train_df.loc[(train_df['Age'].isnull()) & (train_df['Title']=='Master'),'Age'] = 3.5
train_df.loc[(train_df['Age'].isnull()) & (train_df['Title']=='Miss'),'Age']   = 21.5
train_df.loc[(train_df['Age'].isnull()) & (train_df['Title']=='Mr'),'Age']     = 30.0
train_df.loc[(train_df['Age'].isnull()) & (train_df['Title']=='Mrs'),'Age']    = 35.9
train_df.loc[(train_df['Age'].isnull()) & (train_df['Title']=='Rev'),'Age']    = 46.5

# Dropping Title feature for now - because maybe it will not be needed at all, who knows.
train_df.drop('Title', axis=1, inplace=True)

print("AGE NaN count:", train_df['Age'].isnull().sum())

# Note that we perform imputation on TRAIN_DF now!
test_df['Title'] = title_feature['Title']
test_df.loc[(test_df['Age'].isnull()) & (test_df['Title']=='Dr'),'Age']     = 46.5
test_df.loc[(test_df['Age'].isnull()) & (test_df['Title']=='Master'),'Age'] = 3.5
test_df.loc[(test_df['Age'].isnull()) & (test_df['Title']=='Miss'),'Age']   = 21.5
test_df.loc[(test_df['Age'].isnull()) & (test_df['Title']=='Mr'),'Age']     = 30.0
test_df.loc[(test_df['Age'].isnull()) & (test_df['Title']=='Mrs'),'Age']    = 35.9
test_df.loc[(test_df['Age'].isnull()) & (test_df['Title']=='Rev'),'Age']    = 46.5

# Dropping Title feature for now - because maybe it will not be needed at all, who knows.
test_df.drop('Title', axis=1, inplace=True)
print("AGE NaN count:", test_df['Age'].isnull().sum())

AGE NaN count: 0
AGE NaN count: 0


## 7. Modifying Sex - replacing male/female with 0/1

In [20]:
train_df['Sex'].replace(['male','female'],[0,1],inplace=True)
test_df['Sex'].replace(['male','female'],[0,1],inplace=True)

## 8. Deleting "Name"

In [21]:
train_df.drop('Name', 1, inplace=True)
test_df.drop('Name', 1, inplace=True)

<br>
<br>
<br>
# ======================== TRAINING! ========================
<br>

## ? Preprocessing? Scaling? ##

### 0. Загрузка данных из датафрейма

In [176]:
X = train_df.drop('Survived', 1)
y = train_df['Survived']

### 1. Шкалирование в [0, 1]

In [178]:
from sklearn.preprocessing import MinMaxScaler
mm_scaler = MinMaxScaler(feature_range=(0, 1))
X = mm_scaler.fit_transform(X)

### 2. Нормализация

In [163]:
# from sklearn.preprocessing import StandardScaler
# std_scaler = StandardScaler().fit(X)
# X = std_scaler.transform(X)

from sklearn.preprocessing import StandardScaler
std_scaler = StandardScaler()
X = std_scaler.fit_transform(X)

In [135]:
#importing all the required ML packages
from sklearn.linear_model import LogisticRegression #logistic regression
from sklearn import svm #support vector Machine
from sklearn.ensemble import RandomForestClassifier #Random Forest
from sklearn.neighbors import KNeighborsClassifier #KNN
from sklearn.naive_bayes import GaussianNB #Naive bayes
from sklearn.tree import DecisionTreeClassifier #Decision Tree
from sklearn.model_selection import train_test_split #training and testing data split
from sklearn import metrics #accuracy measure
from sklearn.metrics import confusion_matrix #for confusion matrix

In [156]:
X = train_df.drop('Survived', 1)
y = train_df['Survived']

In [190]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
knn = KNeighborsClassifier(n_neighbors=8, weights='distance')
scores = cross_val_score(knn, X, y, scoring='accuracy', cv=10)
print("Mean cross-validated score is:", scores.mean())

Mean cross-validated score is: 0.841769379185


In [172]:
from sklearn.linear_model import LogisticRegression
# Logistic regression
log_regression = LogisticRegression()
scores = cross_val_score(log_regression, X, y, scoring='accuracy', cv=10)
print("Mean is:", scores.mean())

Mean is: 0.838436045852


In [118]:
train,test=train_test_split(train_df,test_size=0.3,random_state=0,stratify=train_df['Survived'])
train_X=train[train.columns[1:]]
train_Y=train[train.columns[:1]]
test_X=test[test.columns[1:]]
test_Y=test[test.columns[:1]]
X=train_df[train_df.columns[1:]]
Y=train_df['Survived']

In [121]:
# SCALING AGAIN - DELETE ME IF NECESSARY
from sklearn.preprocessing import MinMaxScaler
#scaler = MinMaxScaler(feature_range=(0, 1))
scaler = StandardScaler()
rescaledX = scaler.fit_transform(X)
X = rescaledX
test_X = scaler.fit_transform(test_X)
test_Y = scaler.fit_transform(test_Y)
train_X = scaler.fit_transform(train_X)
train_Y = scaler.fit_transform(train_Y)

In [166]:
model=svm.SVC(kernel='rbf',C=1,gamma=0.1)
model.fit(train_X,train_Y)
prediction1=model.predict(test_X)
print('Accuracy for rbf SVM is ',metrics.accuracy_score(prediction1,test_Y))

  y = column_or_1d(y, warn=True)


ValueError: Unknown label type: 'continuous'

In [123]:
model=svm.SVC(kernel='linear',C=0.1,gamma=0.1)
model.fit(train_X,train_Y)
prediction2=model.predict(test_X)
print('Accuracy for linear SVM is',metrics.accuracy_score(prediction2,test_Y))

  y = column_or_1d(y, warn=True)


ValueError: Unknown label type: 'continuous'

In [124]:
from sklearn.model_selection import KFold #for K-fold cross validation
from sklearn.model_selection import cross_val_score #score evaluation
from sklearn.model_selection import cross_val_predict #prediction
kfold = KFold(n_splits=10, random_state=22) # k=10, split the data into 10 equal parts
xyz=[]
accuracy=[]
std=[]
classifiers=['Linear Svm','Radial Svm','Logistic Regression','KNN','Decision Tree','Naive Bayes','Random Forest']
models=[svm.SVC(kernel='linear'),svm.SVC(kernel='rbf'),LogisticRegression(),KNeighborsClassifier(n_neighbors=9),DecisionTreeClassifier(),GaussianNB(),RandomForestClassifier(n_estimators=100)]
for i in models:
    model = i
    cv_result = cross_val_score(model,X,Y, cv = kfold,scoring = "accuracy")
    cv_result=cv_result
    xyz.append(cv_result.mean())
    std.append(cv_result.std())
    accuracy.append(cv_result)
new_models_dataframe2=pd.DataFrame({'CV Mean':xyz,'Std':std},index=classifiers)       
new_models_dataframe2

Unnamed: 0,CV Mean,Std
Linear Svm,0.832747,0.028307
Radial Svm,0.851835,0.033319
Logistic Regression,0.840624,0.025045
KNN,0.846255,0.033273
Decision Tree,0.829388,0.023026
Naive Bayes,0.824931,0.028845
Random Forest,0.835044,0.018552


In [125]:
from sklearn.ensemble import VotingClassifier
ensemble_lin_rbf=VotingClassifier(estimators=[('KNN',KNeighborsClassifier(n_neighbors=10)),
                                              ('RBF',svm.SVC(probability=True,kernel='rbf',C=0.5,gamma=0.1)),
                                              ('RFor',RandomForestClassifier(n_estimators=500,random_state=0)),
                                              ('LR',LogisticRegression(C=0.05)),
                                              ('DT',DecisionTreeClassifier(random_state=0)),
                                              ('NB',GaussianNB()),
                                              ('svm',svm.SVC(kernel='linear',probability=True))
                                             ], 
                       voting='soft').fit(train_X,train_Y)
print('The accuracy for ensembled model is:',ensemble_lin_rbf.score(test_X,test_Y))
cross=cross_val_score(ensemble_lin_rbf,X,Y, cv = 10,scoring = "accuracy")
print('The cross validated score is',cross.mean())

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


ValueError: continuous is not supported

In [127]:
from sklearn.ensemble import AdaBoostClassifier
ada=AdaBoostClassifier(n_estimators=200,random_state=0,learning_rate=0.1)
result=cross_val_score(ada,X,Y,cv=10,scoring='accuracy')
print('The cross validated score for AdaBoost is:',result.mean())

The cross validated score for AdaBoost is: 0.844066791511


In [80]:
from sklearn.model_selection import GridSearchCV

n_estimators=list(range(100,1100,100))
learn_rate=[0.05,0.1,0.2,0.3,0.25,0.4,0.5,0.6,0.7,0.8,0.9,1]
hyper={'n_estimators':n_estimators,'learning_rate':learn_rate}
gd=GridSearchCV(estimator=AdaBoostClassifier(),param_grid=hyper,verbose=True)
gd.fit(X,Y)
print(gd.best_score_)
print(gd.best_estimator_)

Fitting 3 folds for each of 120 candidates, totalling 360 fits


[Parallel(n_jobs=1)]: Done 360 out of 360 | elapsed:  4.6min finished


0.849607182941
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=0.05, n_estimators=200, random_state=None)


In [128]:
X = train_df.drop('Survived', 1)
y = train_df['Survived']

model = AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=0.05, n_estimators=200, random_state=None)
result=cross_val_score(model, X, Y, cv=10, scoring='accuracy')
print('The cross validated score for AdaBoost is:',result.mean())

model.fit(X, y)

The cross validated score for AdaBoost is: 0.847387356713


AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=0.05, n_estimators=200, random_state=None)

In [119]:
X = test_df
x_hat = model.predict(X)

temp = pd.read_csv('test.csv')[['PassengerId', 'Sex']]
temp['Survived'] = x_hat
temp.drop('Sex', 1, inplace=True)

In [121]:
temp.to_csv("z.csv", index=False)

<br><br><br>

# АППЕНДИКС
Записать текущие **Pandas-датафреймы TRAIN_DF и TEST_DF** в файлы.

In [174]:
train_df.to_csv("train_df.csv", index=False)
test_df.to_csv("test_df.csv", index=False)