In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.linear_model import RidgeClassifierCV

In [2]:
os.chdir('E:\\Machine learning\\titanic')
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
train_id = pd.DataFrame(train['PassengerId'])
test_id = pd.DataFrame(test['PassengerId'])

full_data = [train, test]

for data in full_data:
    data['Has_Cabin'] = data["Cabin"].apply(lambda x: 0 if type(x) == float else 1) # if cabin is alloted or not.
    
    data['family_count'] = data['SibSp'] + data['Parch'] + 1
    
    data['alone'] = 0
    data.loc[data['family_count'] == 1,'alone'] = 1                                 # if the person is alone
    
    data['Embarked'] = data['Embarked'].fillna('S')
    
    null_age_train = np.random.randint(data['Age'].mean() - data['Age'].std(),
                                       data['Age'].mean() + data['Age'].std(),
                                       size=data['Age'].isnull().sum())             # random null age values
    data['Age'][np.isnan(data['Age'])] = null_age_train
    
    data['Age'] = pd.qcut(data['Age'],[0,0.25,0.50,0.75,1.0])
    
    data['Fare'] = data['Fare'].fillna(14)
    
    data['Fare'] = pd.qcut(data['Fare'],[0,0.25,0.50,0.75,1.0])
           
    data['Title'] = data['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]
    data['Title'] = data['Title'].replace(['Lady', 'Countess','Capt','Mme', 'Ms',
                                           'Mlle', 'Col','Don','Dr', 'Major', 'Rev',
                                           'Sir', 'Jonkheer', 'Dona','the Countess'], 'Rare')
    
    
    data.drop(['PassengerId', 'Name', 'Ticket','Cabin','SibSp'],inplace = True,axis = 1)
    


    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [5]:
x_train = train.drop(['Survived'],axis=1)
y_train = train.drop(x_train,axis=1)
y_train = np.ravel(y_train)
x_train = pd.get_dummies(x_train)

In [10]:
def score(model,x):
    acc = model.score(x,y_train)                                                                  #accuracy  
    a = pd.DataFrame(model.predict(x))
    a[1] = y_train
    r = a[(a[0] == a[1]) & (a[0] == 1)].shape[0] / a[a[1] == 1].shape[0]                          #recall
    p = a[(a[0] == a[1]) & (a[0] == 1)].shape[0] / a[a[0] == 1].shape[0]                          #precision
    f1 = (2*p*r)/(p+r)                                                                            #f1 score 
    
    return([acc*100,r*100,p*100,f1*100])

In [11]:
    
model_1 = RandomForestClassifier(n_estimators=100,oob_score=True,max_features="sqrt",random_state=200)
model_1.fit(x_train,y_train)

model_2 = RidgeClassifierCV(alphas=(0.01,0.1,1.0),fit_intercept=True,normalize=True,cv=5)
model_2.fit(x_train,y_train)

model_3 = GradientBoostingClassifier(n_estimators=700,random_state=400)
model_3.fit(x_train,y_train)

model_4 = AdaBoostClassifier(n_estimators=700,random_state=400)
model_4.fit(x_train,y_train)

model1_proba = pd.DataFrame(model_1.predict_proba(x_train)[:,1])
model1_proba = model1_proba[0].map(lambda x: int(x*10) if x != 1 else 1)

model2_pred = pd.DataFrame(model_2.predict(x_train))           #Ridge classifier belongs to classs of linear models,
                                                              #so parameter is different
model3_proba = pd.DataFrame(model_3.predict_proba(x_train)[:,1])
model3_proba = model3_proba[0].map(lambda x: int(x*10) if x != 1 else 1)

model4_proba = pd.DataFrame(model_4.predict_proba(x_train)[:,1])
model4_proba = model4_proba[0].map(lambda x: int(x*10) if x != 1 else 1)

final_data = pd.DataFrame({'Random Forest': np.ravel(model1_proba), 'Linear regression':np.ravel(model2_pred),
                           'GradientBoosting':np.ravel(model3_proba),'AdaBoost':np.ravel(model4_proba)})

model_5= RandomForestClassifier(n_estimators=100,oob_score=True,max_features="sqrt",random_state=200)
model_5.fit(final_data,y_train)                               #Final model based on results of all models.

result = pd.DataFrame({'Overall':np.ravel(score(model_5,final_data)), 
                       'Random Forest': np.ravel(score(model_1,x_train)),
                       'Linear regression':np.ravel(score(model_2,x_train)),
                       'GradientBoosting':np.ravel(score(model_3,x_train)),
                       'AdaBoost':np.ravel(score(model_4,x_train))}, 
                       index={'Accuracy','Recall','Precision','F1 Score'})

result

Unnamed: 0,Overall,Random Forest,Linear regression,GradientBoosting,AdaBoost
F1 Score,90.572391,90.572391,83.72615,89.674523,82.940516
Accuracy,83.040936,83.040936,75.146199,81.578947,76.608187
Recall,91.612903,91.612903,81.072555,90.584416,78.443114
Precision,87.116564,87.116564,77.996965,85.846154,77.514793


In [58]:
x_test = pd.get_dummies(test)
final_y = pd.DataFrame({'PassengerID':np.ravel(test_id),'Survuval':np.ravel(model_1.predict(x_test))})
final_y.to_csv('Titanic Submission_2.csv',header=['PassengerId','Survived'],index=False)
