In [1]:
# final project for IBM ML 3 Regression

import os

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Show up to 20 columns when a DataFrame is printed.
pd.options.display.max_columns = 20

In [2]:
# Read the Titanic training CSV and print a five-row preview followed by
# the (rows, columns) shape.
path = '../input/titanic/train.csv'
data = pd.read_csv(filepath_or_buffer=path)  # titanic_fullsample
print(data.head(), data.shape, sep='\n')

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
(8

In [None]:
# --- Feature engineering on the training frame ---------------------------
# Free-text columns are not used as features.
data = data.drop(columns=['Name', 'Ticket'])

# Binary encodings: Sex -> 1 for 'male', Cabin -> 1 when a cabin value is present.
data['Sex'] = (data['Sex'] == 'male').astype(int)
data['Cabin'] = data['Cabin'].notna().astype(int)

# Quick look at the frame and at the categorical columns encoded below.
print(data.head(10))
print(data.describe(include='all'))
for col in ('SibSp', 'Parch', 'Embarked', 'Pclass'):
    print(data[col].value_counts())

# Indicator columns for the family counts: one per value 0, 1, 2 and a
# pooled "3" bucket for every value above 2.
for col in ('Parch', 'SibSp'):
    for level in (0, 1, 2):
        data[f'{col}{level}'] = (data[col] == level).astype(int)
    data[f'{col}3'] = (data[col] > 2).astype(int)

# One indicator per embarkation port and per passenger class. A row whose
# Embarked value is missing gets 0 in all three port columns.
for port in ('S', 'C', 'Q'):
    data[f'Embarked{port}'] = (data['Embarked'] == port).astype(int)
for pclass in (1, 2, 3):
    data[f'Pclass{pclass}'] = (data['Pclass'] == pclass).astype(int)

# The source columns are now fully represented by their indicators.
data = data.drop(columns=['Parch', 'SibSp', 'Embarked', 'Pclass'])

# Impute missing ages with the mean age. The whole column is assigned at
# once: chained indexing (data['Age'][mask] = ...) may write to a temporary
# copy and leave the NaNs in place.
data['Age'] = data['Age'].fillna(data['Age'].mean())

print(data.head(10))

In [None]:
# Separate the prepared frame into the feature matrix and the target.
# Every column other than 'Survived' is kept as a feature.
# NOTE(review): that includes PassengerId, which looks like a row id rather
# than a predictor -- confirm it is meant to be a feature.
X = data.drop('Survived', axis='columns')
y = data.loc[:, 'Survived']

In [None]:

# Sweep the logistic-regression parameter C over np.arange(0.6, 1.01, 0.01) and score
# each value with 10-fold cross-validated predictions.
A = np.arange(0.6, 1.01, 0.01)
mae_logm_ar = np.zeros(len(A))

for i, a in enumerate(A):
    logm = LogisticRegression(solver='liblinear', C=a)
    yhat_logm = cross_val_predict(logm, X, y, cv=10)
    # Mean absolute difference between the true and the predicted labels.
    mae_logm_ar[i] = np.abs(y.to_numpy() - yhat_logm).mean()

mae_a = pd.DataFrame(dict(a=A, mae=mae_logm_ar), columns=['a', 'mae'])
print(mae_a)

# C=1 seems to work best, 18.5% mae.

# Refit the chosen configuration on all training rows.
fullmodel = LogisticRegression(solver='liblinear', C=1)
fullmodel.fit(X, y)

In [None]:

# --- k-nearest neighbours: sweep k on standardized features --------------
# Standardize every feature column. `preprocessing` is provided by
# `from sklearn import preprocessing` in the import cell. The fitted scaler
# is kept in `scaler` so the same transformation can be applied to other
# data later.
# NOTE: from here on X is a standardized NumPy array, not the DataFrame the
# logistic-regression cell above was fitted on.
scaler = preprocessing.StandardScaler()
X = scaler.fit_transform(np.asarray(X, dtype=float))

# Candidate neighbourhood sizes k = 10 .. 39, scored with 45-fold
# cross-validated predictions.
A = np.arange(10, 40)
mae_knnm_ar = np.zeros(len(A))

for i, a in enumerate(A):
    knnm = KNeighborsClassifier(n_neighbors=a)
    yhat_knnm = cross_val_predict(knnm, X, y, cv=45)
    # Mean absolute difference between the true and the predicted labels.
    mae_knnm_ar[i] = np.mean(np.abs(np.array(y) - yhat_knnm))

mae_a = pd.DataFrame({'a': A, 'mae': mae_knnm_ar}, columns=['a', 'mae'])
print(mae_a)

# k=30 seems the best, mae around 20.2%.

In [None]:

# Random forest scored with 25-fold cross-validated predictions.
# NOTE(review): this is a regressor, so its predictions are presumably
# continuous scores rather than hard 0/1 labels; the MAE below may not be
# directly comparable with the error figures of the classifiers.
rfm = RandomForestRegressor(max_depth=12, random_state=1)
# rfm = RandomForestRegressor(random_state=1, max_depth=12, max_features='sqrt')

yhat_rfm = cross_val_predict(rfm, X, y, cv=25)
mae_rfm = np.abs(y.to_numpy() - yhat_rfm).mean()
print(mae_rfm)

# max_depth=12 seems optimal, mae=25.8%.

In [None]:

# Sweep the penalty parameter C of an RBF-kernel SVC and score each value
# with 21-fold cross-validated predictions.
A = np.arange(0.3, 1.1, 0.1)
mae_svmm_ar = np.zeros(len(A))

for i, a in enumerate(A):
    svmm = svm.SVC(kernel='rbf', C=a)
    yhat_svmm = cross_val_predict(svmm, X, y, cv=21)
    # Mean absolute difference between the true and the predicted labels.
    mae_svmm_ar[i] = np.abs(y.to_numpy() - yhat_svmm).mean()

mae_a = pd.DataFrame(dict(a=A, mae=mae_svmm_ar), columns=['a', 'mae'])
print(mae_a)

# at C=0.4 mae is 18.7%

# Refit the chosen configuration on all training rows; this replaces any
# model previously stored in `fullmodel`.
fullmodel = svm.SVC(kernel='rbf', C=0.4)
fullmodel.fit(X, y)

In [None]:

# --- Load the test split and apply the training feature engineering ------
# Read from the same input directory as the training CSV.
tests = pd.read_csv('../input/titanic/test.csv') # titanic_fullsample
print(tests.head())
print(tests.shape)

# Free-text columns are not used as features.
tests = tests.drop(columns=['Name', 'Ticket'])

# Binary encodings: Sex -> 1 for 'male', Cabin -> 1 when a cabin value is present.
tests['Sex'] = (tests['Sex'] == 'male').astype(int)
tests['Cabin'] = tests['Cabin'].notna().astype(int)
print(tests.head(10))
print(tests.describe(include='all'))

# Indicator columns for the family counts: one per value 0, 1, 2 and a
# pooled "3" bucket for every value above 2.
for col in ('Parch', 'SibSp'):
    for level in (0, 1, 2):
        tests[f'{col}{level}'] = (tests[col] == level).astype(int)
    tests[f'{col}3'] = (tests[col] > 2).astype(int)

# One indicator per embarkation port and per passenger class.
for port in ('S', 'C', 'Q'):
    tests[f'Embarked{port}'] = (tests['Embarked'] == port).astype(int)
for pclass in (1, 2, 3):
    tests[f'Pclass{pclass}'] = (tests['Pclass'] == pclass).astype(int)

# The source columns are now fully represented by their indicators.
tests = tests.drop(columns=['Parch', 'SibSp', 'Embarked', 'Pclass'])

# Impute missing ages with the mean age of this frame. The whole column is
# assigned at once: chained indexing (tests['Age'][mask] = ...) may write to
# a temporary copy and leave the NaNs in place.
tests['Age'] = tests['Age'].fillna(tests['Age'].mean())

print(tests.head(10))

In [None]:

# --- Predict on the test set and write the submission file ---------------
# Assumes `fullmodel` is the RBF SVC fitted in the SVC cell (the last
# assignment to `fullmodel` when the notebook is run top to bottom). That
# model was trained on standardized features, so the test features need the
# same scaling. The scaler is re-fitted here on the unscaled training
# features (`data` without the target), which yields the same per-column
# mean and standard deviation that were used to standardize the training
# matrix.
train_features = data.drop(columns='Survived').astype(float)
feature_scaler = preprocessing.StandardScaler().fit(train_features)

# Same columns, same order as in training; the training matrix X is left
# untouched.
X_test = tests[train_features.columns].copy()
# Fill missing fares with the mean fare of the test frame.
X_test.loc[X_test.Fare.isna(), 'Fare'] = X_test.Fare.mean()
X_test = feature_scaler.transform(X_test.astype(float))

# A classifier's predict() already returns class labels, so no score
# threshold is applied; only the dtype is normalized for the CSV.
yhat = fullmodel.predict(X_test).astype(int)

results = pd.DataFrame({'PassengerId': tests.PassengerId, 'Survived': yhat}, columns=['PassengerId', 'Survived'])

results.to_csv('Titanic_subm6.csv', index=False)












## Aside: creating binary variables.    df['weekend'] = df['dayofweek'].apply(lambda x: 1 if (x>3)  else 0)
##                                      df['Gender'].replace(to_replace=['male','female'], value=[0,1],inplace=True)
## Creating dummies.                    Feature = pd.concat([Feature,pd.get_dummies(df['education'])], axis=1)