In [None]:
"""
Created on Mon Feb  7 19:08:38 2022
Updated script for titanic, more pythonic version
"""

import os

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from xgboost import XGBRegressor



# Session setup: show up to 20 columns when printing frames, switch off the
# pandas chained-assignment warning, and make the project folder the working
# directory so the relative CSV paths used below resolve.
pd.options.display.max_columns = 20
pd.options.mode.chained_assignment = None
os.chdir("H:/Dropbox/Kaggle/titanic")


#%% Import data ###

# Read the training sample (titanic_fullsample) and print a short preview
# followed by its (rows, columns) shape.
df = pd.read_csv("train.csv")
print(df.head(), df.shape, sep="\n")

#%% Encode categorical variables ###

# Remove the free-text columns and fill missing ages with the median age.
df = df.drop(columns=['Name', 'Ticket', 'Cabin'])
df['Age'] = df['Age'].fillna(df['Age'].median())

# Cap the family counters: every value above 2 is stored as 3.
df['SibSp'] = df['SibSp'].mask(df['SibSp'] > 2, 3)
df['Parch'] = df['Parch'].mask(df['Parch'] > 2, 3)

# Number of distinct values per column (a missing value counts as one level).
df_uniques = (
    pd.Series({name: len(df[name].unique()) for name in df.columns}, name='Unique Values')
    .rename_axis('Variable')
    .to_frame()
)
df_uniques

# Classify columns by cardinality: exactly 2 levels -> binary, 3 to 6 levels
# -> categorical, everything else -> numeric.
unique_counts = df_uniques['Unique Values']
binary_variables = unique_counts[unique_counts == 2].index.tolist()
categorical_variables = unique_counts[(unique_counts > 2) & (unique_counts <= 6)].index.tolist()
# Computed while 'Survived' is still listed as binary, so the target never
# ends up among the numeric features.
numeric_variables = list(set(df.columns).difference(categorical_variables, binary_variables))

# 'Survived' is the target and must not be re-encoded as a feature.
binary_variables.remove('Survived')
lb = LabelBinarizer()

for column in binary_variables:
    df[column] = lb.fit_transform(df[column])

# One-hot encode the categorical columns, dropping the first level of each.
df = pd.get_dummies(df, columns=categorical_variables, drop_first=True)

print(df.head(10))

# %% scaling ###

# Split the encoded frame into the target and the feature matrix.
y = df['Survived']
X = df.drop(columns=['Survived'])

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Learn the min/max of each numeric column from the training split only and
# reuse those values for the hold-out split. Re-fitting the scaler on X_test
# would put the two splits on different scales and leak hold-out statistics
# into the evaluation.
mm = MinMaxScaler()
X_train[numeric_variables] = mm.fit_transform(X_train[numeric_variables])
X_test[numeric_variables] = mm.transform(X_test[numeric_variables])

#%% Logistic regression ###

# 20-fold cross-validated grid search over the inverse regularisation
# strength C of an L2-penalised logistic regression; prints the best mean CV
# score and the parameters that achieved it.
grid_values = dict(
    penalty=['l2'],
    C=[1, 2, 4, 10, 15, 20, 25, 30, 40, 100],
)
lr = LogisticRegression()
model_lr = GridSearchCV(estimator=lr, param_grid=grid_values, cv=20)
model_lr.fit(X_train, y_train)
print(model_lr.best_score_, model_lr.best_params_)

# model_lr.predict(X_test)

#%% KNN ###

# 20-fold cross-validated grid search over the neighbourhood size
# (k = 1 .. 39) of a k-nearest-neighbours classifier.
grid_values = {'n_neighbors': np.arange(1, 40)}
knnm = KNeighborsClassifier()
model_knn = GridSearchCV(estimator=knnm, param_grid=grid_values, cv=20)
model_knn.fit(X_train, y_train)
print(model_knn.best_score_, model_knn.best_params_)


#%% SVM ###

# 20-fold cross-validated grid search over the penalty C (0.05 up to, but
# excluding, 1 in steps of 0.05) of an RBF-kernel support vector classifier.
grid_values = dict(C=np.arange(0.05, 1, 0.05))
svmm = svm.SVC(kernel='rbf')
model_svm = GridSearchCV(estimator=svmm, param_grid=grid_values, cv=20)
model_svm.fit(X_train, y_train)
print(model_svm.best_score_, model_svm.best_params_)

#%% RF ###

# may look here: https://www.geeksforgeeks.org/hyperparameter-tuning/


# 20-fold cross-validated grid search (scored on accuracy) over tree depth
# and the fraction of features considered per split of a random forest.
grid_values = [
    dict(
        max_depth=list(range(2, 9)),
        max_features=list(np.arange(0.2, 0.71, 0.05)),
    )
]
rfc = RandomForestClassifier(random_state=42)
model_rf = GridSearchCV(estimator=rfc, param_grid=grid_values, scoring='accuracy', cv=20)
model_rf.fit(X_train, y_train)
print(model_rf.best_score_, model_rf.best_params_)


#%% XGBoost? ###
# run this code only on Kaggle with GPU
# see kaggle for updated code

# Gradient-boosted tree classifier with a logistic objective. `n_jobs` and
# `random_state` are the scikit-learn-API names for the thread count and the
# seed (the older `nthread` / `seed` spellings are aliases for them).
# XGBClassifier must be imported from xgboost at the top of the file.
estimator = XGBClassifier(
    objective='binary:logistic',
    n_jobs=4,
    random_state=42,
)

# Candidate settings: tree depth 2-3, 20-49 boosting rounds, two step sizes.
parameters = {
    'max_depth': range(2, 4),
    'n_estimators': range(20, 50),
    'learning_rate': [0.05, 0.1],
}

# 10-fold cross-validated search scored on ROC AUC, run with 10 parallel jobs.
grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=parameters,
    scoring='roc_auc',
    n_jobs=10,
    cv=10,
    verbose=True,
)

grid_search.fit(X_train, y_train)
print(grid_search.best_score_, grid_search.best_params_)





#%%  ###


# Sweep the inverse regularisation strength C of a liblinear logistic
# regression and record, for each value, the mean absolute difference between
# y and the 10-fold out-of-fold predictions.
A = np.arange(0.6, 1.01, 0.01)
mae_logm_ar = np.zeros_like(A)

for i, a in enumerate(A):
    logm = LogisticRegression(C=a, solver='liblinear')
    yhat_logm = cross_val_predict(logm, X, y, cv=10)
    mae_logm_ar[i] = np.abs(np.asarray(y) - yhat_logm).mean()

mae_a = pd.DataFrame(dict(a=A, mae=mae_logm_ar), columns=['a', 'mae'])
print(mae_a)

# C=1 seems to work best, 18.5% mae.

# Fit the chosen configuration on the full training data.
fullmodel = LogisticRegression(C=1, solver='liblinear').fit(X, y)

#%%  ###


# Standardise every feature to zero mean and unit variance. From here on X is
# a NumPy array, and the cells below work on this standardised matrix.
# `preprocessing` is sklearn.preprocessing and must be imported at the top of
# the file.
X = preprocessing.StandardScaler().fit_transform(X.astype(float))

# Sweep the neighbourhood size k = 10 .. 39 and record, for each value, the
# mean absolute difference between y and the 45-fold out-of-fold predictions.
A = np.arange(10, 40)
mae_knnm_ar = np.zeros(len(A))

for i, a in enumerate(A):
    knnm = KNeighborsClassifier(n_neighbors=a)
    yhat_knnm = cross_val_predict(knnm, X, y, cv=45)
    mae_knnm_ar[i] = np.mean(np.abs(np.array(y) - yhat_knnm))

mae_a = pd.DataFrame({'a': A, 'mae': mae_knnm_ar}, columns=['a', 'mae'])
print(mae_a)

# k=30 seems the best, mae around 20.2%.

#%%  ###


# Depth-limited random-forest regressor: compute 25-fold out-of-fold
# predictions and print their mean absolute difference from y.
rfm = RandomForestRegressor(max_depth=12, random_state=1)
# rfm = RandomForestRegressor(random_state=1, max_depth=12, max_features='sqrt')


yhat_rfm = cross_val_predict(rfm, X, y, cv=25)
mae_rfm = np.abs(np.asarray(y) - yhat_rfm).mean()
print(mae_rfm)

# max_depth=12 seems optimal, mae=25.8%.

#%%  ###


# Sweep the penalty C of an RBF-kernel SVC and record, for each value, the
# mean absolute difference between y and the 21-fold out-of-fold predictions.
A = np.arange(0.3, 1.1, 0.1)
mae_svmm_ar = np.zeros_like(A)

for i, a in enumerate(A):
    svmm = svm.SVC(C=a, kernel='rbf')
    yhat_svmm = cross_val_predict(svmm, X, y, cv=21)
    mae_svmm_ar[i] = np.abs(np.asarray(y) - yhat_svmm).mean()

mae_a = pd.DataFrame(dict(a=A, mae=mae_svmm_ar), columns=['a', 'mae'])
print(mae_a)

# at C=0.4 mae is 18.7%

# Fit the chosen configuration on the full training data.
fullmodel = svm.SVC(C=0.4, kernel='rbf').fit(X, y)












#%%  ###


# Load the hold-out passengers and keep their ids for the submission file.
tests = pd.read_csv("test.csv") # titanic_fullsample
print(tests.head())
print(tests.shape)
passenger_ids = tests['PassengerId']


# Mirror the encoding applied to the training frame `df`, so the model is
# handed the same columns it was fitted on: the same three columns dropped
# (including Cabin) and the family counters capped at 3.
# NOTE(review): training encodes Sex with LabelBinarizer; male -> 1 is
# expected to match its sorted-label output -- confirm.
tests = tests.drop(columns=['Name', 'Ticket', 'Cabin'])
tests['Sex'] = (tests['Sex'] == 'male') * 1
tests.loc[tests.SibSp > 2, 'SibSp'] = 3
tests.loc[tests.Parch > 2, 'Parch'] = 3
print(tests.head(10))
print(tests.describe(include='all'))


# Fill gaps with the column mean. Direct column assignment is used because
# chained indexing (`tests['Age'][mask] = ...`) may write to a temporary copy
# and leave the frame unchanged.
tests['Age'] = tests['Age'].fillna(tests['Age'].mean())
tests['Fare'] = tests['Fare'].fillna(tests['Fare'].mean())

# One-hot encode the columns that were one-hot encoded for training, then
# align to the training feature layout: dummy levels the training frame does
# not have are discarded, and training columns absent here are added as 0.
feature_columns = df.columns.drop('Survived')
tests = pd.get_dummies(tests, columns=categorical_variables)
tests = tests.reindex(columns=feature_columns, fill_value=0)

print(tests.head(10))

#%%  ###


# `fullmodel` (the SVC fitted in the SVM cell) was trained on the
# standardised training matrix. That scaler was not kept, so rebuild it from
# the same encoded training features and apply it to the test features.
train_features = df.drop(columns=['Survived']).astype(float)
scaler = preprocessing.StandardScaler().fit(train_features)
X = scaler.transform(tests.astype(float))

# The classifier already returns 0/1 labels, so no thresholding is applied.
# The former quantile threshold subtracted the leftover loop variable `a`
# (the last C value of the SVM sweep, about 1), which pushed the requested
# quantile below 0.
yhat = fullmodel.predict(X).astype(int)

results = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': yhat}, columns=['PassengerId', 'Survived'])

results.to_csv('Titanic_subm6.csv', index=False)












## aside:creating binary variables.     df['weekend'] = df['dayofweek'].apply(lambda x: 1 if (x>3)  else 0)
##                                      df['Gender'].replace(to_replace=['male','female'], value=[0,1],inplace=True)
## creating dummies                     Feature = pd.concat([Feature,pd.get_dummies(df['education'])], axis=1)

#%%  ###



