<a href="https://colab.research.google.com/github/NglQ/KaggleChallenges/blob/main/titanicPrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q kaggle

In [None]:
from google.colab import files
files.upload()

In [3]:
# Install the uploaded Kaggle API token where the kaggle CLI looks for it.
# `-p` keeps the cell re-runnable: plain `mkdir` fails once ~/.kaggle exists.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the token file to the owner (read/write only).
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
!kaggle competitions download -c titanic

In [None]:
!pip install skfeature-chappers

In [None]:
from sklearn import feature_selection
from numpy.core.numeric import NaN
import numpy as np
import pandas as pd
import sklearn as skl
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, chi2, RFE, SelectFromModel, mutual_info_classif
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

# Show every column when printing DataFrames. The fully-qualified option name
# is used because the short pattern 'max_columns' can match more than one
# registered option, which makes pandas raise OptionError.
pd.set_option('display.max_columns', None)

def importDataset(stringPath):
  """Read the CSV at ``stringPath`` with pandas and return the resulting DataFrame."""
  loadedFrame = pd.read_csv(stringPath)
  return loadedFrame

def splitTrainVal(dataset):
  """Randomly split ``dataset`` into a training part and a validation part.

  20% of the rows go to the validation part (``test_size=0.2``). No random
  seed is fixed, so the split differs between calls.

  Returns:
    A ``(train, validation)`` tuple.
  """
  return tuple(train_test_split(dataset, test_size=0.2))

def splitDataset(datasetTrain):
  """Separate the training frame into model inputs and the label column.

  Args:
    datasetTrain: DataFrame containing at least the 'Survived', 'Ticket',
      'PassengerId' and 'Cabin' columns. It is not modified.

  Returns:
    ``(features, target)``: ``features`` is ``datasetTrain`` without the four
    columns listed above; ``target`` is an independent copy of 'Survived'.
  """
  droppedColumns = ['Survived','Ticket','PassengerId', 'Cabin']
  # ``drop`` already returns a new frame, so no explicit copy is needed here.
  features = datasetTrain.drop(columns=droppedColumns)
  target = datasetTrain['Survived'].copy()
  return features, target


def featureEngineering(featuresE):
  """Build the model-ready feature table from raw passenger columns.

  The input must contain the columns 'Name', 'Sex', 'Pclass', 'Embarked',
  'SibSp', 'Parch', 'Fare' and 'Age'. The following transformations are applied:

  * a title is parsed out of 'Name' (text between the first ',' and the
    following '.'), grouped through ``Title_Dictionary`` and one-hot encoded
    as ``Title_*`` (titles missing from the dictionary get no indicator);
  * 'Sex', 'Pclass' and 'Embarked' are one-hot encoded; ``Sex_male`` is dropped
    because it is the complement of ``Sex_female``;
  * ``Family = SibSp + Parch`` and ``Fare_Person = Fare / (Family + 1)``;
  * missing ``Fare_Person`` values are replaced by the column median;
  * missing 'Age' values are predicted by a k-nearest-neighbours regressor
    (k = 3, or fewer when fewer rows have a known age) fitted on the rows
    whose age is known, using every other engineered column as predictor.

  Rows are returned in their original order with their original index, so the
  result stays aligned with any label / id series taken from the same frame.
  The input frame itself is left untouched.

  Args:
    featuresE: DataFrame with the raw columns listed above.

  Returns:
    A new DataFrame with the engineered columns. If 'Age' is missing in every
    row there is nothing to learn from, and the ages are left as NaN.

  Raises:
    KeyError: if a required column is absent, or if no row has Sex == 'male'
      (there is then no ``Sex_male`` column to drop).
  """
  Title_Dictionary = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir" : "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "the Countess":"Royalty",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr" : "Mr",
    "Mrs" : "Mrs",
    "Miss" : "Miss",
    "Master" : "Master",
    "Lady" : "Royalty"
  }

  # Work on a copy so that the caller's DataFrame is never modified.
  raw = featuresE.copy()

  # "Surname, Title. Given names" -> "Title", then collapse to a title group.
  rawTitles = raw['Name'].map(lambda name: name.split(',')[1].split('.')[0].strip())
  titles = pd.get_dummies(rawTitles.map(Title_Dictionary), prefix='Title')

  s = pd.get_dummies(raw["Sex"], prefix='Sex')
  pclass = pd.get_dummies(raw["Pclass"], prefix='Pclass')
  e = pd.get_dummies(raw["Embarked"], prefix='Embarked')

  raw['Family'] = raw['SibSp'] + raw['Parch']
  # Fare shared between the passenger and the relatives travelling along.
  raw['Fare_Person'] = raw['Fare'] / (raw['Family'] + 1)

  engineered = raw.drop(columns=['Sex','Pclass','Embarked', 'SibSp', 'Parch', 'Fare','Name'])
  engineered = engineered.join(s)
  engineered = engineered.join(pclass)
  engineered = engineered.join(e)
  engineered = engineered.join(titles)
  engineered = engineered.drop(columns='Sex_male')

  # Fill fares for every row, including those whose age is about to be
  # predicted, so the regressor never receives NaN predictors.
  engineered['Fare_Person'] = engineered['Fare_Person'].fillna(engineered['Fare_Person'].median())

  # Impute ages in place through a boolean mask: this keeps row order and index
  # intact, unlike splitting the frame and concatenating the pieces again.
  ageMissing = engineered['Age'].isna()
  ageKnown = ~ageMissing
  if ageMissing.any() and ageKnown.any():
    predictors = engineered.drop(columns='Age')
    neigh = KNeighborsRegressor(n_neighbors=min(3, int(ageKnown.sum())))
    neigh.fit(predictors[ageKnown], engineered.loc[ageKnown, 'Age'])
    engineered.loc[ageMissing, 'Age'] = neigh.predict(predictors[ageMissing])

  return engineered

def _printRanked(names, values, descending=True):
  """Print a ``{name: value}`` dict ordered by value (largest first by default)."""
  ranked = sorted(zip(names, values), key=lambda item: item[1], reverse=descending)
  print(dict(ranked))


def computeScores(features, target):
  """Print several feature-relevance diagnostics for ``features`` against ``target``.

  Nothing is returned; every result is printed. In order: summary statistics,
  the feature correlation matrix (also drawn with matplotlib), absolute
  correlation with the target, mutual information, RFE rankings obtained with
  a linear regression and with a linear SVM, chi-squared scores, and
  extra-trees feature importances. Each section lists the most relevant
  feature first.

  Args:
    features: DataFrame of numeric feature columns.
    target: label values, one per row of ``features``.
  """
  print(features.describe(include = "all"))

  print("correlation:")
  correlation = features.corr()  # computed once, used for both the print and the plot
  print(correlation)
  plt.matshow(correlation)
  plt.show()

  print("correlation with 'survived': ")
  absCorr = features.corrwith(target).abs()
  # Pair each value with the label it was computed for.
  _printRanked(absCorr.index, absCorr)

  print("mutual information:")
  # NOTE(review): discrete_features=True treats every column as discrete,
  # including continuous-looking ones such as an age column - confirm intended.
  mic = mutual_info_classif(features, target, discrete_features=True)
  _printRanked(features.columns, mic)

  print("Linear regression score: ")
  lr = LinearRegression()
  rfe = RFE(estimator=lr, n_features_to_select=1, step=1)
  rfe.fit(features, target)
  # ranking_ assigns 1 to the selected (best) feature, so sort ascending.
  _printRanked(features.columns, rfe.ranking_, descending=False)

  print("SVM score: ")
  svc = SVC(kernel="linear", C=1)
  rfeSvc = RFE(estimator=svc, n_features_to_select=1, step=1)
  rfeSvc.fit(features, target)
  _printRanked(features.columns, rfeSvc.ranking_, descending=False)

  print("chi^2 score: ")
  # Report the chi-squared statistic per feature rather than only the names
  # of the (all) selected columns.
  chi2Scores, _ = chi2(features, target)
  _printRanked(features.columns, chi2Scores)

  print("Tree score: ")
  clf = ExtraTreesClassifier(n_estimators=len(features.columns))
  clf = clf.fit(features, target)
  _printRanked(features.columns, clf.feature_importances_)

def trainModel(features, target):
  """Fit and return a random forest (``max_depth=3``, ``random_state=0``) on the given data."""
  forest = RandomForestClassifier(max_depth=3, random_state=0)
  # ``fit`` returns the fitted estimator itself.
  return forest.fit(features, target)

def makePredictions(model, testSetFeatures):
  """Return whatever ``model.predict`` yields for ``testSetFeatures``."""
  predicted = model.predict(testSetFeatures)
  return predicted

def writeCsv(idx, predictions):
  """Write the submission file 'outTitanic.csv' in the current directory.

  The file has a header row and two columns: 'PassengerId' (from ``idx``) and
  'Survived' (from ``predictions``). The DataFrame index is not written.
  """
  submission = pd.DataFrame({'PassengerId':idx, 'Survived':predictions})
  submission.to_csv(Path('outTitanic.csv'), index = False, header=True)


if __name__ == '__main__':
  # Load the Kaggle train / test CSV files from the working directory.
  dataset = importDataset('train.csv')
  dataset_test = importDataset('test.csv')

  # Build the training features and print the feature-relevance diagnostics.
  features, target = splitDataset(dataset)
  features = featureEngineering(features)
  computeScores(features, target)

  # The test file has no 'Survived' column, so only the unused columns are dropped.
  features_test = dataset_test.copy().drop(columns=['Ticket','PassengerId', 'Cabin'])
  features_test = featureEngineering(features_test)
  # One-hot columns depend on the categories present in each file. Align the
  # test frame with the training columns (same names, same order): indicators
  # absent from the test data are added as zeros, unknown ones are discarded.
  features_test = features_test.reindex(columns=features.columns, fill_value=0)

  model = trainModel(features, target)
  predictions = makePredictions(model, features_test)
  writeCsv(dataset_test['PassengerId'], predictions)
