## LOAD DATA

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pandas.plotting import scatter_matrix
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, precision_recall_curve
from sklearn.model_selection import train_test_split, cross_val_predict, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder


"""
survival = 0(No) / 1(yes)
pclass = [1: 1sr class, 2: 2nd class, 3: 3rd class]
sex = M / F
Age --
sibsp = The number of siblings and/or spouses aboard the titanic
parch = The number of parents and/or children on the titanic
ticket = The ticket number
fare = The passenger fare
cabin = The cabin number
embarked = The port of Embarkation [C=Cherbourg, Q=Queenstown, S=Southampton]

"""



'\nsurvival = 0(No) / 1(yes)\npclass = [1: 1sr class, 2: 2nd class, 3: 3rd class]\nsex = M / F\nAge --\nsibsp = The number of siblings and/or spouses aboard the titanic\nparch = The number of parents and/or children on the titanic\nticket = The ticket number\nfare = The passenger fare\ncabin = The cabin number\nembarked = The port of Embarkation [C=Cherbourg, Q=Queenstown, S=Southampton]\n\n'

In [2]:
all_data =pd.read_csv("train.csv", delimiter=",")

training_set, testing_set = train_test_split(all_data, test_size=0.2, random_state=42)

training_y = training_set["Survived"]
training_X = training_set.drop(["Survived", "PassengerId", "Name", "Ticket", "Cabin"], axis=1)

testing_y = testing_set["Survived"]
testing_X = testing_set.drop(["Survived", "PassengerId", "Name", "Ticket", "Cabin"], axis=1)

In [3]:
testing_set.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
709,710,1,3,"Moubarek, Master. Halim Gonios (""William George"")",male,,1,1,2661,15.2458,,C
439,440,0,2,"Kvillner, Mr. Johan Henrik Johannesson",male,31.0,0,0,C.A. 18723,10.5,,S
840,841,0,3,"Alhomaki, Mr. Ilmari Rudolf",male,20.0,0,0,SOTON/O2 3101287,7.925,,S
720,721,1,2,"Harper, Miss. Annie Jessie ""Nina""",female,6.0,0,1,248727,33.0,,S
39,40,1,3,"Nicola-Yarred, Miss. Jamila",female,14.0,1,0,2651,11.2417,,C
290,291,1,1,"Barber, Miss. Ellen ""Nellie""",female,26.0,0,0,19877,78.85,,S
300,301,1,3,"Kelly, Miss. Anna Katherine ""Annie Kate""",female,,0,0,9234,7.75,,Q
333,334,0,3,"Vander Planke, Mr. Leo Edmondus",male,16.0,2,0,345764,18.0,,S
208,209,1,3,"Carr, Miss. Helen ""Ellen""",female,16.0,0,0,367231,7.75,,Q
136,137,1,1,"Newsom, Miss. Helen Monypeny",female,19.0,0,2,11752,26.2833,D47,S


## PREPROCESING

In [4]:
# Step 1: handle missing data
#    Imputer will populate missing values for numerical data
#    Std_Scaler will scale values so to be in similar ranges

numerical_columns = ["Pclass", "Age", "SibSp", "Parch", "Fare"]
categorical_columns = ["Sex", "Embarked"]

numerical_pipeline = Pipeline([("Imputer", SimpleImputer(strategy="median")),
                              ("Std_Scaler", StandardScaler())])


In [5]:
training_X["Embarked"].fillna(training_X["Embarked"].value_counts().keys()[0], inplace=True)
testing_X["Embarked"].fillna(testing_X["Embarked"].value_counts().keys()[0], inplace=True)

In [6]:
full_pipeline = ColumnTransformer([("numerical", numerical_pipeline, numerical_columns),
                                  ("categorical", OneHotEncoder(), categorical_columns)])
preprocessed_training_data = full_pipeline.fit_transform(training_X)
preprocessed_testing_data = full_pipeline.transform(testing_X)

In [7]:
processed_training_X = pd.DataFrame(preprocessed_training_data)
processed_testing_X = pd.DataFrame(preprocessed_testing_data)

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVR

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(processed_training_X, training_y)


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [9]:
predictions = clf.predict(processed_testing_X)

In [10]:
y_pred = cross_val_predict(clf, processed_training_X, training_y, cv=3)

In [11]:
confusion = confusion_matrix(training_y, y_pred)
confusion

array([[376,  68],
       [ 78, 190]], dtype=int64)

In [12]:
f1 = f1_score(training_y, y_pred)
f1

0.7224334600760457

In [15]:
param_grid = [
        {'kernel': ['linear'], 'C': [10., 30., 100., 300., 1000., 3000., 10000., 30000.0]},
        {'kernel': ['rbf'], 'C': [1.0, 3.0, 10., 30., 100., 300., 1000.0],
         'gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0]},
    ]

svm_reg = SVR()
grid_search = GridSearchCV(svm_reg, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)
grid_search.fit(processed_training_X, training_y)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:  9.4min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='scale', kernel='rbf',
                           max_iter=-1, shrinking=True, tol=0.001,
                           verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid=[{'C': [10.0, 30.0, 100.0, 300.0, 1000.0, 3000.0,
                                10000.0, 30000.0],
                          'kernel': ['linear']},
                         {'C': [1.0, 3.0, 10.0, 30.0, 100.0, 300.0, 1000.0],
                          'gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0],
                          'kernel': ['rbf']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=2)

In [16]:
grid_search.best_params_

{'C': 1.0, 'gamma': 0.1, 'kernel': 'rbf'}

In [19]:
classifier = SVR(kernel="rbf", gamma=0.1, C=1.0)
classifier.fit(processed_training_X, training_y)

y_pred = cross_val_predict(classifier, processed_training_X, training_y, cv=3)


In [20]:
f1 = f1_score(training_y, y_pred)
f1

ValueError: Classification metrics can't handle a mix of binary and continuous targets