In [1]:
import os
import tarfile
from six.moves import urllib
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import SGDClassifier,LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.externals import joblib




sns.set(style="ticks", color_codes=True)



In [2]:
titanic = pd.read_csv("datasets/titanic/train.csv")
titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [3]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
titanic.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
titanic["Survived"].value_counts()

0    549
1    342
Name: Survived, dtype: int64

#### data is mostly balanced so i'll use accuracy as a metric

### Exploratery data analysis
TODO ... 

### Data cleaning

In [6]:
X = titanic.drop('Survived',axis=1)
Y = titanic['Survived']

In [7]:
from sklearn.model_selection import train_test_split
X_train,x_test,Y_train,y_test = train_test_split(X,Y,test_size=0.2)

In [8]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names]

In [9]:
num_pipeline = Pipeline(steps=[
    ("select_numerator",DataFrameSelector(["Age", "SibSp", "Parch", "Fare"])),
    ("imputer",SimpleImputer(strategy='median')),
    ("scaler",StandardScaler())
])
cat_pipeline = Pipeline(steps=[
    ("select_categorical",DataFrameSelector(["Pclass", "Sex", "Embarked"])),
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('onehot', OneHotEncoder(sparse=False,handle_unknown='ignore'))
])

preprocessing = FeatureUnion(transformer_list=[
    ("num_pipeline",num_pipeline),
    ("cat_pipeline",cat_pipeline)
])
#lw msh ht3bt parameter el attributes est5dm featureUnion 
#lw htb3t num_attribe lel pipeline el awl wel cat_attrib lel pipeline el tany est5dm ColumnTransformer

### Training a models

In [16]:
logistic_pipeline = Pipeline(steps=[
    ('preprocess',preprocessing),
    ('logistic_reg',LogisticRegression())
])
logistic_pipeline.fit(X,Y)
y_pred = logistic_pipeline.predict(x_test)
print(accuracy_score(y_test,y_pred))

0.7821229050279329


In [17]:
X.shape

(891, 11)

In [18]:
from sklearn.svm import SVC
svc_pipeline = Pipeline(steps=[
    ('preprocess',preprocessing),
    ('svc',SVC(gamma="auto"))
])
score = cross_val_score(svc_pipeline, X_train, Y_train, cv=10) 
acc = score.mean()
print(acc)

0.8314749608763693


In [19]:
#because i have test data seperated in another file i'll use only X,y with cv instead of X_train,X_test

classifiers = [
    KNeighborsClassifier(4),
    LogisticRegression(n_jobs=-1),
    SGDClassifier(max_iter=1000,penalty='l2' ,tol=1e-3),
    LinearSVC(C=10, loss="hinge"),
    SVC(kernel="poly", degree=3, coef0=1, C=5),
    SVC(kernel="rbf", gamma =5 ,C=0.025, probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    VotingClassifier(estimators=[
        ('lr', LogisticRegression()), 
        ('rf', RandomForestClassifier()), 
        ('svc', SVC(kernel="rbf", gamma =5 ,C=0.025, probability=True))],
        voting='hard'),
    ]
for classifier in classifiers:
    pipe = Pipeline(steps=[('preprocessing', preprocessing),
                      ('classifier', classifier)])  
    scores = cross_val_score(pipe, X, Y, cv=10) 
   
    '''
    hint:
    cv =5,means train= 60%,test=60%,validation=60%
    if you are not going to use cross_val_score
    replace scores line with
    pipe.fit(X_train,Y_train,scoring="neg_mean_squared_error")
    '''
    
    print(classifier)
    print("model score:",scores.mean())
#     print("model score",accuracy_score(pipe.predict(x_test),y_test))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=4, p=2,
                     weights='uniform')
model score: 0.8013982521847691
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=-1, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
model score: 0.7991260923845193
SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)
m



LinearSVC(C=10, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
          penalty='l2', random_state=None, tol=0.0001, verbose=0)
model score: 0.786729088639201
SVC(C=5, break_ties=False, cache_size=200, class_weight=None, coef0=1,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='poly',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
model score: 0.8171285892634208
SVC(C=0.025, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=5, kernel='rbf', max_iter=-1,
    probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)
model score: 0.616167290886392
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0,

In [None]:
#SVC wins over RandomForest classiffiers
#let's do RandomizedSearchCV on SVC

### RandomizedSearchCV on RandomForestClassifier

In [None]:
from scipy.stats import expon, reciprocal

svc_pipeline = Pipeline(steps=[
    ('preprocess',preprocessing),
    ('SVC_clf',SVC())
])


param_distribs = [{
        'SVC_clf__kernel': ['linear', 'rbf'],
        'SVC_clf__C': reciprocal(20, 200000),
        'SVC_clf__gamma': expon(scale=1.0),
}]

rnd_search = RandomizedSearchCV(svc_pipeline, param_distributions=param_distribs,
                                n_iter=10, cv=10, random_state=42)
rnd_search.fit(X, Y)

In [None]:
rnd_search.best_params_

In [None]:
rnd_search.best_estimator_

In [None]:
cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)

#### Accuracy = 81.7% thats great

### Save the model

In [None]:
joblib.dump(CV.best_estimator_, "Titanic_best_model.pkl")

In [None]:
#Load RandomForest Model
svc_titanic_loaded = joblib.load("Titanic_best_model.pkl")
svc_titanic_loaded