In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sklearn

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, file_name) for file_name in file_names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import pkg_resources
import types
def get_imports():
    """Yield the distribution name behind each module or class bound in ``globals()``.

    For a module the root package of ``__name__`` is used; for a class the root
    package of its ``__module__``. Import names that differ from the name the
    package is distributed under (``PIL``, ``sklearn``) are translated.
    Any other global (plain variables, functions, ...) is skipped: it is not an
    import, and yielding its variable name could collide with an unrelated
    installed package.

    Yields:
        str: one name per matching global; duplicates are possible.
    """
    # Import name -> name the package is published under.
    distribution_aliases = {"PIL": "Pillow", "sklearn": "scikit-learn"}

    # Iterate over a snapshot so that globals rebound while the generator is
    # suspended cannot raise "dictionary changed size during iteration".
    for val in list(globals().values()):
        if isinstance(val, types.ModuleType):
            # Split ensures you get the root package, not a sub-module.
            root = val.__name__.split(".")[0]
        elif isinstance(val, type):
            root = val.__module__.split(".")[0]
        else:
            continue

        yield distribution_aliases.get(root, root)
# De-duplicated names of the distributions this notebook's namespace refers to.
imports = list(set(get_imports()))

# (name, version) of every installed distribution the notebook uses; pip itself is left out.
requirements = [
    (dist.project_name, dist.version)
    for dist in pkg_resources.working_set
    if dist.project_name != "pip" and dist.project_name in imports
]

# Print the pins in requirements.txt syntax.
for project, version in requirements:
    print("{}=={}".format(project, version))


In [3]:
import matplotlib.pyplot as plt
import re as re
import seaborn as sns
# Apply seaborn's default theme to the figures drawn below.
sns.set()
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline 

In [4]:
# Load the Titanic training CSV and preview its first rows.
train_data = pd.read_csv("../input/titanic/train.csv")
train_data.head()

In [5]:
# Load the Titanic test CSV and preview its first rows.
test_data = pd.read_csv("../input/titanic/test.csv")
test_data.head()

In [6]:
# Helper that prints the basic exploratory summary used for every frame.
def EDA(df):
    """Print a quick exploratory summary of ``df`` to stdout.

    Sections, in order: ``info()``, ``describe(include='all')``, the column
    index, per-column missing-value counts (printed under two headings) and
    the frame's shape.

    Args:
        df: the pandas DataFrame to summarise.

    Returns:
        None. Everything is written to stdout.
    """
    print("\n_____ INFO _____")
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would add a stray "None" line after the report.
    df.info()
    print("\n_____ Describe _____")
    print(df.describe(include='all'))
    print("\n_____ Columns _____")
    print(df.columns)
    print("\n_____ Missing Values _____")
    print(df.isnull().sum())
    # isna() is an alias of isnull(), so this section repeats the counts above.
    print("\n_____ NULL values _____")
    print(df.isna().sum())
    print("\n_____ Shape Of Data _____")
    print(df.shape)

In [7]:
# Print the same exploratory summary for both splits, each under its own banner.
for banner, frame in (
    ("__________ Train Data __________", train_data),
    ("__________ Test data __________", test_data),
):
    print(banner)
    EDA(frame)

In [8]:
# Keep the test passenger ids aside: the column is dropped from the features
# further down, but it is needed again to build the submission file.
test_data_Passid = test_data['PassengerId']
test_data_Passid.head(4)

In [9]:
# Drop 'Cabin' (too many missing values) and 'Ticket' (yields no meaningful
# information) from both the train and the test data.
unused_columns = ['Ticket', 'Cabin']
train_data, test_data = (
    frame.drop(columns=unused_columns) for frame in (train_data, test_data)
)
# Both frames in one list so later feature steps can loop over them.
combine_data = [train_data, test_data]

In [10]:
# Mean survival rate per passenger class (train data only).
survival_by_pclass = train_data.groupby('Pclass', as_index=False)['Survived'].mean()
print(survival_by_pclass)

In [11]:
# Mean survival rate per sex (train data only).
survival_by_sex = train_data.groupby('Sex', as_index=False)['Survived'].mean()
print(survival_by_sex)

In [12]:
# New feature 'Familysize': siblings/spouses ('SibSp') plus parents/children
# ('Parch') plus the passenger themselves.
for frame in combine_data:
    frame['Familysize'] = frame['SibSp'].add(frame['Parch']).add(1)

survival_by_familysize = train_data.groupby('Familysize', as_index=False)['Survived'].mean()
print(survival_by_familysize)


In [13]:
# New flag 'IsAlone': 1 when the passenger travels without family
# (Familysize == 1), otherwise 0.
for frame in combine_data:
    frame['IsAlone'] = frame['Familysize'].eq(1).astype('int64')

survival_by_isalone = train_data.groupby('IsAlone', as_index=False)['Survived'].mean()
print(survival_by_isalone)


In [14]:
# Handle missing values of the 'Age' variable in both the train and the test dataset.

# Seeded generator: the imputed ages (and every result derived from them) are
# then identical on each "Restart & Run All".
age_rng = np.random.default_rng(42)

for data in combine_data:
    age_avg = data['Age'].mean()
    age_std = data['Age'].std()
    age_missing = data['Age'].isnull()

    # One random integer age per gap, drawn from [mean - std, mean + std);
    # the bounds are truncated to integers and the upper one is exclusive.
    age_null_random_list = age_rng.integers(
        int(age_avg - age_std), int(age_avg + age_std), size=int(age_missing.sum())
    )
    # Write through .loc on the frame itself: the chained form
    # data['Age'][mask] = ... may assign into a temporary copy and is a no-op
    # under pandas copy-on-write.
    data.loc[age_missing, 'Age'] = age_null_random_list
    data['Age'] = data['Age'].astype('int32')

# Five equal-width age bands, computed once on the train frame only
# (it is the only frame used in the survival summary below).
train_data['CategoricalAge'] = pd.cut(train_data['Age'], 5)

print (train_data[['CategoricalAge', 'Survived']].groupby(['CategoricalAge'], as_index=False).mean())

In [15]:
# The test data has one missing Fare value; fill it with the mean test fare.

avg_Fare_test = test_data['Fare'].astype('float64').mean(axis=0)
# Assign the filled column back to the frame: an in-place replace on
# test_data['Fare'] acts on a temporary Series and is not guaranteed to reach
# the frame (it never does under pandas copy-on-write).
test_data['Fare'] = test_data['Fare'].fillna(avg_Fare_test)

# Five equal-frequency fare bands on the train frame, used only for the
# survival summary below. No loop over combine_data is needed for this.
train_data['CategoricalFare'] = pd.qcut(train_data['Fare'], 5)
print (train_data[['CategoricalFare', 'Survived']].groupby(['CategoricalFare'], as_index=False).mean())
    


In [16]:
# 'Embarked' has a couple of missing values in the train dataset; fill them
# (in both frames) with the port that occurs most often in the train data.
embarked_counts = train_data['Embarked'].value_counts()
top_port = embarked_counts.idxmax()

for frame in combine_data:
    embarked = frame['Embarked']
    frame['Embarked'] = embarked.where(embarked.notna(), top_port)

survival_by_port = train_data.groupby('Embarked', as_index=False)['Survived'].mean()
print (survival_by_port)


In [17]:
# Replace each age by the ordinal code (0-4) of its 16-year band.
for frame in combine_data:
    age = frame['Age'].copy()  # untouched ages; the column itself is overwritten below
    age_band_masks = [
        age.le(16),
        age.gt(16) & age.le(32),
        age.gt(32) & age.le(48),
        age.gt(48) & age.le(64),
        age.gt(64),
    ]
    for band_code, in_band in enumerate(age_band_masks):
        frame.loc[in_band, 'Age'] = band_code


In [18]:
# Replace each fare by the ordinal code (0-3) of its band, then store it as int.
for frame in combine_data:
    fare = frame['Fare'].copy()  # untouched fares; the column itself is overwritten below
    fare_band_masks = [
        fare.le(7.91),
        fare.gt(7.91) & fare.le(14.454),
        fare.gt(14.454) & fare.le(31),
        fare.gt(31),
    ]
    for band_code, in_band in enumerate(fare_band_masks):
        frame.loc[in_band, 'Fare'] = band_code
    frame['Fare'] = frame['Fare'].astype(int)

In [19]:
# Display the raw 'Name' column; the next cell extracts a title from each entry.
train_data['Name']

In [20]:
# Extract the titles (Mr, Mrs, ...) from the passenger names.

def get_title(name):
    """Return the title embedded in a passenger name.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period, e.g. ``"Braund, Mr. Owen"`` -> ``"Mr"``.

    Args:
        name: the passenger name; anything that is not a ``str`` (such as a
            NaN from a missing value) is treated as having no title.

    Returns:
        str: the title without the trailing period, or ``""`` if none is found.
    """
    if not isinstance(name, str):
        return ""
    # Raw string: "\." in a normal literal is an invalid escape sequence.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

# Derive a 'Title' column from 'Name' in both frames.
for frame in combine_data:
    frame['Title'] = frame['Name'].map(get_title)

# Cross-tabulate the extracted titles against sex for the train data.
title_by_sex = pd.crosstab(index=train_data['Title'], columns=train_data['Sex'])
print(title_by_sex)

In [21]:
# Collapse the uncommon titles into 'Rare' and fold the alternative spellings
# into their common form, using a single replacement table.
title_merges = {
    uncommon: 'Rare'
    for uncommon in ('Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
                     'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona')
}
title_merges.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

for frame in combine_data:
    frame['Title'] = frame['Title'].replace(title_merges)

survival_by_title = train_data.groupby('Title', as_index=False)['Survived'].mean()
print (survival_by_title)

In [22]:
# Encode the titles as integers; any title missing from the table becomes 0.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
for frame in combine_data:
    frame['Title'] = frame['Title'].map(title_mapping).fillna(0)

In [23]:
# Count the missing values left in each column of the train frame.
train_data.isnull().sum()

In [24]:
# Count the missing values left in each column of the test frame.
test_data.isnull().sum()

In [25]:
# Preview the train frame with the engineered columns added above.
train_data.head()

In [26]:
# Preview the test frame with the engineered columns added above.
test_data.head()

In [27]:
# Clean the datasets: remove identifier, free-text and superseded columns from
# both frames, plus the two exploratory band columns that exist only on train.
drop_cols = ['PassengerId','Name','SibSp','Parch','Familysize']
train_only_cols = ['CategoricalAge','CategoricalFare']

train_data = train_data.drop(columns=drop_cols + train_only_cols)
test_data = test_data.drop(columns=drop_cols)


In [28]:
# Preview the train frame after the column cleanup.
train_data.head()

In [29]:
# Preview the test frame after the column cleanup.
test_data.head()

In [30]:
# Treat 'Pclass' as categorical by storing it as object dtype in both frames,
# then collect every object-typed column of the train frame.
for frame in (train_data, test_data):
    frame["Pclass"] = frame["Pclass"].astype("object")

object_cols = [
    column for column, column_dtype in train_data.dtypes.items()
    if column_dtype == "object"
]
print("List of catgorical variables:")
print(object_cols)

In [31]:
# Apply one-hot encoding to the object-typed variables.

from sklearn.preprocessing import OneHotEncoder

# The dense-output switch was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old spelling was removed in 1.4. Try the current
# keyword first and fall back for older installations.
encoder_options = dict(handle_unknown='ignore', dtype='int')
try:
    OH_encoder = OneHotEncoder(sparse_output=False, **encoder_options)
except TypeError:
    OH_encoder = OneHotEncoder(sparse=False, **encoder_options)

# Fit on train only and reuse the fitted categories for test; keep the original
# row index so the encoded columns line up in the concat below.
train_data_OH_cols = pd.DataFrame(
    OH_encoder.fit_transform(train_data[object_cols]), index=train_data.index
)
test_data_OH_cols = pd.DataFrame(
    OH_encoder.transform(test_data[object_cols]), index=test_data.index
)

# Remaining (non-object) columns of each frame.
num_train_data = train_data.drop(object_cols,axis=1)
num_test_data = test_data.drop(object_cols,axis=1)

# Final frames: the non-object columns followed by the one-hot columns.
OH_train_data = pd.concat([num_train_data, train_data_OH_cols], axis=1)
OH_test_data = pd.concat([num_test_data, test_data_OH_cols], axis=1)

In [32]:
# Preview the one-hot encoded train frame.
OH_train_data.head()

In [33]:
# Preview the one-hot encoded test frame.
OH_test_data.head()

In [34]:
OH_train_data.columns = ['Survived', 'Age', 'Fare', 'IsAlone','Title', 'Pclass 1', 'Pclass 2', 'Pclass 3', 'Male', 'Female', 'E_C','E_Q','E_S']
OH_train_data.head()


In [35]:
OH_test_data.columns = ['Age', 'Fare', 'IsAlone','Title', 'Pclass 1', 'Pclass 2', 'Pclass 3', 'Male', 'Female', 'E_C','E_Q', 'E_S']
OH_test_data

In [36]:
# Feature matrix: every column of the encoded train frame except the first.
feature_columns = OH_train_data.columns[1:]
X = OH_train_data[feature_columns]
X.head()

In [37]:
# Target vector: the first column of the encoded train frame.
target_column = OH_train_data.columns[0]
y = OH_train_data[target_column]
y.head()

In [38]:
# Sanity check: X and y must have the same number of rows.
X.shape, y.shape

### Classifier comparison and Model selection

In [39]:
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression


# One seed for the fold assignment and every stochastic learner, so the
# ranking is the same on each run.
cv_seed = 42

# Candidate models, compared by mean accuracy over stratified 10-fold CV.
classifiers = [
    KNeighborsClassifier(10),
    SVC(probability=True, random_state=cv_seed),
    DecisionTreeClassifier(criterion='entropy', random_state=cv_seed),
    RandomForestClassifier(criterion='entropy', random_state=cv_seed),
    AdaBoostClassifier(random_state=cv_seed),
    GradientBoostingClassifier(random_state=cv_seed),
    GaussianNB(),
    LogisticRegression()]

log_cols = ["Classifier", "Accuracy"]

# Running sum of the per-fold accuracies, keyed by classifier class name.
acc_dict = {}

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=cv_seed)

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    for clf in classifiers:
        name = clf.__class__.__name__
        clf.fit(X_train, y_train)
        acc_score = accuracy_score(y_test, clf.predict(X_test))
        acc_dict[name] = acc_dict.get(name, 0.0) + acc_score

# Turn the sums into means. Divide by the real number of folds: a hard-coded
# 9.0 with 10 folds overstates every accuracy by a factor of 10/9.
n_folds = skf.get_n_splits()
acc_dict = {name: total / n_folds for name, total in acc_dict.items()}

# Build the results table in one step (DataFrame.append no longer exists in
# pandas 2.x, and growing a frame row by row is quadratic anyway).
log = pd.DataFrame(list(acc_dict.items()), columns=log_cols)

print(log.sort_values(by=['Accuracy'], ascending=False))

sns.set_color_codes("muted")
ax = sns.barplot(x='Accuracy', y='Classifier', data=log, color="b")
# Label the axes object the bars were drawn on.
ax.set(xlabel='Accuracy', title='Classifier Accuracy');
            

In [40]:
from sklearn.model_selection import GridSearchCV

# Estimator to tune. probability=True is left out here: the search scores on
# accuracy (hard predictions), and probability calibration only adds an
# internal cross-validation to every single fit.
est = SVC()

# One sub-grid per kernel family. `degree` is only read by the polynomial
# kernel, so crossing it with the other kernels would refit identical models
# three times each.
C_values = [.01, .1, .5, 1.0]
p_dist = [
    {'kernel': ['linear', 'rbf', 'sigmoid'], 'C': C_values},
    {'kernel': ['poly'], 'C': C_values, 'degree': [3, 4, 5]},
]
                         

def hp_tune(est, p_dist, X, y):
    """Grid-search ``p_dist`` for ``est`` and report the best configuration.

    The search uses 4-fold cross-validation, accuracy as the score and all
    available cores (``n_jobs=-1``).

    Args:
        est: the estimator to tune.
        p_dist: parameter grid, passed to ``GridSearchCV`` as ``param_grid``.
        X: feature matrix to fit on.
        y: target values to fit on.

    Returns:
        tuple: ``(best_params_, best_score_)`` of the fitted search.
    """
    search = GridSearchCV(
        estimator=est,
        param_grid=p_dist,
        cv=4,
        scoring='accuracy',
        n_jobs=-1,
    )
    search.fit(X, y)
    return search.best_params_, search.best_score_

# Run the search on the full training set and print the winning parameters
# followed by their mean cross-validated accuracy.
svc_search_result = hp_tune(est, p_dist, X, y)
svc_params, svc_score = svc_search_result
print(svc_params, svc_score, sep="\n")
    

In [41]:
# Final model settings, written out by hand.
# NOTE(review): presumably these are the grid-search winners — confirm against svc_params.
final_svc_settings = dict(C=0.5, kernel='poly', degree=3, probability=True, random_state=42)
svc_clf = SVC(**final_svc_settings)

In [42]:
# Train the final classifier on the complete training set.
svc_clf.fit(X, y)

In [43]:
# Save the fitted model to disk.
import pickle

# The context manager closes the file even if pickling fails; a bare open()
# passed straight to dump() leaves the handle to the garbage collector.
with open("svc_pkl.pkl", "wb") as model_file:
    pickle.dump(svc_clf, model_file)

In [44]:
# Predict the target for every row of the encoded test frame.
test_pred = svc_clf.predict(OH_test_data)

In [45]:
# Pair the passenger ids saved earlier with the predictions and write the
# submission file (no index column).
submission_columns = {
    'PassengerId': test_data_Passid,
    'Survived': test_pred,
}
Titanic_pred = pd.DataFrame(submission_columns)

Titanic_pred.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")