In [1]:
# Marco Montez
# GitHub: MarcoAntonioMontez
# 2019/2020

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
from plotly.offline import iplot
import plotly.graph_objs as go
plotly.offline.init_notebook_mode()

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

### Define the problem
# Predict which people survived the Titanic voyage.

### Gather the data
# Read the training CSV and use its PassengerId column as the row index.
train_raw = pd.read_csv("Data/train.csv", index_col="PassengerId")
# display(train_raw.head())


ModuleNotFoundError: No module named 'plotly'

In [None]:
### Data Greeting
# First look at the raw frame: its (rows, columns) shape, then the per-column
# dtype / non-null summary.
print(train_raw.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would append a stray "None" line.
train_raw.info()

In [None]:
#### Data Wrangling
# All changes are made on a copy so train_raw can still be shown unmodified below.
train = train_raw.copy()


## Correcting - remove columns that are not used further
##
train = train.drop(columns=['Cabin', 'Ticket'])


## Completing missing values
##
# Age gaps get the median age; Embarked gaps get the most frequent port.
median_age = train['Age'].median()
most_frequent_port = train['Embarked'].value_counts().idxmax()
train['Age'] = train['Age'].fillna(median_age)
train['Embarked'] = train['Embarked'].fillna(most_frequent_port)


## Creating new features
##
# Label encoding (the same encoder object is re-fitted for each column)
le = LabelEncoder()
train['Sex'] = le.fit_transform(train['Sex'])

# Title is the piece of Name between the ", " separator and the first "."
name_after_comma = train['Name'].str.split(", ", expand=True)[1]
train['Title'] = name_after_comma.str.split(".", expand=True)[0]

# Titles occurring fewer than stat_min times are collapsed into 'Misc'
stat_min = 10
title_names = (train['Title'].value_counts() < stat_min)
rare_titles = title_names[title_names].index
train.loc[train['Title'].isin(rare_titles), 'Title'] = 'Misc'

train['Title'] = le.fit_transform(train['Title'])
train['Embarked'] = le.fit_transform(train['Embarked'])

# Bin encoding, converted straight to plain integers:
# Fare into 5 equal-frequency bins, Age into 5 fixed-edge bins.
train['Fare Quintile'] = pd.qcut(train['Fare'], q=5, labels=range(5)).astype(int)
train['Age Bin'] = pd.cut(
    train['Age'],
    bins=[0, 10, 20, 40, 60, 100],
    labels=range(5),
).astype(int)

# One-hot encoding of the (already label-encoded) Embarked column
oneHot = pd.get_dummies(train['Embarked'], prefix='Embarked ')
train = pd.concat([train, oneHot], axis=1)

# Aggregating: siblings/spouses plus parents/children aboard
train['Family Size'] = train['SibSp'].add(train['Parch'])


## Show the frame before and after wrangling
##
print('Raw Dataframe')
display(train_raw.head(5))
print('\nModified Dataframe')
display(train.head(5))


# # display(train.info())




In [None]:
#Pair Plot of features with respect to survived
# cols = train.columns.to_list()
# cols.remove('Survived')
# sns.pairplot(train,hue='Survived',vars=cols,plot_kws={'alpha':0.5})

In [None]:
# The Sex column was label-encoded with LabelEncoder, which assigns integer codes
# in sorted class order. Assuming the raw values are 'female' / 'male' (standard
# Titanic train.csv -- confirm against the data), that gives female -> 0 and
# male -> 1. The panels below must filter on these codes so that each title
# matches the group that is actually plotted.
FEMALE, MALE = 0, 1


def plot_survival_rate(mask, title):
    """Draw the normalized Survived value counts of the selected rows as a bar chart.

    Plots onto the current matplotlib axes.

    mask  -- boolean Series aligned with `train` selecting the passengers to include
    title -- title for the current axes
    """
    rates = train.loc[mask, 'Survived'].value_counts(normalize=True).sort_index()
    rates.plot(kind='bar', alpha=0.5)
    plt.title(title)


fig = plt.figure(figsize=(18, 6))

plt.subplot2grid((2, 2), (0, 0))
plot_survival_rate(train['Sex'] == MALE, "Man survived")

plt.subplot2grid((2, 2), (0, 1))
plot_survival_rate(train['Sex'] == FEMALE, "Woman survived")

# Bottom row: one density curve of Survived per passenger class.
plt.subplot2grid((2, 2), (1, 0), colspan=2)
for pclass in [1, 2, 3]:
    train.loc[train['Pclass'] == pclass, 'Survived'].plot(kind='kde', alpha=1)
plt.title("Class survived")
plt.legend(('1', '2', '3'))
plt.show()

fig = plt.figure(figsize=(18, 6))

plt.subplot2grid((1, 3), (0, 0))
plot_survival_rate(train['Pclass'] == 1, "Rich People survived")

plt.subplot2grid((1, 3), (0, 1))
plot_survival_rate(train['Pclass'] == 2, "Medium class people survived")

plt.subplot2grid((1, 3), (0, 2))
plot_survival_rate((train['Pclass'] == 1) & (train['Sex'] == FEMALE), "Rich Women")

plt.show()

# sns.violinplot(x="Survived", y="Pclass", data=train)

In [None]:
### Cross Validation techniques

import warnings
warnings.filterwarnings("ignore")

from statistics import mean
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit


# Separating into independent (X) and dependent (y) variables
X = train.drop(['Name','Age','SibSp','Fare','Survived'],axis=1).values
y = train.Survived.values


def _report_cv_accuracy(title, splits, features, target):
    """Fit and score a fresh LogisticRegression on every split, printing the results.

    title    -- heading printed before the per-split accuracies
    splits   -- iterable of (train_index, test_index) index-array pairs
    features -- 2-D feature array, indexed by the split indices
    target   -- 1-D label array, indexed by the split indices

    Returns the list of per-split accuracies measured on each held-out part.
    """
    print('\n' + title)
    split_acc = []
    for train_index, test_index in splits:
        model = LogisticRegression().fit(features[train_index], target[train_index])
        # Accuracy is measured on the held-out rows only.
        test_predictions = model.predict(features[test_index])
        acc = accuracy_score(target[test_index], test_predictions)
        split_acc.append(acc)
        print('Acc '+ str(acc))
    print('\nAverage Accuracy '+ str(mean(split_acc)))
    return split_acc


# Simple train test split
xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size = 0.2, random_state = 0)

clf = LogisticRegression().fit(xTrain, yTrain)
acc = accuracy_score(yTest, clf.predict(xTest))
display(acc)

# The same model evaluated under four different splitting strategies.
# The two stratified splitters need y to build their splits.
cv_scores = {}

# K-Fold
cv_scores['K-Fold'] = _report_cv_accuracy(
    'K-Fold', KFold(n_splits=5).split(X), X, y)

# Shuffle split
ss = ShuffleSplit(n_splits=5, random_state=0, test_size=0.2, train_size=None)
cv_scores['Shuffle split'] = _report_cv_accuracy(
    'Shuffle split', ss.split(X), X, y)

# Stratified K-Fold
cv_scores['Stratified K-Fold'] = _report_cv_accuracy(
    'Stratified K-Fold', StratifiedKFold(n_splits=5).split(X, y), X, y)

# Stratified Shuffle split
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
cv_scores['Stratified Shuffle split'] = _report_cv_accuracy(
    'Stratified Shuffle split', sss.split(X, y), X, y)

# display(train_predictions)


In [None]:


# from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
# from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score



# Ten stratified shuffle splits, each holding out 20% of the rows, fixed seed.
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=0)

df1 = train

# Target vector and feature matrix (same dropped columns as the target + raw inputs)
y = df1['Survived'].values
X = df1.drop(columns=['Name', 'Age', 'SibSp', 'Fare', 'Survived']).values

# Accuracy on the held-out part of every split
test_acc = []

for fit_rows, holdout_rows in sss.split(X, y):
    # A new model is created and fitted for each split.
    clf = LogisticRegressionCV()
    #clf = RandomForestClassifier(max_depth=10, random_state=0,class_weight={0:1,1:1},n_jobs=-1)
    clf.fit(X[fit_rows], y[fit_rows])

    holdout_predictions = clf.predict(X[holdout_rows])
    acc = accuracy_score(y[holdout_rows], holdout_predictions)
    print(acc)
    test_acc.append(acc)

average_accuracy = sum(test_acc) / len(test_acc)
print('\n\nAvg Accuracy: ' + str(average_accuracy))
