In [None]:
import pandas as pd
import numpy as np
import random as rd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the training and test sets from CSV files located in the current
# working directory. Both frames are modified by later cells.
train_df=pd.read_csv('train.csv')
test_df=pd.read_csv('test.csv')

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit an extra "None" line).
train_df.info()
print('\n------\n')
test_df.info()

In [None]:
# Percentage of missing values per training column: nulls divided by the
# number of rows, times 100.
null_mask = train_df.isnull()
percent = null_mask.sum() / null_mask.count() * 100
print(percent)

In [None]:
# Age histograms of survivors vs. non-survivors, one panel per sex.
women = train_df[train_df['Sex']=='female']
men = train_df[train_df['Sex']=='male']

msurv = men[men['Survived']==1]
mnsurv = men[men['Survived']==0]
fsurv = women[women['Survived']==1]
fnsurv = women[women['Survived']==0]

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10,5))

# Left panel: men, right panel: women. The right panel previously re-plotted
# the men's data, leaving fsurv/fnsurv unused.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# consider sns.histplot once the installed seaborn version is confirmed.
panels = [
    (axes[0], 'Men', msurv, mnsurv),
    (axes[1], 'Women', fsurv, fnsurv),
]
for ax, who, survived, not_survived in panels:
    # Age has missing values; drop them so the histogram gets only real ages.
    sns.distplot(survived.Age.dropna(), kde=False, label=who + ' survived', ax=ax)
    sns.distplot(not_survived.Age.dropna(), kde=False, label=who + ' not survived', ax=ax)
    ax.set_title(who)
    ax.legend()

In [None]:
# Mean of 'Survived' per passenger class.
sns.barplot(data=train_df, x='Pclass', y='Survived')

# Age histograms in a grid: one row per class, one column per outcome.
grid = sns.FacetGrid(train_df, row='Pclass', col='Survived')
grid.map(plt.hist, 'Age')

In [None]:
# Family-size features, added to both frames:
#   relatives = SibSp + Parch
#   not_alone = 1 when relatives is non-zero, else 0
for frame in (train_df, test_df):
    frame['relatives'] = frame['SibSp'] + frame['Parch']
    frame['not_alone'] = frame.relatives.astype(bool).astype(int)

# Class balance of the new flag, training frame first.
for frame in (train_df, test_df):
    print(frame['not_alone'].value_counts())

In [None]:
# Impute missing ages into a new integer column 'Age1' on both frames.
# Missing values are replaced by random integers drawn from
# [mean - std, mean + std) of the *training* ages, so the test frame is filled
# using training statistics only.
age_rng = np.random.RandomState(42)  # fixed seed: Restart & Run All gives the same ages

age_mean = train_df['Age'].mean()
age_std = train_df['Age'].std()
# randint expects integer bounds; truncate explicitly instead of passing floats.
age_low = int(age_mean - age_std)
age_high = int(age_mean + age_std)

for frame in (train_df, test_df):
    age_filled = frame['Age'].copy()
    is_missing = age_filled.isnull()
    age_filled[is_missing] = age_rng.randint(age_low, age_high, size=is_missing.sum())
    frame['Age1'] = age_filled.astype(int)

# Summarise both imputed columns side by side. Previously only the last bare
# expression was displayed and the first one was silently discarded.
pd.DataFrame({
    'train': train_df['Age1'].describe(),
    'test': test_df['Age1'].describe(),
})

In [None]:
# Not the last expression of the cell, so the notebook does not display this
# summary; kept to preserve the original statement sequence.
train_df['Embarked'].describe()

# Replace missing embarkation ports with 'S' in both frames.
# NOTE(review): presumably 'S' is the most frequent port -- confirm against
# the describe() summary above.
for frame in (train_df, test_df):
    frame['Embarked'] = frame['Embarked'].fillna('S')

In [None]:
# Re-inspect column dtypes and non-null counts of the training frame.
train_df.info()

In [None]:
# Replace missing fares with 0 in both the training and the test frame.
for frame in (train_df, test_df):
    frame['Fare'] = frame['Fare'].fillna(0)

In [None]:
# Numeric encoding of 'Sex' stored in a new column 'Sex1' (male -> 0, female -> 1).
genders = {'male':0, 'female':1}
for frame in (train_df, test_df):
    frame['Sex1'] = frame['Sex'].map(genders)

In [None]:
# Numeric encoding of 'Embarked' stored in a new column 'Embarked1'
# (S -> 0, C -> 1, Q -> 2).
scq = {'S':0, 'C':1, 'Q':2}
for frame in (train_df, test_df):
    frame['Embarked1'] = frame['Embarked'].map(scq)

In [None]:
# Re-inspect column dtypes and non-null counts of the training frame.
train_df.info()

In [None]:
# Replace each value of 'Age1' by an ordinal bin code, in place:
#   <=11 -> 0, (11,20] -> 1, (20,30] -> 2, ... , (70,80] -> 7
# Values above 80 are left untouched.
#
# WARNING: this cell overwrites 'Age1' with codes that are themselves <= 11,
# so running it a second time maps every already-binned row to 0.
# Run it exactly once after the imputation cell.
age_upper_edges = (20, 30, 40, 50, 60, 70, 80)
for frame in (train_df, test_df):
    frame.loc[frame['Age1'] <= 11, 'Age1'] = 0
    lower = 11
    for code, upper in enumerate(age_upper_edges, start=1):
        # The mask is rebuilt from the current column on every step.
        in_bin = (frame['Age1'] > lower) & (frame['Age1'] <= upper)
        frame.loc[in_bin, 'Age1'] = code
        lower = upper

In [None]:
# Re-inspect column dtypes and non-null counts of the training frame.
train_df.info()

In [None]:
# Distribution of training fares: histogram with 36 bins plus a KDE curve.
fare = train_df['Fare']
sns.distplot(fare, hist=True, kde=True, bins=36, color='blue')

# Fares from highest to lowest (displayed as the cell output).
fare.sort_values(ascending=False)

In [None]:
# Ordinal fare bins written to a new integer column 'Fare1' on both frames:
#   <=7.91 -> 0, (7.91,14.454] -> 1, (14.454,31] -> 2,
#   (31,99] -> 3, (99,250] -> 4, >250 -> 5
fare_edges = [7.91, 14.454, 31, 99, 250]
for frame in (train_df, test_df):
    fare = frame['Fare']
    frame.loc[fare <= fare_edges[0], 'Fare1'] = 0
    for code, (low, high) in enumerate(zip(fare_edges[:-1], fare_edges[1:]), start=1):
        frame.loc[(fare > low) & (fare <= high), 'Fare1'] = code
    frame.loc[fare > fare_edges[-1], 'Fare1'] = len(fare_edges)
    frame['Fare1'] = frame['Fare1'].astype(int)

# Number of training rows per fare bin.
train_df['Fare1'].value_counts()

In [None]:
# Model inputs: the same engineered feature columns are selected from the
# training and the test frame; the target is the 'Survived' column.
feature_columns = [
    'Pclass', 'Sex1', 'Age1', 'Embarked1', 'not_alone',
    'relatives', 'Fare1', 'SibSp', 'Parch',
]
X_train = pd.DataFrame(train_df, columns=feature_columns)
Y_train = pd.DataFrame(train_df, columns=['Survived'])
X_test = pd.DataFrame(test_df, columns=feature_columns)

In [None]:
# Inspect dtypes and non-null counts of the test feature matrix.
X_test.info()

In [None]:
# Linear classifier trained with stochastic gradient descent (5 iterations).
# random_state is fixed so the printed accuracy is reproducible across runs.
sgd = SGDClassifier(max_iter=5, tol=None, random_state=42)

# Y_train is a single-column DataFrame; flatten it to 1-D so scikit-learn
# does not emit a DataConversionWarning about a column-vector y.
y_train_1d = np.ravel(Y_train)
sgd.fit(X_train, y_train_1d)
Y_pred = sgd.predict(X_test)

# Accuracy on the training data itself (not a held-out estimate); computed once.
train_accuracy = sgd.score(X_train, y_train_1d)
print(train_accuracy)
acc_sgd = round(train_accuracy * 100, 2)
print(acc_sgd)

In [None]:
# Logistic regression baseline.
# Y_train is a single-column DataFrame; flatten it to 1-D so scikit-learn
# does not emit a DataConversionWarning about a column-vector y.
y_train_1d = np.ravel(Y_train)
logreg = LogisticRegression().fit(X_train, y_train_1d)
Y_pred = logreg.predict(X_test)

# Export predictions. Other cells in this notebook write to the same
# 'submit.csv' path, so the file reflects whichever export cell ran last.
test_df['Survived'] = Y_pred
pd.DataFrame(test_df, columns=['PassengerId', 'Survived']).to_csv('submit.csv', index=False)

# Accuracy on the training data itself (not a held-out estimate); computed once.
train_accuracy = logreg.score(X_train, y_train_1d)
print(train_accuracy)
acc_logreg = round(train_accuracy * 100, 2)
print(acc_logreg)

In [None]:
# SGD classifier with hinge loss, 10 iterations. Rebinds `sgd` and `acc_sgd`.
# random_state is fixed so the printed accuracy is reproducible across runs.
sgd = SGDClassifier(max_iter=10, tol=None, loss='hinge', random_state=42)

# Y_train is a single-column DataFrame; flatten it to 1-D so scikit-learn
# does not emit a DataConversionWarning about a column-vector y.
y_train_1d = np.ravel(Y_train)
sgd.fit(X_train, y_train_1d)
Y_pred = sgd.predict(X_test)

# Accuracy on the training data itself (not a held-out estimate); computed once.
train_accuracy = sgd.score(X_train, y_train_1d)
print(train_accuracy)
acc_sgd = round(train_accuracy * 100, 2)
print(acc_sgd)

In [None]:
# Linear support-vector classifier.
# random_state is fixed so repeated runs give the same model.
# Y_train is a single-column DataFrame; flatten it to 1-D so scikit-learn
# does not emit a DataConversionWarning about a column-vector y.
y_train_1d = np.ravel(Y_train)
lsvm = LinearSVC(random_state=42).fit(X_train, y_train_1d)
Y_pred = lsvm.predict(X_test)

# Export predictions. Other cells in this notebook write to the same
# 'submit.csv' path, so the file reflects whichever export cell ran last.
test_df['Survived'] = Y_pred
pd.DataFrame(test_df, columns=['PassengerId', 'Survived']).to_csv('submit.csv', index=False)

# Accuracy on the training data itself (not a held-out estimate); computed once.
train_accuracy = lsvm.score(X_train, y_train_1d)
print(train_accuracy)
acc_lsvm = round(train_accuracy * 100, 2)
print(acc_lsvm)

In [None]:
# Re-export the predictions currently held in Y_pred, i.e. those of whichever
# model cell was executed most recently.
submission_columns = ['PassengerId', 'Survived']
test_df['Survived'] = Y_pred
submission = pd.DataFrame(test_df, columns=submission_columns)
submission.to_csv('submit.csv', index=False)

In [None]:
# Random forest with 100 trees.
# random_state is fixed so the forest (and the submission file) is
# reproducible across runs.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)

# Y_train is a single-column DataFrame; flatten it to 1-D so scikit-learn
# does not emit a DataConversionWarning about a column-vector y.
y_train_1d = np.ravel(Y_train)
rfc.fit(X_train, y_train_1d)
Y_pred = rfc.predict(X_test)

# Export predictions. Other cells in this notebook write to the same
# 'submit.csv' path, so the file reflects whichever export cell ran last.
test_df['Survived'] = Y_pred
pd.DataFrame(test_df, columns=['PassengerId', 'Survived']).to_csv('submit.csv', index=False)

# Accuracy on the training data itself (not a held-out estimate); computed once.
train_accuracy = rfc.score(X_train, y_train_1d)
print(train_accuracy)
acc_rfc = round(train_accuracy * 100, 2)
print(acc_rfc)