In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
from scipy import stats # for outlier finding
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # data visalisation
import seaborn as sns # ata visalisation
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn import svm



# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the Kaggle input mount,
# then print them one per line.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## A little data investigation:
* Finding outliers
* Finding missing values
* Finding correlated data


In [2]:
# Read the train and test splits (paths are relative to the notebook's
# working directory) and display the training frame.
train_data, test_data = map(
    pd.read_csv,
    ('../input/titanic/train.csv', '../input/titanic/test.csv'),
)
train_data

There are 891 records and 12 columns.
Next we look at summary statistics, the type of each column, and the percentage of missing values per column.

In [3]:
# Summary statistics for the training frame, rendered as the cell output.
train_data.describe()


In [4]:
# Data type of every column in the training frame.
train_data.dtypes

In [5]:
# Per-column share of missing cells, in percent. `isnull` is pandas' alias
# for `isna`, so the two tables are expected to be identical.
train_row_count = len(train_data)
nan_percent = train_data.isna().sum() / train_row_count * 100
null_percent = train_data.isnull().sum() / train_row_count * 100

print('Percentage of NaN cells:')
print(nan_percent)
print('---')
print('Percentage of Null cells:')
print(null_percent)

* As we can see there are lots of missing values for Cabin column. 
* So we would simply drop it, as filling it with mean is not a logical approach!
* We can fill missing age values with mean of the column and fill lost embarked values with mode of the column (Since this column is categorical).
* PassengerId and Name are also not useful for predicting whether a passenger survived, so we drop them too.


In [6]:
# Cabin has too many missing values to impute sensibly, so drop the column.
train_data = train_data.drop(columns=['Cabin'])

# Numeric column: replace missing ages with the column mean.
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].mean())

# Categorical column: replace missing ports with the most frequent one.
# `Series.mode()` returns a Series (indexed 0, 1, ...) and `fillna` aligns a
# Series argument on index, so passing it directly would leave the actual
# missing rows untouched. Take the first modal value as a scalar instead.
most_common_port = train_data['Embarked'].mode().iloc[0]
train_data['Embarked'] = train_data['Embarked'].fillna(most_common_port)

# Identifier-like columns are not used as features.
train_data = train_data.drop(columns=['PassengerId', 'Name'])
train_data

* In this stage we apply some data transformations. We use one-hot encoding for Sex and Embarked, since they are categorical.
* We also need to transform Ticket, since its values are a combination of characters and digits. We just keep the numerical part of the Ticket. The other columns are fine as they are at this stage.

In [7]:
# Keep only the numeric part of each ticket: take the last whitespace-separated
# token and use it when it consists solely of digits, otherwise fall back to 0.
# Done column-wise instead of with a row-by-row `iterrows` loop that wrote
# integers into an object column one cell at a time.
ticket_last_token = train_data['Ticket'].astype(str).str.split().str[-1]
train_data['Ticket'] = (
    ticket_last_token
    .where(ticket_last_token.str.isdigit(), '0')
    .astype('int')
)

In [8]:
# train_data['Ticket'] = train_data['Ticket'].map(lambda x: (str(x).split()[-1])).replace(',', '').replace('\n', '').astype(int)
# # train_data['Ticket'] = train_data['Ticket'].astype('string')
# # train_data['Ticket'] = a.astype('string')

In [9]:
# def int_taker(arg):
#     try:
#         return int(float(arg.split()[1]))
#     except:
#         return int(float(arg.split()[0]))

# # train_data['Ticket'] = train_data['Ticket'].map(lambda x: int_taker(str(x)))

# One-hot encode the categorical columns; each becomes a set of indicator
# columns named `<column>_<value>` (e.g. `Sex_male`, `Embarked_C`).
train_data = pd.get_dummies(train_data, columns=['Sex','Embarked'])

Now we would try to find outliers based on z-score for each record.

In [10]:


# First we separate 'Survived' from the features, since it is our target,
# and keep it in the variable `target`.
target = train_data['Survived']
train_data = train_data.drop(columns=['Survived'])

# Absolute z-score of every numeric feature column, keyed by column name.
columns = train_data.select_dtypes(include=np.number).columns
z_score = {
    column_name: np.abs(stats.zscore(train_data[column_name]))
    for column_name in columns
}

z_score_df = pd.DataFrame(z_score, columns=columns)
z_score_df

In [11]:
# One KDE plot per feature of the absolute z-scores, with a dotted vertical
# guide at the 5th, 25th, 50th, 75th and 95th percentiles.
# Each entry is (quantile level, line alpha, line colour).
quantile_guides = [
    (0.05, 0.6, 'r'),
    (0.25, 0.8, 'g'),
    (0.5, 1, 'b'),
    (0.75, 0.8, 'm'),
    (0.95, 0.6, 'k'),
]

for feature in z_score_df.columns:
    fig, ax = plt.subplots()
    feature_scores = z_score_df[feature]
    feature_scores.plot(kind='kde', ax=ax)
    ax.set_title(feature)

    for level, opacity, colour in quantile_guides:
        ax.axvline(feature_scores.quantile(level), alpha=opacity, linestyle=":", color=colour)
    ax.set_xlim(xmin=-1)

* As we can see, all values are in a reasonable range of z-scores, so we can assume that there are no outliers in our dataset.
* Now we check whether there is any unusual correlation between any pair of columns in our dataset.


In [12]:
# Pairwise Pearson correlation between the feature columns, drawn as a heatmap
# on a 10x10 inch figure.
cor_mat = train_data.corr(method='pearson')
fig = plt.figure(figsize=(10, 10))
sns.heatmap(cor_mat, ax=fig.gca())

* As we expected, there is a high negative correlation between Sex_male and Sex_female, so we should keep just one of them.
* Likewise, we keep only two of the three Embarked columns, since the third is fully determined by the other two.

In [13]:
# Drop one indicator column from each one-hot group; the remaining columns
# already determine it.
train_data = train_data.drop(columns=['Embarked_C', 'Sex_male'])
train_data

Now that data is cleaned and checked, we can use our models to make predictions.

## Classification metrics function

* we will make a function for our models to examine how it is performing.

In [14]:
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score


def metrics(model, xtest, ytest, beta=2.0):
    """Print a set of binary-classification metrics for a fitted model.

    The original version never used ``model`` and passed the feature matrix
    as ``y_true`` and the labels as ``y_pred``; it also called ``fbeta_score``
    without its required ``beta`` argument. Predictions are now derived from
    ``model`` and compared against ``ytest``.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict``. If it also exposes ``predict_proba``, the
        probabilities are used for the log loss and the AUC.
    xtest : array-like
        Features to evaluate on.
    ytest : array-like
        True labels matching the rows of ``xtest``.
    beta : float, default 2.0
        Weight of recall relative to precision in the F-beta score.
        NOTE(review): the original call did not specify a value; 2.0 is a
        reviewer-chosen default, confirm the intended weighting.

    Returns
    -------
    None
        The scores are printed, not returned.
    """
    y_pred = model.predict(xtest)

    # Log loss and AUC are defined on scores/probabilities rather than hard
    # labels. Assumes a binary problem whose positive class is the second
    # column of `predict_proba` - TODO confirm for non-binary targets.
    if hasattr(model, 'predict_proba'):
        y_score = model.predict_proba(xtest)[:, 1]
    else:
        # Fall back to hard labels when no probabilities are available.
        y_score = y_pred

    print('F1 Score:', f1_score(ytest, y_pred))
    print('Weighted F1 Score:', fbeta_score(ytest, y_pred, beta=beta))
    print('Log Loss:', log_loss(ytest, y_score))
    print('AUC Score:', roc_auc_score(ytest, y_score))
    print('Recall Score:', recall_score(ytest, y_pred))
    print('Precision Score:', precision_score(ytest, y_pred))
    print('Accuracy Score:', accuracy_score(ytest, y_pred))
    
    

## Logistic Regression
* Train the model based on a logistic regression approach

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for validation; the fixed seed keeps the split
# reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    train_data,
    target,
    test_size=0.3,
    random_state=42,
)

# Baseline: logistic regression on the raw (unscaled) features; the cell
# output is its accuracy on the validation split.
model1 = LogisticRegression(random_state=0)
model1.fit(X_train, y_train)
model1.score(X_val, y_val)

* Not an acceptable result! We try to make some adjustments to see if we can increase the accuracy.
* First we check for feature scaling:

In [16]:
from sklearn import preprocessing
# Standardise the features using statistics learned from the training split
# only, then apply the same transformation to the validation split.
scaler = preprocessing.StandardScaler()
X_scaled_train = scaler.fit_transform(X_train)
X_scaled_val = scaler.transform(X_val)

# Same model as before, now trained on the scaled features.
model2 = LogisticRegression(random_state=0)
model2.fit(X_scaled_train, y_train)
model2.score(X_scaled_val, y_val)

* A very satisfying increase in our accuracy. Now we check whether our data is imbalanced. If it is, we will use class weights to tackle this situation.

In [37]:
# Count the positive labels and the total rows in each split to judge class
# balance.
train_positive_count, train_total = sum(y_train), len(y_train)
val_positive_count, val_total = sum(y_val), len(y_val)

print('Number of 1s in train set target:', train_positive_count)
print('total train number:', train_total)
print('Number of 1s in validation set target:', val_positive_count)
print('total validation number:', val_total)

* The data is slightly imbalanced, so we would use weights for our model

In [17]:
# Re-fit on the scaled features with class weights set to "balanced", and
# report validation accuracy.
model3 = LogisticRegression(class_weight="balanced", random_state=0)
model3.fit(X_scaled_train, y_train)
model3.score(X_scaled_val, y_val)

* So this effort was not a good idea. Let's check the other parameters that sklearn offers for logistic regression:

In [28]:
def _fit_and_report_logreg(label, **logreg_params):
    """Fit a LogisticRegression variant on the scaled training split and print its validation accuracy.

    Parameters
    ----------
    label : str
        Line printed before the score.
    **logreg_params
        Extra keyword arguments forwarded to ``LogisticRegression`` on top of
        ``random_state=0``.

    Returns
    -------
    The fitted estimator.
    """
    fitted = LogisticRegression(random_state=0, **logreg_params).fit(X_scaled_train, y_train)
    print(label)
    print(fitted.score(X_scaled_val, y_val), '\n')
    return fitted


# `penalty=None` replaces the string 'none', which scikit-learn deprecated in
# favour of None.
model4 = _fit_and_report_logreg('Model with no penalty term:', penalty=None)
model5 = _fit_and_report_logreg('Model with no interception fit:', fit_intercept=False)
model6 = _fit_and_report_logreg('Model with newton-cg solver', solver='newton-cg')
model7 = _fit_and_report_logreg('Model with liblinear solver', solver='liblinear')
model8 = _fit_and_report_logreg('Model with sag solver', solver='sag')
model9 = _fit_and_report_logreg('Model with saga solver', solver='saga')
model10 = _fit_and_report_logreg('Model with 200 iterations:', max_iter=200)
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases -
# confirm against the installed version.
model11 = _fit_and_report_logreg('Model with binary classification:', multi_class='ovr')


* So that was all we could do with logistic regression, as the extra parameters didn't help us at all. So we will use model2.
* Let's see the model's performance on the test dataset. First we apply the same preprocessing to the test data (removing Name, transforming Ticket, ...).

In [30]:
# Per-column share of missing cells in the test frame, in percent. `isnull`
# is pandas' alias for `isna`, so the two tables are expected to be identical.
test_row_count = len(test_data)
test_nan_percent = test_data.isna().sum() / test_row_count * 100
test_null_percent = test_data.isnull().sum() / test_row_count * 100

print('Percentage of NaN cells:')
print(test_nan_percent)
print('---')
print('Percentage of Null cells:')
print(test_null_percent)

In [43]:
# Re-read the raw test split so this cell can be re-run on its own.
test_data = pd.read_csv('../input/titanic/test.csv')

# Keep the passenger ids (1-D array) for the submission file before the
# column is dropped.
p_id = test_data['PassengerId'].to_numpy()

test_data = test_data.drop(columns=['Cabin'])

# NOTE(review): missing values are imputed from the test split's own
# statistics here; consider reusing the values computed on the training
# split so both splits are transformed identically.
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].mean())
test_data['Fare'] = test_data['Fare'].fillna(test_data['Fare'].mean())
# `mode()` returns a Series and `fillna` aligns a Series on index, so the
# scalar modal value must be extracted for the fill to reach the missing rows.
test_data['Embarked'] = test_data['Embarked'].fillna(test_data['Embarked'].mode().iloc[0])

test_data = test_data.drop(columns=['PassengerId', 'Name'])

# Same ticket rule as for the training data: last whitespace-separated token
# when it is all digits, otherwise 0 (vectorised instead of `iterrows`).
ticket_last_token = test_data['Ticket'].astype(str).str.split().str[-1]
test_data['Ticket'] = (
    ticket_last_token
    .where(ticket_last_token.str.isdigit(), '0')
    .astype('int')
)

test_data = pd.get_dummies(test_data, columns=['Sex', 'Embarked'])

test_data = test_data.drop(['Embarked_C'], axis=1)
X_test = test_data.drop(['Sex_male'], axis=1)

# The scaler and model were fitted on X_train's columns; enforce the same
# columns in the same order (an indicator absent from the test split is
# filled with 0).
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

X_scaled_test = scaler.transform(X_test)
y_hat = model2.predict(X_scaled_test)

# Stack ids and predictions side by side as two columns.
y_hat = y_hat.reshape(-1, 1)
result = np.hstack((p_id.reshape(-1, 1), y_hat))
df = pd.DataFrame(result, columns=['PassengerId', 'Survived'])
df.to_csv('submission.csv', index=False)
df

### Final accuracy for logistic regression model: 0.77033

* Let's try an ANN to see if there are any improvements

## Artificial Neural Network
* Train the model with a simple neural network (2 Hidden Layers)
* Train the model with a deep neural network