# Importing data #

In [None]:
from ipywidgets import Button, Layout

In [None]:
import warnings
# Silence every warning category for the rest of the notebook session.
warnings.simplefilter('ignore')

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

In [None]:
# Load the passenger table from the CSV file in the working directory.
titanic_data = pd.read_csv('titanic.csv')
# Preview the first five rows to check that the columns were parsed.
print(titanic_data.head(5))

In [None]:
# Count the missing values in every column of the raw table.
titanic_data.isna().sum()

# Cleaning the data

In [None]:
# Remove the attributes that are not used as predictors of 'survived'.
titanic_data = titanic_data.drop(
    columns=['cabin', 'boat', 'body', 'home.dest'],
)

In [None]:
# Re-count the missing values per column after dropping unused attributes.
titanic_data.isna().sum()

In [None]:
# Replace missing values in the 'age' column with the column mean.
# The result is assigned back to the frame: calling fillna(inplace=True) on a
# column selection is chained assignment, which is deprecated in pandas 2.x
# and does not modify the frame when Copy-on-Write is enabled.
titanic_data['age'] = titanic_data['age'].fillna(titanic_data['age'].mean())

In [None]:
# Most frequent value(s) of 'embarked'; printed for inspection.
embarked_mode = titanic_data['embarked'].mode()
print(embarked_mode)
# Replace missing values in the 'embarked' column with the mode value.
# The result is assigned back to the frame: calling fillna(inplace=True) on a
# column selection is chained assignment, which is deprecated in pandas 2.x
# and does not modify the frame when Copy-on-Write is enabled.
titanic_data['embarked'] = titanic_data['embarked'].fillna(embarked_mode[0])

In [None]:
# List the distinct values present in the 'sex' column.
pd.unique(titanic_data['sex'])

In [None]:
# List the distinct values present in the 'embarked' column.
pd.unique(titanic_data['embarked'])


In [None]:
# Convert the non-numeric 'sex' and 'embarked' values into numeric codes.
titanic_data = titanic_data.replace(
    {
        'sex': {'male': 0, 'female': 1},
        'embarked': {'S': 0, 'C': 1, 'Q': 2},
    }
)

In [None]:
# Discard every row that still contains at least one missing value.
titanic_data = titanic_data.dropna(axis=0, how='any')

In [None]:
# Final per-column count of missing values after cleaning.
titanic_data.isna().sum()

In [None]:
# Inspect the data type of every remaining column.
titanic_data.dtypes

# Splitting the dataset into input data and the variable we want to predict

In [None]:
# Target: the attribute that we want to predict.
Y = titanic_data['survived']
# Inputs: everything except the target and the two attributes
# ('name', 'ticket') that the model does not need.
X = titanic_data.drop(columns=['name', 'ticket', 'survived'])

## Normalizing data

In [None]:
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import RobustScaler
from matplotlib import pyplot

# Raw feature matrix as a NumPy array.
data = X.to_numpy()
# Perform a robust scaler transform of the dataset.
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split, so statistics of the test rows influence the scaling.
trans = RobustScaler()
data = trans.fit_transform(data)
# Convert the array back to a dataframe. X's index and column names are kept
# so that `dataset` stays aligned with `Y` by label and the summary and
# histograms below are labelled with the real feature names instead of 0..6.
dataset = pd.DataFrame(data, index=X.index, columns=X.columns)
# Summarize the scaled features.
print(dataset.describe())
# Histograms of the scaled variables.
dataset.hist()
pyplot.show()

## Splitting data into train (80%) and test (20%) subsets

In [None]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split,
# and therefore the tuned parameters and reported accuracies, reproducible
# from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    dataset, Y, test_size=0.2, random_state=42
)

## Searching for the parameter values that give the highest model accuracy

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Candidate values: seven regularisation strengths from 1e-3 to 1e3 and two
# penalty types (l1 = lasso, l2 = ridge).
grid = {"C": np.logspace(-3, 3, 7), "penalty": ["l1", "l2"]}
# The default 'lbfgs' solver does not support the l1 penalty, so every "l1"
# candidate would fail to fit and never be evaluated. 'liblinear' supports
# both l1 and l2, so the whole grid is actually searched.
logreg = LogisticRegression(solver="liblinear")
# 10-fold cross-validated search over the grid, fitted on the training subset.
logreg_cv = GridSearchCV(logreg, grid, cv=10)
logreg_cv.fit(X_train, Y_train)

print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
print("accuracy :",logreg_cv.best_score_)

## Creating logistic regression model

In [None]:
# Logistic regression classifier with the inverse regularisation strength C
# fixed at 0.1.
# NOTE(review): the value is hard-coded rather than read from
# logreg_cv.best_params_ — confirm it matches the grid-search result.
model = LogisticRegression(C=0.1)

In [None]:
# Train the classifier on the training subset.
model.fit(X_train, Y_train)

In [None]:
# Predicted 'survived' labels for the rows the model was trained on.
X_train_prediction = model.predict(X_train)

### Testing accuracy

In [None]:
# Share of training rows whose predicted label equals the true label.
training_data_accuracy = accuracy_score(
    y_true=Y_train,
    y_pred=X_train_prediction,
)
print('Accuracy score of training data : ', training_data_accuracy)

In [None]:
# Predict labels for the held-out rows and measure how many are correct.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(
    y_true=Y_test,
    y_pred=X_test_prediction,
)
print('Accuracy score of test data : ', test_data_accuracy)

In [None]:
# Intercept (bias) term of the fitted logistic regression.
model.intercept_

In [None]:
# Coefficients of the fitted logistic regression, one per input feature.
model.coef_

In [None]:
# Mean accuracy over the whole dataset. The model was fitted on the
# RobustScaler-transformed features (`dataset`), so it must be scored on
# those as well; scoring on the unscaled `X` feeds it values on a different
# scale from the one it was trained with.
model.score(dataset, Y)

### Testing the model on new cases
### Parameter I - passenger class (1, 2 or 3)
### Parameter II - sex (woman: 1, man: 0)
### Parameter III - age
### Parameter IV - number of the person's siblings/spouses on the Titanic
### Parameter V - number of the person's children on the Titanic
### Parameter VI - passenger fare (cost of the ticket)
### Parameter VII - port of embarkation (Southampton: 0; Cherbourg: 1; Queenstown: 2)

In [None]:
# Feature values for one new passenger, in the order listed above:
# class, sex, age, siblings/spouses, children, fare, port of embarkation.
input_data = (2, 1, 1, 0, 0, 72, 0)

In [None]:
# Convert the tuple of feature values into a 1-D NumPy array.
input_data_as_numpy_array = np.asarray(input_data)

In [None]:
# Add a leading axis so the 1-D array becomes a single-row 2-D array.
input_data_reshaped = input_data_as_numpy_array[np.newaxis, :]

In [None]:
# The model was trained on RobustScaler-transformed features, so the new case
# has to pass through the same fitted scaler before prediction; raw values
# would be on a different scale from the training data.
# The training columns are reused so the row carries the same feature labels
# the model was fitted with.
scaled_input = pd.DataFrame(
    trans.transform(input_data_reshaped),
    columns=X_train.columns,
)
prediction = model.predict(scaled_input)
# Print the predicted outcome (0 = did not survive, 1 = survived).
if prediction[0] == 0:
    print("Dead")
elif prediction[0] == 1:
    print("Alive")

In [None]:
# Apply the fitted scaler first: the model was trained on scaled features, so
# probabilities computed from raw values would not be meaningful.
# The training columns are reused so the row carries the same feature labels
# the model was fitted with.
scaled_case = pd.DataFrame(
    trans.transform(input_data_reshaped),
    columns=X_train.columns,
)
my_list = model.predict_proba(scaled_case).tolist()
# The single row holds the probabilities of the first and second class, read
# here as 0 (dead) and 1 (alive).
dead, alive = my_list[0][0], my_list[0][1]

In [None]:
# Report both class probabilities with eight decimal places.
print("Chances of death:")
print(f"{float(dead):.8f}")
print("Chances of survival:")
print(f"{float(alive):.8f}")