# Load Libary

In [389]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Import Data

In [390]:
data_train = pd.read_csv('./data/train.csv')
data_test = pd.read_csv('./data/test.csv')
data_train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

# Preprocessing

### Fill missing data

In [391]:
age_train_mean = data_train['Age'].mean().__round__()
age_test_mean = data_test['Age'].mean().__round__()
data_train['Age'].fillna(value=age_train_mean, inplace=True)
data_test['Age'].fillna(value=age_test_mean, inplace=True)
data_train['Embarked'].fillna(value='U', inplace=True)
data_test['Embarked'].fillna(value='U', inplace=True)
data_train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

### Drop unused columns

In [392]:
def drop(data):
    cols = ["Name", "Cabin", "PassengerId", "Parch", "Fare", "SibSp"]
    for col in cols:
        data.drop(col, axis=1,inplace=True)

passengerIds = data_test["PassengerId"]
drop(data_train)
drop(data_test)

### Label Encoding 

In [393]:
def encoding(data):
    cols = ['Sex', 'Ticket', 'Embarked', ]
    le = preprocessing.LabelEncoder()
    for col in cols:
        data[col] = le.fit_transform(data[col])

encoding(data_train)
encoding(data_test)
data_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Ticket,Embarked
0,0,3,1,22.0,523,2
1,1,1,0,38.0,596,0
2,1,3,0,26.0,669,2
3,1,1,0,35.0,49,2
4,0,3,1,35.0,472,2


# Logistic Regression

### split dataset into test and train

In [394]:
Y = data_train['Survived']
X = data_train.drop("Survived", axis=1)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, train_size=0.8, random_state=10)


### build the logistic model

In [395]:
clf = LogisticRegression(random_state=0, max_iter=10000).fit(X_train, Y_train)

### make predictions

In [396]:
predictions = clf.predict(X_test)
accuracy_score(Y_test, predictions)

0.8435754189944135

# Submission

In [397]:
submission_predictions = clf.predict(data_test)
submission_df = pd.DataFrame({"PassengerId" : passengerIds, "Survived" : submission_predictions})
submission_df.to_csv("submission.csv", index=False)