In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# Load the passenger table that the later cells clean, plot and model.
# NOTE(review): this reads test.csv, and the model further down is fitted on this same
# frame with labels taken from gender_submission.csv - confirm train.csv was not intended.
df = pd.read_csv('/kaggle/input/titanic/test.csv')
df.head()

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
df.dtypes

In [None]:
# Encode Sex as an integer column: male -> 1, female -> 0
sex_codes = {'male': 1, 'female': 0}
df['Sex'] = df['Sex'].map(sex_codes)
df.head()

In [None]:
df.shape

In [None]:
df.isna().sum()

In [None]:
# Cabin is NaN in 327 of the 418 rows, so the column is dropped rather than imputed
df.drop(columns=['Cabin'], inplace=True)

In [None]:
df.shape

In [None]:
# Replace the NaNs in the Fare column with the column mean.
# The result is assigned back to the column instead of calling fillna(inplace=True)
# on df.Fare: the in-place form acts on an intermediate Series, which pandas warns
# about and which leaves df unchanged once Copy-on-Write is in effect.
df['Fare'] = df['Fare'].fillna(df['Fare'].mean())

In [None]:
df.head()

In [None]:
df.isna().sum()

In [None]:
df.tail()

In [None]:
df.Parch.value_counts()

In [None]:
# Before visualizing, load the Survived values that the plots and the model below use as the outcome.
# NOTE(review): gender_submission.csv looks like a sample-submission file rather than
# recorded outcomes - confirm these are the labels you intend to plot and train on.
output = pd.read_csv('/kaggle/input/titanic/gender_submission.csv')
output.shape

In [None]:
output.head()

In [None]:
# Count Survived values for each Embarked value and draw them as grouped bars
sub_df = pd.crosstab(index=df['Embarked'], columns=output['Survived'])
ax = sub_df.plot.bar(color=['salmon', 'lightblue'])
ax.legend(['Not Survived', 'Survived'])
ax.set_ylabel('Frequency')
ax.set_xlabel('Embarked')
plt.show()

In [None]:
# Count Survived values for each passenger class and draw them as grouped bars
sub_df = pd.crosstab(df.Pclass, output.Survived)
ax = sub_df.plot(kind='bar', color=['salmon', 'lightblue'])
# crosstab orders the Survived columns ascending (0, then 1), so the legend has to
# list 'Not Survived' first - the same order the Embarked chart uses. The previous
# ['Survived', 'Not Survived'] order labelled the two bars the wrong way round.
ax.legend(['Not Survived', 'Survived'])
ax.set_ylabel('Frequency')
ax.set_xlabel('Class')
plt.show()

In [None]:
df['Sex'].value_counts()

In [None]:
# Count passengers of each encoded Sex value (0 = female, 1 = male) per class
sub_df = pd.crosstab(index=df['Pclass'], columns=df['Sex'])
ax = sub_df.plot.bar(color=['salmon', 'lightblue'])
ax.legend(['Female', 'Male'])
ax.set_ylabel('Frequency')
ax.set_xlabel('Class')
plt.show()

In [None]:
# Attach the Survived values from `output` to the passenger table (aligned on the row index)
df['Survived'] = output['Survived']

In [None]:
df.head()

In [None]:
# Count passengers of each encoded Sex value (0 = female, 1 = male) per Survived value
sub_df = pd.crosstab(index=df['Survived'], columns=df['Sex'])
ax = sub_df.plot.bar(color=['salmon', 'lightblue'])
ax.legend(['Female', 'Male'])
ax.set_ylabel('Frequency')
ax.set_xlabel('Survival Status')
plt.show()

In [None]:
# Histogram of the Age column
df['Age'].plot.hist(color='lightblue')
plt.show()

In [None]:
df['Age'].mean()

In [None]:
# Replace the NaNs in the Age column with the column mean.
# The result is assigned back to the column instead of calling fillna(inplace=True)
# on df['Age']: the in-place form acts on an intermediate Series, which pandas warns
# about and which leaves df unchanged once Copy-on-Write is in effect.
df['Age'] = df['Age'].fillna(df['Age'].mean())

In [None]:
df.head(10)

In [None]:
df.isna().sum()

In [None]:
df.dtypes

In [None]:
# Remove the Survived column again so that only feature columns remain in df
df.drop(columns=['Survived'], inplace=True)

In [None]:
# Remove the Name and Ticket columns, then encode Embarked as integers (Q -> 0, S -> 1, C -> 2)
df.drop(columns=['Name', 'Ticket'], inplace=True)
embarked_codes = {'Q': 0, 'S': 1, 'C': 2}
df['Embarked'] = df['Embarked'].map(embarked_codes)

In [None]:
df.dtypes

In [None]:
df.shape

In [None]:
# Split the data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Create x (features) and y (target) DataFrames
df['Survived'] = output.Survived
x = df.drop('Survived', axis=1)
y = df['Survived']

# Hold out 20% of the rows. A fixed random_state makes the split, and therefore
# every metric computed further down, identical on each Restart & Run All.
x_tr_raw, x_tst_raw, y_tr, y_tst = train_test_split(x, y, test_size=0.2, random_state=42)

# Scale the data.
# The scaler is fitted on the training rows only, so the held-out rows do not
# influence the mean/variance used for scaling; the same fitted scaler is then
# applied to both splits.
scaler = StandardScaler()
scaler.fit(x_tr_raw)

# Wrap the scaled arrays in DataFrames with positional column labels, keeping
# each split's original row index.
x_tr = pd.DataFrame(scaler.transform(x_tr_raw), index=x_tr_raw.index)
x_tst = pd.DataFrame(scaler.transform(x_tst_raw), index=x_tst_raw.index)

# Scaled copy of the full feature table, kept under its existing name
X = pd.DataFrame(scaler.transform(x))

x_tr.shape, x_tst.shape

In [None]:
# Build a Classification Model
from sklearn.linear_model import LogisticRegression

# Logistic regression created with its default constructor arguments
lr = LogisticRegression()

# Train on the training split; the fitted estimator is the value the cell displays
lr.fit(X=x_tr, y=y_tr)

In [None]:
# Evaluate the metrics: the model's score on the training split and on the held-out split
train_score = lr.score(x_tr, y_tr)
test_score = lr.score(x_tst, y_tst)
train_score, test_score

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_preds(y_true, y_preds):
    """Score predictions against the true labels and print a short report.

    Parameters
    ----------
    y_true : array-like
        True labels.
    y_preds : array-like
        Predicted labels, passed to each metric together with ``y_true``.

    Returns
    -------
    dict
        The keys 'accuracy', 'precision', 'recall' and 'f1', each mapped to
        the corresponding score rounded to two decimal places.
    """
    # (result key, label used in the printed report, scoring function)
    scorers = (
        ('accuracy', 'Accuracy', accuracy_score),
        ('precision', 'Precision', precision_score),
        ('recall', 'Recall', recall_score),
        ('f1', 'F1 Score', f1_score),
    )

    # Compute every metric first so nothing is printed if one of them fails
    metric_dict = {}
    for key, _, scorer in scorers:
        metric_dict[key] = round(scorer(y_true, y_preds), 2)

    for key, label, _ in scorers:
        print(f"{label}: {metric_dict[key]}")

    return metric_dict

In [None]:
# Predict labels for the held-out split and report the metrics for them
y_preds = lr.predict(X=x_tst)
evaluate_preds(y_true=y_tst, y_preds=y_preds)

In [None]:
# Display the Confusion Matrix
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Use the classifier's own class order for both the matrix rows/columns and the tick labels
class_labels = lr.classes_
matrix = confusion_matrix(y_true=y_tst, y_pred=y_preds, labels=class_labels)
graph = ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=class_labels)

graph.plot()
plt.show()

In [None]:
# Import ROC curve from the sklearn.metrics

# Plot ROC curve and calculate AUC metric
from sklearn.metrics import RocCurveDisplay, roc_auc_score

# AUC is computed from a continuous score per row, so pass the predicted probability
# of the positive class (the second column of predict_proba) rather than the hard
# 0/1 predictions; this makes the printed value agree with the curve plotted below.
y_scores = lr.predict_proba(x_tst)[:, 1]
print(roc_auc_score(y_tst, y_scores), '\n')
RocCurveDisplay.from_estimator(lr, x_tst, y_tst);

In [None]:
# Feature Importance
#
# Feature importance asks which features contribute the most to the model's
# outcomes, and in which direction. How it is obtained differs between ML models;
# here each feature name is paired with the matching entry of the first row of lr.coef_.
feature_dict = {name: weight for name, weight in zip(x.columns, lr.coef_[0])}
feature_dict

In [None]:
# One-row frame of coefficients, transposed so each feature becomes one bar
features = pd.DataFrame(feature_dict, index=[0])
ax = features.transpose().plot(kind='bar', title='Feature Importance', color='lightblue', legend=False)
plt.show()

In [None]:
# Cleaning the test dataset
test = pd.read_csv('/kaggle/input/titanic/test.csv')
# Dropping the unnecessary features
test = test.drop(columns=['Cabin', 'Name', 'Ticket'])
# Filling the missing data
test['Age'] = test['Age'].fillna(test['Age'].mean())
test['Fare'] = test['Fare'].fillna(test['Fare'].mean())
# Encode Sex and Embarked with the same integer codes used for the modelling frame
test['Sex'] = test['Sex'].map({'male': 1, 'female': 0})
test['Embarked'] = test['Embarked'].map({'Q': 0, 'S': 1, 'C': 2})
# Scale the data with the scaler that was already fitted for the model.
# It is deliberately NOT re-fitted here: fitting on the data being predicted would
# scale it with different statistics from the ones the model was trained under.
X_test = pd.DataFrame(scaler.transform(test))
# Make predictions
y_pred = lr.predict(X_test)
y_pred

In [None]:
# Build the submission from a copy of `output`. Plain assignment would only alias it,
# and writing the predictions into the alias would overwrite the Survived values that
# earlier cells read from `output` (re-running them would then train on predictions).
submission = output.copy()
submission['Survived'] = y_pred.astype(int)
submission.to_csv('submission.csv', index=False)

In [None]:
# Save the model and Load it
import pickle

# Context managers close the file handles deterministically, so the pickle is fully
# flushed to disk before it is read back.
with open('Model.pkl', 'wb') as model_file:
    pickle.dump(lr, model_file)

# pickle.load can execute arbitrary code, so only unpickle trusted files;
# this one was written by the lines directly above.
with open('Model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# Read 'FinalData.csv' from the working directory and show its Survived column.
# NOTE(review): no cell in this notebook writes 'FinalData.csv' (only 'submission.csv'
# and 'Model.pkl' are written), so this presumably relies on a file created elsewhere - confirm.
DF = pd.read_csv('FinalData.csv')
DF.Survived