# Import libraries

In [None]:
# linear algebra
import numpy as np

# data processing
import pandas as pd 

# data visualization
import seaborn as sns

# Logistic Regression 
from sklearn.linear_model import LogisticRegression

# split data
from sklearn.model_selection import train_test_split

# accuracy score
from sklearn.metrics import accuracy_score

# confusion matrix
from sklearn.metrics import confusion_matrix

# Performance metrics
from sklearn.metrics import classification_report

# Load Data

In [None]:
# Read the training split from the Kaggle Titanic input directory.
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
train_data.head()  # show the first 5 rows of the training dataset

In [None]:
# Read the test split from the Kaggle Titanic input directory.
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")
test_data.head()  # show the first 5 rows of the testing dataset

# Data Exploration

In [None]:
# Print a summary of every training column and its data type.
train_data.info() 

In [None]:
# Print a summary of every test column and its data type.
test_data.info() 

In [None]:
# Check whether the training data contains any null values.
train_data.isnull().values.any() 

In [None]:
# Check whether the test data contains any null values.
test_data.isnull().values.any() 

# Data Cleaning

In [None]:
# Normalise every column label to lowercase in both datasets.
train_data.columns = train_data.columns.str.lower()
test_data.columns = test_data.columns.str.lower()

In [None]:
# Give the abbreviated columns descriptive snake_case names.
# The same mapping is applied to the train and the test dataset.
column_aliases = {
    "passengerid": "passenger_id",
    "pclass": "passenger_class",
    "sibsp": "sibling_spouse",
    "parch": "parent_children",
}

for frame in (train_data, test_data):
    frame.rename(columns=column_aliases, inplace=True)

In [None]:
# Fill missing ages with random integers drawn from [mean - std, mean + std)
# and convert the age column to int on both datasets.

# A seeded generator keeps the imputed ages identical from run to run.
rng = np.random.default_rng(42)

for dataset in [train_data, test_data]:
    mean = dataset["age"].mean()
    std = dataset["age"].std()
    missing = dataset["age"].isnull()
    is_null = int(missing.sum())  # number of ages to generate

    # Make the integer bounds explicit instead of relying on the random
    # routine truncating floats silently; the upper bound is exclusive.
    random_age = rng.integers(int(mean - std), int(mean + std), size=is_null)

    # Write the generated values straight into the missing slots, then
    # convert the whole column to int.
    dataset.loc[missing, "age"] = random_age
    dataset["age"] = dataset["age"].astype(int)

In [None]:
# Fill the missing embarked values in the train dataset with the most
# frequent port. The result is assigned back to the column: calling
# fillna(inplace=True) on a column accessor is a chained operation that is
# deprecated and leaves the frame untouched under pandas Copy-on-Write.
train_data["embarked"] = train_data["embarked"].fillna(
    train_data["embarked"].mode()[0]
)

In [None]:
# Fill the missing fare values in the test dataset with the most frequent
# fare. The result is assigned back to the column: calling
# fillna(inplace=True) on a column accessor is a chained operation that is
# deprecated and leaves the frame untouched under pandas Copy-on-Write.
# NOTE(review): fare is continuous, so a median may be a more representative
# fill value; the mode is kept to preserve the existing behaviour.
test_data["fare"] = test_data["fare"].fillna(test_data["fare"].mode()[0])

In [None]:
# Convert the categorical sex column to numbers (female -> 0, male -> 1).
# The encoded column is assigned back explicitly: replace(inplace=True) on a
# selected column is a chained operation that is deprecated and has no effect
# under pandas Copy-on-Write. astype(int) pins the dtype regardless of how
# the installed pandas version infers the result of replace().
sex_codes = {"female": 0, "male": 1}
train_data["sex"] = train_data["sex"].replace(sex_codes).astype(int)
test_data["sex"] = test_data["sex"].replace(sex_codes).astype(int)

In [None]:
# Convert the embarked port letters to numbers (C -> 1, Q -> 2, S -> 3).
# The encoded column is assigned back explicitly: replace(inplace=True) on a
# selected column is a chained operation that is deprecated and has no effect
# under pandas Copy-on-Write. astype(int) requires that missing embarked
# values have already been filled; it raises otherwise.
embarked_codes = {"C": 1, "Q": 2, "S": 3}
train_data["embarked"] = train_data["embarked"].replace(embarked_codes).astype(int)
test_data["embarked"] = test_data["embarked"].replace(embarked_codes).astype(int)

In [None]:
# Remove the cabin, name and ticket columns from both datasets.
unused_columns = ["cabin", "name", "ticket"]
for frame in (train_data, test_data):
    frame.drop(columns=unused_columns, inplace=True)

In [None]:
# Check that the age values are in a proper range: smallest age.
train_data.age.min()

In [None]:
# Largest age in the training data.
train_data.age.max()

In [None]:
# Show the training data summary after cleaning.
train_data.info() 

In [None]:
# Show the test data summary after cleaning.
test_data.info() 

# Data Visualization and Analysis

In [None]:
# Did the passenger class make any difference to survival?
# The theme is set before plotting so that it applies to this figure, and the
# column is passed as x= because seaborn >= 0.12 no longer accepts it
# positionally.
sns.set_theme(style="darkgrid")
sns.countplot(x="passenger_class", data=train_data, hue="survived")

In [None]:
# Survival counts by sex.
# The theme is set before plotting, and the column is passed as x= because
# seaborn >= 0.12 no longer accepts it positionally.
sns.set_theme(style="darkgrid")
data = sns.countplot(x="sex", data=train_data, hue="survived")
# The tick labels are given in the order Female, Male, which relies on the
# sex column holding the codes 0 and 1 in that order.
data.set_xticklabels(["Female", "Male"])

In [None]:
   def age_group(age):
       """Return the age-bracket label for a numeric age.

       Brackets are half-open: [0, 10) 'Child', [10, 20) 'Teenager',
       [20, 30) 'Young adualt', [30, 50) 'Adualt' and 50 or more 'Old'.
       Returns None for a negative or NaN age.
       """
       # Negative and NaN ages fall outside every bracket.
       if not age >= 0:
           return None

       # Walk the brackets from youngest to oldest; the first upper bound
       # that exceeds the age decides the label.
       brackets = (
           (10, 'Child'),
           (20, 'Teenager'),
           (30, 'Young adualt'),
           (50, 'Adualt'),
       )
       for upper_bound, label in brackets:
           if age < upper_bound:
               return label
       return 'Old'
    
# Label every passenger with the age bracket returned by age_group.
train_data["age_group"] = train_data["age"].map(age_group)

In [None]:
# Survival counts per age bracket, ordered from youngest to oldest.
# The column is passed as x= because seaborn >= 0.12 no longer accepts it
# positionally.
data = sns.countplot(
    x="age_group",
    data=train_data,
    order=['Child', 'Teenager', 'Young adualt', 'Adualt', 'Old'],
    hue="survived",
)


In [None]:
def hasFamily(family):
    """Return True when the relatives count is at least one, else False."""
    return bool(family >= 1)

# Flag passengers travelling with at least one parent/child or one
# sibling/spouse; has_family is True when either flag is set.
has_sibling_spouse = train_data["sibling_spouse"] >= 1
has_parent_children = train_data["parent_children"] >= 1
train_data["has_family"] = has_parent_children | has_sibling_spouse

In [None]:
# Survival counts for passengers with and without family aboard.
# The column is passed as x= because seaborn >= 0.12 no longer accepts it
# positionally.
data = sns.countplot(x="has_family", data=train_data, hue="survived")

In [None]:
# Drop the helper columns that were created only for the analysis plots.
analysis_columns = ["has_family", "age_group"]
train_data.drop(columns=analysis_columns, inplace=True)

# Data Modeling & Prediction

In [None]:
# test_data has no 'survived' column, so the model's accuracy cannot be
# measured on it. Split the train data into two sets instead: 70% to build
# the model and 30% to test it.
# A fixed random_state makes the split, and therefore the reported metrics,
# reproducible from run to run.
train, test = train_test_split(train_data, test_size=0.3, random_state=42)

# Specify the input features, the target output and the testing dataset.
input_features = train.drop("survived", axis=1)
target = train["survived"]
for_testing = test.drop("survived", axis=1)

# Build the model using Logistic Regression.
model = LogisticRegression(solver='liblinear')
model.fit(input_features, target)

# Predict the output of the testing dataset.
predict = model.predict(for_testing)
predict

# Performance metrics

In [None]:
# Accuracy of the predictions against the true labels of the `test` split.
test_data_accuracy = accuracy_score(test["survived"], predict)
print('Accuracy score of testing data : ', test_data_accuracy)

In [None]:
# Print the classification report for the `test` split predictions.
print(classification_report( test["survived"], predict))

In [None]:
# Confusion matrix of the true labels against the predicted labels.
confusion_matrix(test["survived"], predict)

## Process for Submission File

In [None]:
# Predict the survival values for test_data and store them on the frame.
prediction = model.predict(test_data)
test_data["survived"] = prediction

# Remove the feature columns so that only the remaining columns are kept.
feature_columns = [
    "passenger_class",
    "sex",
    "age",
    "sibling_spouse",
    "parent_children",
    "fare",
    "embarked",
]
test_data.drop(columns=feature_columns, inplace=True)
test_data.head()