In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

In [4]:
df = pd.read_csv('dataset/Titanic-Dataset.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
len(df)

891

In [5]:
df.Embarked.value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [7]:
cat_features = df.select_dtypes(include=['object']).columns.tolist()
num_features = df.select_dtypes(include=['int', 'float']).columns.tolist()

In [8]:
cat_features

['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

In [9]:
num_features

['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

In [10]:
# To see the class distrubution of Embarked column
df.Embarked.value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [11]:
from sklearn.impute import SimpleImputer
mos_frequesnt = SimpleImputer(strategy='most_frequent')
df['Embarked'] = mos_frequesnt.fit_transform(df[['Embarked']])

ValueError: 2

In [13]:
categorial_features = ['Sex', 'Embarked']
nummerical_features = ['Age', 'SibSp', 'Parch', 'Fare']

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MinMaxScaler


In [17]:
ct = ColumnTransformer(
    [("text_preprocess", CountVectorizer(), "documents"),
     ("num_preprocess", MinMaxScaler(), ["width"])])

#Mode 
#OrdinalEncoder

cat_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
num_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

In [18]:
ct

0,1,2
,transformers,"[('text_preprocess', ...), ('num_preprocess', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False


In [19]:
categorical_features = ['Sex', 'Embarked']
category_orders = [
    ['female', 'male'], 
    ['S', 'C', 'Q']     
]
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')), 
    ('ordinal', OrdinalEncoder(
        categories=category_orders,
        handle_unknown='use_encoded_value', 
        unknown_value=-1                   
    ))                                    
])

In [20]:
numerical_features = ['Age', 'Fare', 'Pclass', 'SibSp', 'Parch']
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())                    
])

In [21]:
#drop means other columns will not be considered! We only use 8 columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop'  
)

In [22]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [None]:
#This function organizes all the preprocessing steps and model training into a single object/ one pipeline!

lr=LogisticRegression()
model =Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', lr)
])

In [24]:
model

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,"[['female', 'male'], ['S', 'C', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [28]:
final_columns = categorial_features + numerical_features + ['Survived']
df_processed = pd.DataFrame(df, columns=final_columns)
df_processed.head()

Unnamed: 0,Sex,Embarked,Age,Fare,Pclass,SibSp,Parch,Survived
0,male,S,22.0,7.25,3,1,0,0
1,female,C,38.0,71.2833,1,1,0,1
2,female,S,26.0,7.925,3,0,0,1
3,female,S,35.0,53.1,1,1,0,1
4,male,S,35.0,8.05,3,0,0,0


In [None]:
#Should calculate the accuracy score!
results = model.predict(X_test)
results

In [None]:
from sklearn metrics import accuracy_score

In [None]:
import pickle
with open('titanic_model.pkl', 'wb') as f:
    pickle.dump(lr,f)

In [None]:
#reload the model
with open('titanic_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

In [None]:
#Inferecing 


In [29]:
df_processed.head()

Unnamed: 0,Sex,Embarked,Age,Fare,Pclass,SibSp,Parch,Survived
0,male,S,22.0,7.25,3,1,0,0
1,female,C,38.0,71.2833,1,1,0,1
2,female,S,26.0,7.925,3,0,0,1
3,female,S,35.0,53.1,1,1,0,1
4,male,S,35.0,8.05,3,0,0,0


In [34]:
df_processed.columns

Index(['Sex', 'Embarked', 'Age', 'Fare', 'Pclass', 'SibSp', 'Parch',
       'Survived'],
      dtype='object')

In [31]:
df_processed['Parch'].unique()

array([0, 1, 2, 5, 3, 4, 6])

In [None]:
ge

In [None]:
#0 means not_survived! 
