In [125]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import cross_val_score, StratifiedKFold, learning_curve, train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.preprocessing import LabelEncoder


from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from joblib import dump, load

from sklearn.feature_selection import RFECV

In [126]:
# Read both CSV files, then stack them (train rows first) under a fresh
# 0..n-1 index so the feature engineering below runs once over all rows.
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")
df_data = pd.concat((train_data, test_data), axis=0, ignore_index=True)

In [127]:
# Extract the title from Name: the run of letters that follows a space and is
# terminated by a period (e.g. "Mr" in "Cumings, Mr. John Bradley").
# The pattern is a raw string so `\.` reaches the regex engine as an escaped
# dot instead of being an invalid Python string escape sequence.
df_data['Title'] = df_data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Collapse the listed titles into a single 'Rare' bucket.
df_data['Title'] = df_data['Title'].replace(
    ['Capt', 'Col', 'Countess', 'Don', 'Dr', 'Dona', 'Jonkheer', 'Major', 'Rev', 'Sir'],
    'Rare',
)
# Fold spelling variants into their common form.
df_data['Title'] = df_data['Title'].replace(['Mlle', 'Ms', 'Mme'], 'Miss')
df_data['Title'] = df_data['Title'].replace(['Lady'], 'Mrs')

# Integer-encode the title; any title missing from this mapping becomes NaN.
df_data['Title'] = df_data['Title'].map({"Mr": 0, "Rare": 1, "Master": 2, "Miss": 3, "Mrs": 4})

# Median age per encoded title (displayed as the cell output).
Ti = df_data.groupby('Title')['Age'].median()
Ti

Title
0    29.0
1    47.0
2     4.0
3    22.0
4    36.0
Name: Age, dtype: float64

In [128]:
# Median age per title code, indexed by the title code itself.
title_median_age = df_data.groupby('Title')['Age'].median()
# Positional array of the same medians, kept under its original name.
Ti_pred = title_median_age.values

# Filling the missing age: each missing Age takes the median of the
# passenger's own title group. The median is looked up by title *value*
# rather than by array position, so the result does not depend on the groups
# being exactly the codes 0..4 in that order.
df_data['Ti_Age'] = df_data['Age'].fillna(df_data['Title'].map(title_median_age))
# Truncate to whole years; this raises if any age is still missing.
df_data['Ti_Age'] = df_data['Ti_Age'].astype('int')
# 1 when the (imputed) age is below 16, else 0.
df_data['Ti_Minor'] = (df_data['Ti_Age'] < 16.0) * 1

In [129]:
# Replace missing fares with the median fare of the combined frame.
df_data['Fare'] = df_data['Fare'].fillna(value=df_data['Fare'].median())

# Quantile-bin the fare at three resolutions (4, 5 and 6 bins).
for n_bins in (4, 5, 6):
    df_data['FareBin_%d' % n_bins] = pd.qcut(df_data['Fare'], q=n_bins)

# Turn each set of interval bins into integer codes. The encoder is re-fitted
# for every column, so it ends up fitted on the 6-bin column.
label = LabelEncoder()
for n_bins in (4, 5, 6):
    df_data['FareBin_Code_%d' % n_bins] = label.fit_transform(df_data['FareBin_%d' % n_bins])



In [130]:
# Family size = the passenger (1) plus the SibSp and Parch counts.
df_data['FamilySize'] = 1 + df_data['SibSp'] + df_data['Parch']

In [133]:
# Collect every passenger whose ticket number is shared with at least one
# other row, keeping the rows of each ticket together.
ticket_cols = ['Name', 'Ticket', 'Fare', 'Cabin', 'FamilySize', 'Survived']

# One grouped pass instead of a full-frame scan per unique ticket.
# sort=False yields the groups in order of first appearance, which is the
# order Ticket.unique() produced, so the row order of the result is unchanged.
shared_groups = [
    grp[ticket_cols]
    for _, grp in df_data.groupby('Ticket', sort=False)
    if grp['Fare'].count() > 1  # same criterion as before: more than one non-null Fare
]

# pd.concat raises on an empty list, so fall back to an empty frame that has
# the same columns when no ticket is shared.
deplicate_ticket = pd.concat(shared_groups) if shared_groups else df_data.iloc[0:0][ticket_cols]
deplicate_ticket.head(14)

Unnamed: 0,Name,Ticket,Fare,Cabin,FamilySize,Survived
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",PC 17599,71.2833,C85,2,1.0
1125,"Cumings, Mr. John Bradley",PC 17599,71.2833,C85,2,
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",113803,53.1,C123,2,1.0
137,"Futrelle, Mr. Jacques Heath",113803,53.1,C123,2,0.0
6,"McCarthy, Mr. Timothy J",17463,51.8625,E46,1,0.0
1037,"Hilliard, Mr. Herbert Henry",17463,51.8625,E46,1,
7,"Palsson, Master. Gosta Leonard",349909,21.075,,5,0.0
24,"Palsson, Miss. Torborg Danira",349909,21.075,,5,0.0
374,"Palsson, Miss. Stina Viola",349909,21.075,,5,0.0
567,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",349909,21.075,,5,0.0


In [134]:

# the same ticket family or friends
# Connected_Survival summarises what is known about the *other* passengers on
# the same ticket:
#   1   -> at least one of them has Survived == 1
#   0   -> none has Survived == 1, but at least one has Survived == 0
#   0.5 -> nothing is known (ticket not shared, or the others have no label)
df_data['Connected_Survival'] = 0.5  # default
for _, df_grp in df_data.groupby('Ticket'):
    if len(df_grp) > 1:
        for ind in df_grp.index:
            # Outcomes of everyone on this ticket except the current passenger;
            # max()/min() skip missing values.
            others = df_grp['Survived'].drop(ind)
            # Write by index label: df_data has a unique 0..n-1 index (built with
            # ignore_index=True), so this targets exactly the current row without
            # scanning the whole frame for its PassengerId.
            if others.max() == 1.0:
                df_data.loc[ind, 'Connected_Survival'] = 1
            elif others.min() == 0.0:
                df_data.loc[ind, 'Connected_Survival'] = 0
#print
print('people keep the same ticket: %.0f '%len(deplicate_ticket))
print("people have connected information : %.0f" 
      %(df_data[df_data['Connected_Survival']!=0.5].shape[0]))
df_data.groupby('Connected_Survival')[['Survived']].mean().round(3)

people keep the same ticket: 596 
people have connected information : 496


Unnamed: 0_level_0,Survived
Connected_Survival,Unnamed: 1_level_1
0.0,0.225
0.5,0.298
1.0,0.728


In [135]:
# Cut the engineered frame back into its two parts by position: the first
# len(train_data) rows, then the rest. Both slices are built before either
# name is rebound, so the same boundary is used for both.
train_data, test_data = (
    df_data.iloc[:len(train_data)],
    df_data.iloc[len(train_data):],
)

In [136]:
# Drop the listed columns from the training frame, then discard the rows
# that have no Embarked value.
train_data = (
    train_data
    .drop(columns=['Cabin', 'Name', 'PassengerId', 'Ticket', 'Age'])
    .dropna(subset=['Embarked'])
)

In [137]:
# Drop the listed columns from the test frame; PassengerId is kept here
# because the submission cells read it.
test_data = test_data.drop(columns=['Cabin', 'Name', 'Ticket', 'Age', 'Survived'])

In [138]:
labelencoder = LabelEncoder()

# Encode the Sex and Embarked columns as integers.
# The mapping is learned on the training split only and then re-used for the
# test split, so both frames share one label -> code mapping; transform()
# raises if the test split holds a label that never occurs in training, rather
# than silently assigning it a conflicting code.
# Columns are addressed by name instead of by position, so the result does not
# depend on column order and the encoded columns get an integer dtype.
for col in ('Sex', 'Embarked'):
    train_data[col] = labelencoder.fit_transform(train_data[col])
    test_data[col] = labelencoder.transform(test_data[col])

In [139]:
# Missing-value count per column of the training frame.
train_data.isnull().sum()

Survived              0
Pclass                0
Sex                   0
SibSp                 0
Parch                 0
Fare                  0
Embarked              0
Title                 0
Ti_Age                0
Ti_Minor              0
FareBin_4             0
FareBin_5             0
FareBin_6             0
FareBin_Code_4        0
FareBin_Code_5        0
FareBin_Code_6        0
FamilySize            0
Connected_Survival    0
dtype: int64

In [140]:
# Missing-value count per column of the test frame.
test_data.isnull().sum()

PassengerId           0
Pclass                0
Sex                   0
SibSp                 0
Parch                 0
Fare                  0
Embarked              0
Title                 0
Ti_Age                0
Ti_Minor              0
FareBin_4             0
FareBin_5             0
FareBin_6             0
FareBin_Code_4        0
FareBin_Code_5        0
FareBin_Code_6        0
FamilySize            0
Connected_Survival    0
dtype: int64

In [100]:
# Feature list kept from the original cell (not referenced by the cells shown here).
f = ['Sex', 'Pclass', 'FareBin_Code_5', 'Ti_Minor', 'FamilySize']
# Target vector, selected by column name so it stays correct even if the
# column order of train_data changes.
y = train_data['Survived'].values

# Split the data


In [141]:
# Three feature lists that differ only in which fare-bin code (4, 5 or 6 bins)
# they include.
b4, b5, b6 = (
    ['Sex', 'Pclass', 'FareBin_Code_%d' % n_bins, 'Ti_Minor', 'Connected_Survival']
    for n_bins in (4, 5, 6)
)

# One identically configured random forest per feature list; fit() returns the
# fitted estimator itself.
b4_Model, b5_Model, b6_Model = (
    RandomForestClassifier(
        n_estimators=250,
        min_samples_split=20,
        oob_score=True,
        random_state=2,
    ).fit(train_data[cols], y)
    for cols in (b4, b5, b6)
)

# Out-of-bag score of each forest.
print('b4 oob score :%.5f' % b4_Model.oob_score_)
print('b5 oob score :%.5f ' % b5_Model.oob_score_)
print('b6 oob score : %.5f' % b6_Model.oob_score_)

b4 oob score :0.84364
b5 oob score :0.84477 
b6 oob score : 0.84364


In [142]:
# submits
# Predict with the forest fitted on the b5 feature list and write the
# PassengerId / Survived pairs to the submission file.
X_Submit = test_data.drop(columns=['PassengerId'])

b5_pred = b5_Model.predict(X_Submit[b5])

submit = pd.DataFrame({
    "PassengerId": test_data['PassengerId'],
    "Survived": b5_pred.astype(int),
})
submit.to_csv("gender_submission.csv", index=False)

In [143]:
# Hold out 20% of the training rows (b5 features) for validation, with a
# fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    train_data[b5],
    y,
    test_size=0.2,
    random_state=0,
)

In [144]:


# Candidate models, keyed by the label shown in the results table.
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(
        random_state=2, n_estimators=250, min_samples_split=20, oob_score=True
    ),
    'Support Vector Machine': SVC(probability=True, random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(criterion='entropy', random_state=0),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
}

# Metrics per model: {label: {"Accuracy": ..., "ROC AUC": ...}}
results = {}

# Fit each candidate on the training split and score it on the validation split.
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)

    # Hard labels feed accuracy; the second predict_proba column feeds ROC AUC.
    y_pred = clf.predict(X_val)
    y_pred_proba = clf.predict_proba(X_val)[:, 1]

    accuracy = accuracy_score(y_val, y_pred)
    roc_auc = roc_auc_score(y_val, y_pred_proba)

    results[name] = {"Accuracy": accuracy, "ROC AUC": roc_auc}

# One row per model, one column per metric.
results_df = pd.DataFrame(results).T
print(results_df)

                        Accuracy   ROC AUC
Logistic Regression     0.741573  0.816960
Random Forest           0.764045  0.819700
Support Vector Machine  0.758427  0.800391
K-Nearest Neighbors     0.775281  0.816960
Decision Tree           0.758427  0.818852
Gradient Boosting       0.769663  0.831050


In [107]:
# Single-step pipeline wrapping the gradient-boosting classifier.
model_pipeline = Pipeline([
    ('Gradient Boosting', GradientBoostingClassifier(n_estimators=100, random_state=42)),
])

# Train on the training split only (the validation rows are not used here).
model_pipeline.fit(X_train, y_train)

# Persist the fitted pipeline; dump() returns the list of written file names,
# which is what the cell displays.
dump(model_pipeline, 'best_model_pipeline.joblib')

['best_model_pipeline.joblib']

In [109]:
# Reload the persisted pipeline and build a submission from it.
model = load('best_model_pipeline.joblib')
# submits
X_Submit = test_data.drop(columns=['PassengerId'])

# NOTE(review): despite the variable name, these predictions use the b5
# feature list.
b4_pred = model.predict(X_Submit[b5])

submit = pd.DataFrame({
    "PassengerId": test_data['PassengerId'],
    "Survived": b4_pred.astype(int),
})
# NOTE(review): this is the same file name the earlier submission cell writes
# to, so that file is overwritten — confirm this is intended.
submit.to_csv("gender_submission.csv", index=False)