In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
train = pd.read_csv("/kaggle/input/titanic/train.csv")
test = pd.read_csv("/kaggle/input/titanic/test.csv")

train['train_test'] = 1
test['train_test'] = 0
test['Survived'] = np.NaN
data = pd.concat([train,test])

%matplotlib inline
data.columns

In [None]:
# Split the training columns into a numeric group and a categorical group
# for the exploratory plots and tables.
numeric_columns = ["Age", "SibSp", "Parch", "Fare"]
categorical_columns = ["Survived", "Pclass", "Sex", "Cabin", "Embarked", "Ticket"]

df_num = train[numeric_columns]
df_cat = train[categorical_columns]

In [None]:
# Draw one histogram per numeric column, each on its own 8x6 figure.
for feature in df_num.columns:
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.hist(df_num[feature])
    ax.set_xlabel(feature)  # the x-axis carries the column name
    ax.set_ylabel('Frequency')
    ax.set_title(f'Histogram of {feature}')
    plt.show()
    

In [None]:
# Pairwise correlations between the numeric features. The matrix is computed
# once and reused for both the heatmap and the printed table.
num_corr = df_num.corr()
sns.heatmap(num_corr)
print(num_corr)

In [None]:
# Per-survival-group aggregate of the numeric features; no aggfunc is given,
# so pandas applies its default (the mean).
pivot_table = pd.pivot_table(
    train,
    values=["Age", "SibSp", "Fare", "Parch"],
    index="Survived",
)
pivot_table


In [None]:
# Bar chart of the value frequencies of every categorical column.
for feature in df_cat.columns:
    counts = df_cat[feature].value_counts()
    ax = sns.barplot(x=counts.index, y=counts)
    ax.set_title(feature)
    plt.show()

In [None]:
# Print a survival cross-tab for each categorical feature. Each cell holds the
# number of non-null Ticket entries for that (Survived, feature value) pair.
for feature in ("Sex", "Pclass", "Embarked", "Cabin"):
    counts = train.pivot_table(index="Survived", columns=feature,
                               values="Ticket", aggfunc="count")
    print(counts)

In [None]:
# Frequency of each distinct Cabin value.
train.Cabin.value_counts()

In [None]:
# Number of cabins listed per passenger: a missing Cabin counts as 0,
# otherwise the space-separated cabin codes are counted.
train["Multiple_cabins"] = train["Cabin"].map(
    lambda cabin: len(cabin.split(" ")) if pd.notna(cabin) else 0
)
train["Multiple_cabins"].value_counts()

In [None]:
#Define a function to calculate the number of cabins
def number_of_cabins(column):
    """Count the cabins listed in a single ``Cabin`` entry.

    Args:
        column: One cabin value: a string of whitespace-separated cabin codes
            (for example ``"C23 C25 C27"``) or a missing value (NaN / None).

    Returns:
        int: 0 when the value is missing, otherwise the number of cabin codes.
    """
    if pd.isna(column):
        return 0
    # split() without an argument ignores leading, trailing and repeated
    # whitespace, so stray spaces are not counted as extra cabins and an
    # empty string yields 0 instead of 1.
    return len(column.split())
        
# Run the helper over every Cabin entry and show how many passengers fall
# into each cabin-count bucket.
train["multiple_cabin"] = train.Cabin.map(number_of_cabins)
train["multiple_cabin"].value_counts()

In [None]:
# Passenger counts by survival outcome and number of cabins listed.
train.pivot_table(
    index="Survived",
    columns="Multiple_cabins",
    values="Ticket",
    aggfunc="count",
)

In [None]:
# Keep only the first character of each Cabin value. Every value is passed
# through str() first, so a missing cabin (NaN) ends up as "n", the first
# character of "nan".
train["Cabin_letters"] = train["Cabin"].map(lambda cabin: str(cabin)[0])
train["Cabin_letters"]

In [None]:
# Cross-tabulate survival against cabin letter. The table is printed
# explicitly: a notebook cell only displays its last expression, so without
# the print this result was computed and silently discarded.
print(pd.pivot_table(train, index="Survived", columns="Cabin_letters",
                     values="Name", aggfunc="count"))
train["Cabin_letters"].value_counts()

In [None]:
# Passenger counts by survival outcome and cabin letter, kept in a variable
# and displayed as the cell result.
pivot_table_2 = pd.pivot_table(
    train,
    values="Name",
    index="Survived",
    columns="Cabin_letters",
    aggfunc="count",
)
pivot_table_2

In [None]:
def _ticket_prefix(ticket):
    """Return the cleaned text before the ticket's last space-separated token, or 0 if there is none."""
    leading_parts = ticket.split(' ')[:-1]
    if not leading_parts:
        return 0
    prefix = ' '.join(leading_parts)
    return prefix.replace('.', '').replace('/', '').lower()


# 1 when the whole ticket string is numeric, 0 otherwise.
train["ticket_numerics"] = train["Ticket"].map(lambda ticket: int(ticket.isnumeric()))
# Lower-cased ticket prefix with "." and "/" removed; 0 when there is no prefix.
train["ticket_letters"] = train["Ticket"].map(_ticket_prefix)

In [None]:
# Show the complete list of ticket prefixes. The row limit is lifted only
# inside this block, so the global pandas display settings stay untouched for
# the rest of the notebook.
with pd.option_context("display.max_rows", None):
    print(train["ticket_letters"].value_counts())

In [None]:
# Passenger counts by survival outcome and numeric-ticket flag.
pd.pivot_table(
    train,
    values="Ticket",
    index="Survived",
    columns="ticket_numerics",
    aggfunc="count",
)

In [None]:
# Passenger counts by survival outcome and ticket prefix.
pd.pivot_table(
    train,
    values="Ticket",
    index="Survived",
    columns="ticket_letters",
    aggfunc="count",
)

In [None]:
# Peek at the raw names. Printed explicitly because a bare expression in the
# middle of a cell produces no output.
print(train["Name"].head(10))
# Title = the text after the first comma, cut at the next period and trimmed.
# Presumably names look like "Surname, Title. Given names" — verify against the data.
train["name_title"] = train.Name.apply(lambda column: column.split(",")[1].split(".")[0].strip())
train["name_title"].value_counts()

In [None]:
# Build the engineered features on the combined train+test frame.

# Cabins listed per passenger (0 when Cabin is missing).
data["Multiple_cabins"] = data["Cabin"].map(
    lambda cabin: len(cabin.split(" ")) if pd.notna(cabin) else 0
)
# First character of the stringified Cabin value.
data["Cabin_letters"] = data["Cabin"].map(lambda cabin: str(cabin)[0])
# 1 for an all-numeric ticket string, otherwise 0.
data["ticket_numerics"] = data["Ticket"].map(lambda ticket: int(ticket.isnumeric()))
# Lower-cased ticket prefix with "." and "/" removed; 0 when the ticket has no prefix.
data["ticket_letters"] = data["Ticket"].map(
    lambda ticket: (
        ' '.join(ticket.split(' ')[:-1]).replace('.', '').replace('/', '').lower()
        if ticket.split(' ')[:-1]
        else 0
    )
)
# Text after the first comma of the name, cut at the next period and trimmed.
data["name_title"] = data["Name"].map(lambda name: name.split(",")[1].split(".")[0].strip())


# Discard the rows whose Embarked value is missing.
data.dropna(subset=["Embarked"], inplace=True)

# Fill the remaining gaps: Age with the mean and Fare with the median, both
# computed over the combined train+test frame.
mean_age = data["Age"].mean()
median_fare = data["Fare"].median()
data["Age"] = data["Age"].fillna(mean_age)
data["Fare"] = data["Fare"].fillna(median_fare)

# Log-transform SibSp and Fare; adding 1 first keeps zero values finite.
data["norm_sibsp"] = np.log(data.SibSp+1)
data["norm_fare"] = np.log(data.Fare+1)

# Plot each transformed column on its own axes. Calling Series.hist() twice
# without an `ax` draws both histograms onto the same current axes, which made
# the two distributions overlap in a single unlabeled plot.
fig, (ax_sibsp, ax_fare) = plt.subplots(1, 2, figsize=(12, 4))
data["norm_sibsp"].hist(ax=ax_sibsp)
ax_sibsp.set_title("norm_sibsp")
data["norm_fare"].hist(ax=ax_fare)
ax_fare.set_title("norm_fare")
plt.show()

# Treat passenger class as a categorical label. astype() returns a new Series
# instead of converting in place, so the result must be assigned back;
# otherwise Pclass stays numeric and get_dummies() leaves it as one integer
# column instead of one indicator column per class.
data["Pclass"] = data["Pclass"].astype(str)
# One-hot encode the string-valued columns; numeric columns pass through unchanged.
data_dummies = pd.get_dummies(data[["Pclass", "Sex", "Age", "SibSp", "Parch", "norm_fare",
                "Embarked", "Cabin_letters", "Multiple_cabins", "ticket_numerics",
                "name_title", "train_test"]])

# Split the encoded rows back into training and test feature matrices using
# the origin flag, which is dropped from the features afterwards.
train_rows = data_dummies["train_test"] == 1
test_rows = data_dummies["train_test"] == 0
X_train = data_dummies[train_rows].drop(columns=["train_test"])
X_test = data_dummies[test_rows].drop(columns=["train_test"])

# Labels of the training rows.
y_train = data[data["train_test"] == 1]["Survived"]
y_train.shape

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns that get standardized.
scaled_columns = ["Age", "SibSp", "Parch", "norm_fare"]

scale = StandardScaler()

data_dummies_scaled = data_dummies.copy()
# Learn the mean and standard deviation from the training rows only, then
# apply them to every row. Fitting on the combined frame would let test-set
# statistics leak into the features the models are trained and validated on.
is_train_row = data_dummies_scaled.train_test == 1
scale.fit(data_dummies_scaled[is_train_row][scaled_columns])
data_dummies_scaled[scaled_columns] = scale.transform(
    data_dummies_scaled[scaled_columns])

# Split the scaled frame back into train / test matrices; the origin flag is
# not a feature and is dropped.
X_trained_scaled = data_dummies_scaled[
    data_dummies_scaled.train_test == 1].drop(['train_test'], axis =1)

X_test_scaled = data_dummies_scaled[
    data_dummies_scaled.train_test == 0].drop(['train_test'], axis =1)

# Labels of the training rows.
y_train = data[data.train_test==1].Survived


In [None]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

# Baseline candidates, using library defaults except for the arguments set here.
models = [
    LogisticRegression(max_iter = 2000),
    DecisionTreeClassifier(random_state = 42),
    RandomForestClassifier(random_state = 42),
    GradientBoostingClassifier(n_estimators=100, random_state =42),
    SVC(probability = True),
    KNeighborsClassifier(),
    GaussianNB(),
]

# 5-fold cross-validated accuracy of every model on both feature sets. The two
# means are labelled so the unscaled and scaled results can be told apart.
for model in models:
    scores = cross_val_score(model, X_train, y_train, cv = 5, scoring="accuracy")
    scores_scaled = cross_val_score(model, X_trained_scaled, y_train, cv = 5, scoring="accuracy")
    print(f"Model: {model.__class__.__name__}")
    print("Cross-Validation Scores (unscaled):", scores)
    print("Mean Accuracy (unscaled):", np.mean(scores))
    print("Mean Accuracy (scaled):", np.mean(scores_scaled))

In [None]:
from sklearn.ensemble import VotingClassifier

# Untuned member models of the baseline ensemble.
log = LogisticRegression(max_iter=2000)
D_trees = DecisionTreeClassifier(random_state=42)
r_forest = RandomForestClassifier(random_state=42)
g_boost = GradientBoostingClassifier(random_state=42)
knn = KNeighborsClassifier()
naive_bayes = GaussianNB()

# (name, estimator) pairs handed to the soft-voting ensemble.
baseline_estimators = [
    ("log", log),
    ("D_trees", D_trees),
    ("r_forest", r_forest),
    ("g_boost", g_boost),
    ("knn", knn),
    ("naives_bayes", naive_bayes),
]
v_clf = VotingClassifier(estimators=baseline_estimators, voting="soft")

# 5-fold cross-validation on the unscaled training features.
scores = cross_val_score(v_clf, X_train, y_train, cv=5)
print(scores)
print(scores.mean())

In [None]:
# Fit the baseline ensemble on the scaled training features, predict the test
# rows and write the predictions as a submission file.
v_clf.fit(X_trained_scaled, y_train)
y_hat_base_vc = v_clf.predict(X_test_scaled).astype(int)

basic_submission = dict(PassengerId=test.PassengerId, Survived=y_hat_base_vc)
base_submission = pd.DataFrame(basic_submission)
base_submission.to_csv('base_submission.csv', index=False)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Hyperparameter search spaces, keyed by the same names as `models` below.
params_grids = {
    "logistic_regression": {
        'max_iter': [2000],
        'penalty': ['l1', 'l2'],
        'C': np.logspace(-4, 4, 20),
        'solver': ['liblinear'],
    },

    "knn": {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree'],
        'p': [1, 2],
    },

    "random_forest": {
        'n_estimators': [100, 500, 1000],
        'bootstrap': [True, False],
        'max_depth': [3, 5, 10, 20, 50, 75, 100, None],
        'min_samples_leaf': [1, 2, 4, 10],
        'min_samples_split': [2, 5, 10],
    },

    "gradient_boosting": {
        'n_estimators': [50, 100, 200],
        'max_depth': [15, 20, 25],
    },
}

# Estimators to tune. The randomized estimators are seeded (42, as used for
# the baseline models in this notebook) so repeated runs select the same
# hyperparameters.
models = {
    'logistic_regression': LogisticRegression(random_state=42),
    'random_forest': RandomForestClassifier(random_state=42),
    'gradient_boosting': GradientBoostingClassifier(random_state=42),
    'knn': KNeighborsClassifier(),
}


#performing randomized and grid search  for every model
for model_name, model in models.items():
    print(f"Performing Grid Search for {model_name}...")
    grid_search = GridSearchCV(estimator = model, param_grid = params_grids[model_name], 
                              cv = 5, verbose = True, n_jobs = -1,
                               scoring = "accuracy").fit(X_trained_scaled, y_train)
    best_params_grid = grid_search.best_params_
    best_score_grid = grid_search.best_score_
    best_estimator_grid = grid_search.best_estimator_
    
    print(f"Hyperparameters (Grid Search) for {model_name}: {best_params_grid}")
    print(f"Best Score (Grid Search)  for {model_name}: {best_score_grid}\n")
    print(f"Best Estimator (Grid Search) for {model_name}: {best_estimator_grid}\n")
    
    
    print(f"Performing Randomized Search for {model_name}...")
    param_dist = params_grids[model_name]
    randomized_search = RandomizedSearchCV(estimator = model, param_distributions=param_dist, 
                                           n_iter = 100, cv = 5, verbose = True, 
                                           n_jobs = -1).fit(X_trained_scaled, y_train)
    
    best_params_randomized = randomized_search.best_params_
    best_score_randomized =  randomized_search.best_score_
    print(f"Best Hyperparameters (Randomized Search) for {model_name}: {best_params_randomized}")
    print(f"Best Score (Randomized Search) for {model_name}: {best_score_randomized}\n")
    

In [None]:
#simple performance reporting function
def clf_performance(classifier, model_name):
    print(model_name)
    print('Best Score: ' + str(classifier.best_score_))
    print('Best Parameters: ' + str(classifier.best_params_))

In [None]:
lr = LogisticRegression()
param_grid = {'max_iter' : [2000],
              'penalty' : ['l1', 'l2'],
              'C' : np.logspace(-4, 4, 20),
              'solver' : ['liblinear']}

clf_lr = GridSearchCV(lr, param_grid = param_grid, cv = 5, verbose = True, n_jobs = -1)
best_clf_lr = clf_lr.fit(X_trained_scaled,y_train)
clf_performance(best_clf_lr,'Logistic Regression')

In [None]:
knn = KNeighborsClassifier()
param_grid = {'n_neighbors' : [3,5,7,9],
              'weights' : ['uniform', 'distance'],
              'algorithm' : ['auto', 'ball_tree','kd_tree'],
              'p' : [1,2]}
clf_knn = GridSearchCV(knn, param_grid = param_grid, cv = 5, verbose = True, n_jobs = -1)
best_clf_knn = clf_knn.fit(X_trained_scaled,y_train)
clf_performance(best_clf_knn,'KNN')

In [None]:
rf = RandomForestClassifier(random_state = 1)
param_grid =  {'n_estimators': [400,450,500,550],
               'criterion':['gini','entropy'],
                                  'bootstrap': [True],
                                  'max_depth': [15, 20, 25],
                                  'min_samples_leaf': [2,3],
                                  'min_samples_split': [2,3]}
                                  
clf_rf = GridSearchCV(rf, param_grid = param_grid, cv = 5, verbose = True, n_jobs = -1)
best_clf_rf = clf_rf.fit(X_trained_scaled,y_train)
clf_performance(best_clf_rf,'Random Forest')

In [None]:
best_rf = best_clf_rf.best_estimator_.fit(X_trained_scaled,y_train)
feat_importances = pd.Series(best_rf.feature_importances_, index=X_trained_scaled.columns)
feat_importances.nlargest(20).plot(kind='barh')

In [None]:
best_estimators = {}

for model_name, model in models.items():
    best_estimators[f"{model_name}_GridSearch"] = grid_search.best_estimator_
    best_estimators[f'{model_name}_RandomizedSearch'] = randomized_search.best_estimator_
    
    print(f"Best Estimator{best_estimators}")
#doing now the voting  classifier hard and soft
voting_clf_hard = VotingClassifier(
    estimators=[(model_name, estimator) for model_name, estimator in best_estimators.items()],
    voting="hard"
)
voting_clf_soft = VotingClassifier(
    estimators=[(model_name, estimator) for model_name, estimator in best_estimators.items()],
    voting= "soft"
)

print('voting_clf_hard :',cross_val_score(voting_clf_hard,X_train,y_train,cv=5))
print('voting_clf_hard mean :',cross_val_score(voting_clf_hard,X_train,y_train,cv=5).mean())

print('voting_clf_soft :',cross_val_score(voting_clf_soft,X_train,y_train,cv=5))
print('voting_clf_soft mean :',cross_val_score(voting_clf_soft,X_train,y_train,cv=5).mean())

In [None]:
# Tuned estimators from the stand-alone grid searches.
best_lr = best_clf_lr.best_estimator_
best_knn = best_clf_knn.best_estimator_
best_rf = best_clf_rf.best_estimator_

# Three ensembles: KNN + RF with hard voting, KNN + RF with soft voting, and
# KNN + RF + logistic regression with soft voting.
voting_clf_hard = VotingClassifier(estimators = [
    ('knn',best_knn),('rf',best_rf)], voting = "hard")
voting_clf_soft = VotingClassifier(estimators = [
    ('knn',best_knn),('rf',best_rf)], voting = "soft")
voting_clf_all = VotingClassifier(estimators = [
    ('knn',best_knn),('rf',best_rf), ('lr', best_lr)], voting = 'soft')

# Cross-validate each ensemble once and reuse the fold scores for the mean
# (previously every ensemble was cross-validated twice). The scaled features
# are used because the member estimators were tuned on them.
for label, ensemble in (('voting_clf_hard', voting_clf_hard),
                        ('voting_clf_soft', voting_clf_soft),
                        ('voting_clf_all', voting_clf_all)):
    fold_scores = cross_val_score(ensemble, X_trained_scaled, y_train, cv=5)
    print(f'{label} :', fold_scores)
    print(f'{label} mean :', fold_scores.mean())



In [None]:
# Grid-search the voting weights of the soft-voting ensemble.
# Each candidate has two weights, so `voting_clf_soft` is presumably the
# two-estimator (KNN + RF) ensemble at this point — confirm cell order.
weight_options = [[1, 1], [1, 2], [2, 1]]
params = {'weights': weight_options}

vote_weight = GridSearchCV(
    estimator=voting_clf_soft,
    param_grid=params,
    cv=5,
    verbose=True,
    n_jobs=-1,
)
best_clf_weight = vote_weight.fit(X_trained_scaled, y_train)
clf_performance(best_clf_weight, 'VC Weights')

# Test-set predictions of the best-weighted ensemble.
voting_clf_sub = best_clf_weight.best_estimator_.predict(X_test_scaled)


In [None]:
# Fit every final model on the scaled training features.
for final_model in (voting_clf_hard, voting_clf_soft, voting_clf_all, best_rf):
    final_model.fit(X_trained_scaled, y_train)


def _predict_labels(fitted_model):
    """Predict the scaled test rows and return the labels as integers."""
    return fitted_model.predict(X_test_scaled).astype(int)


# Integer survival predictions of each model for the test set.
y_vc_hard = _predict_labels(voting_clf_hard)
y_rf = _predict_labels(best_rf)
y_soft = _predict_labels(voting_clf_soft)
y_vc_all = _predict_labels(voting_clf_all)

In [None]:
# Passenger ids shared by every submission frame.
passenger_ids = test.PassengerId

# One two-column frame (PassengerId, Survived) per model.
output_1 = dict(PassengerId=passenger_ids, Survived=y_rf)
submission = pd.DataFrame(output_1)

output_2 = dict(PassengerId=passenger_ids, Survived=y_vc_hard)
submission_1 = pd.DataFrame(output_2)

output_3 = dict(PassengerId=passenger_ids, Survived=y_soft)
submission_2 = pd.DataFrame(output_3)

output_4 = dict(PassengerId=passenger_ids, Survived=y_vc_all)
submission_3 = pd.DataFrame(output_4)

# Side-by-side view of all four models' predictions.
final_data_comp = dict(
    PassengerId=passenger_ids,
    Survived_vc_hard=y_vc_hard,
    Survived_rf=y_rf,
    Survived_vc_soft=y_soft,
    Survived_vc_all=y_vc_all,
)

comparison = pd.DataFrame(final_data_comp)

In [None]:
# Write one CSV per model, without the row index.
submission_files = {
    'submission_rf.csv': submission,
    'submission_vc_hard.csv': submission_1,
    'submission_vc_soft.csv': submission_2,
    'submission_vc_all.csv': submission_3,
}
for file_name, frame in submission_files.items():
    frame.to_csv(file_name, index=False)

print("Your submission was successfully saved!")