In [1]:
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
import config
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the preprocessed train/test tables from the paths defined in `config`.
train = pd.read_csv(config.preprocessed_train)
test = pd.read_csv(config.preprocessed_test)

# Dependent variable: the "Survived" column of the training table.
y_train = train["Survived"]
# Independent variables: every remaining training column.
X_train = train.drop(columns=["Survived"])

# From the test table only the "PassengerId" column is removed.
X_test = test.drop(columns=["PassengerId"])

# Standardise the features. The scaler learns its statistics from the training
# features only and is then applied unchanged to the test features.
# NOTE(review): the scaler is fitted on *all* training rows before the
# cross-validation further down, so each CV fold is scaled with statistics that
# include its own validation rows. Consider wrapping scaler + estimator in a
# sklearn Pipeline if strictly leak-free CV scores are needed.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

    #making a dict of classifiers
# Fixed seed so that the model comparison below gives the same scores on every
# "Restart Kernel -> Run All". Without it the randomised estimators (decision
# tree, random forest, MLP, ...) produce slightly different rankings per run.
RANDOM_STATE = 42

# Candidate models, keyed by the short label used in the results table.
# Every estimator that accepts `random_state` is seeded; KNeighborsClassifier
# takes no such parameter and is deterministic for a fixed training set.
classifiers = {
    "KNN": KNeighborsClassifier(),
    # max_iter raised above the library default, as in the original setup.
    "LR": LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
    "DT": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "RF": RandomForestClassifier(random_state=RANDOM_STATE),
    "SVM": SVC(random_state=RANDOM_STATE),
    "MLP": MLPClassifier(max_iter=1000, random_state=RANDOM_STATE),
    "XGB": XGBClassifier(random_state=RANDOM_STATE),
    "LGBM": LGBMClassifier(random_state=RANDOM_STATE),
}

# Score every candidate classifier with 10-fold cross-validation and collect
# the mean accuracy and mean F1 score per model.
#
# The rows are gathered in a plain list and turned into a DataFrame once:
# `DataFrame.append` was removed in pandas 2.0 (so the previous row-by-row
# version raises AttributeError there) and was quadratic in the number of rows.
records = []
for name, clf in classifiers.items():
    cv_results = cross_validate(
        clf, X_train, y_train, cv=10,
        scoring=["accuracy", "f1"],
    )
    records.append({
        "Classifier": name,
        "Avg_Accuracy": cv_results["test_accuracy"].mean(),
        "Avg_F1_Score": cv_results["test_f1"].mean(),
    })

# Explicit `columns` keeps the column order (and a well-formed empty frame if
# `classifiers` is empty). The default RangeIndex matches the positional index
# the old `ignore_index=True` appends produced.
results = pd.DataFrame(records, columns=["Classifier", "Avg_Accuracy", "Avg_F1_Score"])

# Single ranking criterion: the unweighted mean of the two averaged metrics.
results["Avg_Overall"] = (results["Avg_Accuracy"] + results["Avg_F1_Score"]) / 2
results = results.sort_values("Avg_Overall", ascending=False)
results

















Unnamed: 0,Classifier,Avg_Accuracy,Avg_F1_Score,Avg_Overall
7,LGBM,0.824981,0.761969,0.793475
3,RF,0.816005,0.755961,0.785983
0,KNN,0.814881,0.745699,0.78029
6,XGB,0.812597,0.747263,0.77993
5,MLP,0.81372,0.738598,0.776159
4,SVM,0.817079,0.731078,0.774078
1,LR,0.801373,0.727539,0.764456
2,DT,0.784582,0.713206,0.748894


The previous cell shows that LightGBM is the best-performing algorithm: it has the highest average accuracy, F1 score, and overall score.
The next step is to tune its hyperparameters.

In [None]:
from sklearn.model_selection import GridSearchCV

# Base LightGBM estimator whose hyperparameters are tuned below.
lgb = LGBMClassifier()

# Search space: the grid search evaluates every combination of these values.
parameters = {
    "num_leaves": [20, 40, 60, 80, 100],
    "min_child_samples": [5, 10, 15],
    "max_depth": [-1, 5, 10, 20],
    "learning_rate": [0.05, 0.1, 0.2],
    "reg_alpha": [0, 0.01, 0.03],
}

# Exhaustive, cross-validated search that ranks candidates by accuracy
# (GridSearchCV's default CV splitting is used, as no `cv` is passed).
clf = GridSearchCV(estimator=lgb, param_grid=parameters, scoring="accuracy")
clf.fit(X_train, y_train)
print(clf.best_params_)

# Predict the test-set labels with the best parameter combination found
# (GridSearchCV refits the best estimator on the full training data by default).
predicted = clf.predict(X_test)

