In [1]:
### Import packages
# linear algebra
import numpy as np 
# data processing
import pandas as pd 
from sklearn.impute import SimpleImputer
# data visualization
import seaborn as sns
%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import style
# Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV

In [2]:
df=pd.read_csv("train_df.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    891 non-null    int64  
 1   Pclass      891 non-null    int64  
 2   Age         891 non-null    float64
 3   SibSp       891 non-null    int64  
 4   Parch       891 non-null    int64  
 5   Fare        891 non-null    float64
 6   not_alone   891 non-null    int64  
 7   Sex_female  891 non-null    int64  
 8   Embarked_Q  891 non-null    int64  
 9   Embarked_S  891 non-null    int64  
dtypes: float64(2), int64(8)
memory usage: 69.7 KB


In [3]:
### Split the data - By default, train test split splits the data into 75% training data and 25% test data, which is a good rule of thumb (##or test_size= )
# features
from sklearn.model_selection import train_test_split
def split_set(df, size):
    """
    df: Dataframe to split
    size: proportion to data to allocate to validation set (same as train_test_split's test_size)
    """
    return train_test_split(df[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'not_alone','Sex_female','Embarked_Q','Embarked_S']], df['Survived'], test_size=size)

X_train, X_test, y_train, y_test = split_set(df,.3)
X_train.shape, X_test.shape

((623, 9), (268, 9))

In [4]:
def predict_kNN(dataset, min_neighbors= 1, max_neighbors = 15, cross_val_folds=5):
    """
    k-NN algoritm and finding the best parameters for it
    """
    param_grid = {'n_neighbors': np.arange(min_neighbors, max_neighbors)}
    knn = KNeighborsClassifier()
    knn_cv = GridSearchCV(knn, param_grid, cv=cross_val_folds)
    knn_cv.fit(X_train, y_train)
    y_pred = knn_cv.predict(X_test)
    print(knn_cv.best_params_)
    print(knn_cv.best_score_)
    print(classification_report(y_test,y_pred))
    knn_classreport= classification_report(y_test,y_pred)
    return knn_classreport

In [7]:
predict_kNN(df, 1, 15, 5)

{'n_neighbors': 5}
0.6854838709677419
              precision    recall  f1-score   support

           0       0.79      0.78      0.78       169
           1       0.63      0.66      0.64        99

    accuracy                           0.73       268
   macro avg       0.71      0.72      0.71       268
weighted avg       0.73      0.73      0.73       268



'              precision    recall  f1-score   support\n\n           0       0.79      0.78      0.78       169\n           1       0.63      0.66      0.64        99\n\n    accuracy                           0.73       268\n   macro avg       0.71      0.72      0.71       268\nweighted avg       0.73      0.73      0.73       268\n'

In [9]:
def predict_logreg(dataset):
    """Logistic Regression algorithm"""
    logreg=LogisticRegression()
    logreg.fit(X_train, y_train)
    y_pred=logreg.predict(X_test)
    print(classification_report(y_test,y_pred))
    logreg_classresport=classification_report(y_test,y_pred)
    return log_classreport

In [None]:
predict_logreg(df)

In [13]:
### RANDOM FOREST - GridSearch
def predict_RF(dataset):
    """Random Forest  - finding the best hyperparameters"""
    rf=RandomForestClassifier()
    params_rf={'criterion':['gini','entropy'],'n_estimators':[100,200], 'max_depth':[3,4,5], 'min_samples_leaf':[1,2,3,5],'min_samples_split':[2,3,4,5], 'max_features':['auto','log2']}
    grid_rf=GridSearchCV(estimator=rf, param_grid=params_rf,cv=10, n_jobs=-1)
    grid_rf.fit(X_train, y_train)
    best_hyper_rf=grid_rf.best_params_
    print('Best hyperparameters\n', best_hyper_rf)
    print('Best Score: %s' % grid_rf.best_score_)
    return best_hyper_rf

In [14]:
predict_RF(df)

Best hyperparameters
 {'criterion': 'entropy', 'max_depth': 3, 'max_features': 'auto', 'min_samples_leaf': 3, 'min_samples_split': 5, 'n_estimators': 100}
Best Score: 0.8329493087557603


{'criterion': 'entropy',
 'max_depth': 3,
 'max_features': 'auto',
 'min_samples_leaf': 3,
 'min_samples_split': 5,
 'n_estimators': 100}

In [16]:
### RandomForest-Bestmodel
def bestRF(dataset):
    """Running Random Forest algorithm with the best hyperparameters"""
    rf_bestmodel=RandomForestClassifier(criterion='entropy', max_depth=3, max_features='auto', min_samples_leaf=3, min_samples_split=5, n_estimators=100, class_weight='balanced')
    rf_bestmodel.fit(X_train,y_train)
    y_bestpred=rf_bestmodel.predict(X_test)
    rf_classreport=classification_report(y_test,y_bestpred)
    return rf_classreport

In [18]:
print(bestRF(df))

              precision    recall  f1-score   support

           0       0.83      0.81      0.82       169
           1       0.69      0.72      0.70        99

    accuracy                           0.78       268
   macro avg       0.76      0.76      0.76       268
weighted avg       0.78      0.78      0.78       268

