**MultiClassModels**

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score


from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [5]:
# Read the Titanic CSV into a DataFrame and preview its first rows
titanic_df = pd.read_csv(filepath_or_buffer="./datasets/titanic_mine.csv")
titanic_df.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,1,3,0,18.0,0,0,9.8417,0,0,1
1,0,3,1,26.0,0,0,7.775,0,0,1
2,1,3,0,4.0,0,2,22.025,0,0,1
3,0,3,1,21.0,0,0,16.1,0,0,1
4,0,1,1,58.0,0,2,113.275,1,0,0


In [6]:
# Every column after the first is used as a feature
# (assumes the target column is the first one in the file -- TODO confirm)
FEATURES = titanic_df.columns[1:].tolist()
FEATURES

['Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked_C',
 'Embarked_Q',
 'Embarked_S']

In [7]:
# Maps a model label to the train/test scores returned by build_model
result_dict = dict()


# Helper that returns a dict of metric scores for a set of predictions
def summarize_classification(y_test, y_pred):
    """Score predictions against the actual target values.

    Parameters
    ----------
    y_test : array-like
        Actual target values.
    y_pred : array-like
        Predicted target values, passed to the metric functions alongside
        ``y_test``.

    Returns
    -------
    dict
        ``"accuracy"`` (fraction of correct predictions), ``"precision"``,
        ``"recall"`` and ``"accuracy_count"`` (number of correct predictions).
        Precision and recall are computed with the metric functions' default
        settings.
    """
    # normalize=True gives the fraction correct; normalize=False gives the count
    fraction_correct = accuracy_score(y_test, y_pred, normalize=True)
    count_correct = accuracy_score(y_test, y_pred, normalize=False)

    return {
        "accuracy": fraction_correct,
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "accuracy_count": count_correct,
    }


# Helper that splits the data, trains a model through classifier_fn and scores it
def build_model(classifier_fn, name_of_y_col, name_of_x_cols, dataset, test_frac=0.1,
                random_state=None):
    """Train a classifier on a random split of ``dataset`` and evaluate it.

    Parameters
    ----------
    classifier_fn : callable
        Called as ``classifier_fn(x_train, y_train)``; must return a fitted
        model exposing ``predict``.
    name_of_y_col : str
        Name of the target column in ``dataset``.
    name_of_x_cols : list of str
        Names of the feature columns in ``dataset``.
    dataset : pandas.DataFrame
        Data containing both the feature and target columns.
    test_frac : float, default 0.1
        Passed to ``train_test_split`` as ``test_size``.
    random_state : int or None, default None
        Passed to ``train_test_split``. With the default ``None`` the split
        differs on every call (the original behaviour); pass the same integer
        for every model to score them on an identical, reproducible split.

    Returns
    -------
    dict
        ``"training"`` and ``"test"`` hold the metric dicts produced by
        ``summarize_classification``; ``"confusion_matrix"`` is a crosstab of
        predicted (rows) versus actual (columns) test labels.
    """
    features = dataset[name_of_x_cols]
    target = dataset[name_of_y_col]

    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=test_frac, random_state=random_state
    )
    model = classifier_fn(x_train, y_train)

    y_pred = model.predict(x_test)
    y_pred_train = model.predict(x_train)

    train_summary = summarize_classification(y_train, y_pred_train)
    test_summary = summarize_classification(y_test, y_pred)

    # Pair each actual test label with its prediction to build the crosstab
    pred_results = pd.DataFrame({"y_test": y_test,
                                 "y_pred": y_pred})

    model_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)
    return {
        "training": train_summary,
        "test": test_summary,
        "confusion_matrix": model_crosstab
    }
    

def compare_results():
    """Print the training and test scores of every model stored in ``result_dict``.

    For each entry the model label is printed first, followed by a
    "Training data" section and a "Test data" section listing each metric
    name with its value, one per line.
    """
    for model_name, summary in result_dict.items():
        print("Classification: ", model_name)

        # Same layout for both sections: blank line, heading, then the metrics
        for heading, section in (("Training data", "training"), ("Test data", "test")):
            print()
            print(heading)
            for metric, value in summary[section].items():
                print(metric, value)

        print()

In [8]:
# Logistic regression classifier
def logistic_fn(x_train, y_train):
    """Fit a ``LogisticRegression`` (``liblinear`` solver) on the training data.

    Returns the fitted model.
    """
    classifier = LogisticRegression(solver="liblinear")
    return classifier.fit(x_train, y_train)

In [9]:
# NOTE(review): this cell keeps all of FEATURES, whereas the other model cells
# drop the last one-hot column -- confirm this is intentional.
result_dict["survived - logistic"] = build_model(
    classifier_fn=logistic_fn,
    name_of_y_col="Survived",
    name_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()

Classification:  survived - logistic

Training data
accuracy 0.7921875
precision 0.7777777777777778
recall 0.6920152091254753
accuracy_count 507

Test data
accuracy 0.8194444444444444
precision 0.7142857142857143
recall 0.8
accuracy_count 59



In [10]:
def linear_discriminant_fn(x_train, y_train, solver = "svd"):
    """Fit a ``LinearDiscriminantAnalysis`` model with the given ``solver``.

    Returns the fitted model.
    """
    lda = LinearDiscriminantAnalysis(solver=solver)
    return lda.fit(x_train, y_train)

In [12]:
result_dict["survived - linear_discriminant"] = build_model(
    classifier_fn=linear_discriminant_fn,
    name_of_y_col="Survived",
    # drop the last one-hot encoded column to avoid the dummy variable trap
    name_of_x_cols=FEATURES[:-1],
    dataset=titanic_df,
)

compare_results()

Classification:  survived - logistic

Training data
accuracy 0.7921875
precision 0.7777777777777778
recall 0.6920152091254753
accuracy_count 507

Test data
accuracy 0.8194444444444444
precision 0.7142857142857143
recall 0.8
accuracy_count 59

Classification:  survived - linear_discriminant

Training data
accuracy 0.7984375
precision 0.7586206896551724
recall 0.7068273092369478
accuracy_count 511

Test data
accuracy 0.7916666666666666
precision 0.9
recall 0.6923076923076923
accuracy_count 57



In [13]:
# Quadratic discriminant analysis
# Intended for the case where the covariance of X differs between the Y classes
def quadratic_discriminant_fn(x_train, y_train):
    """Fit a ``QuadraticDiscriminantAnalysis`` model with default settings.

    Returns the fitted model.
    """
    qda = QuadraticDiscriminantAnalysis()
    return qda.fit(x_train, y_train)

In [15]:
result_dict["survived - quadratic_discriminant"] = build_model(
    classifier_fn=quadratic_discriminant_fn,
    name_of_y_col="Survived",
    # drop the last one-hot encoded column to avoid the dummy variable trap
    name_of_x_cols=FEATURES[:-1],
    dataset=titanic_df,
)

compare_results()

Classification:  survived - logistic

Training data
accuracy 0.7921875
precision 0.7777777777777778
recall 0.6920152091254753
accuracy_count 507

Test data
accuracy 0.8194444444444444
precision 0.7142857142857143
recall 0.8
accuracy_count 59

Classification:  survived - linear_discriminant

Training data
accuracy 0.7984375
precision 0.7586206896551724
recall 0.7068273092369478
accuracy_count 511

Test data
accuracy 0.7916666666666666
precision 0.9
recall 0.6923076923076923
accuracy_count 57

Classification:  survived - quadratic_discriminant

Training data
accuracy 0.7921875
precision 0.7735042735042735
recall 0.6934865900383141
accuracy_count 507

Test data
accuracy 0.8611111111111112
precision 0.8148148148148148
recall 0.8148148148148148
accuracy_count 62



In [26]:
def sgd_fn(x_train, y_train, max_iter = 10000, tol=1e-3, random_state=None):
    """Fit an ``SGDClassifier`` on the training data and return it.

    Parameters
    ----------
    x_train, y_train
        Training features and target, passed to ``fit``.
    max_iter : int, default 10000
        Forwarded to ``SGDClassifier``.
    tol : float, default 1e-3
        Forwarded to ``SGDClassifier``.
    random_state : int or None, default None
        Forwarded to ``SGDClassifier``. The default ``None`` keeps the
        original unseeded behaviour; pass an integer to make the fitted
        model reproducible across runs.
    """
    model = SGDClassifier(max_iter=max_iter, tol=tol, random_state=random_state)
    model.fit(x_train, y_train)
    return model

In [27]:
result_dict["survived - SGD"] = build_model(
    classifier_fn=sgd_fn,
    name_of_y_col="Survived",
    # drop the last one-hot encoded column to avoid the dummy variable trap
    name_of_x_cols=FEATURES[:-1],
    dataset=titanic_df,
)

compare_results()

Classification:  survived - logistic

Training data
accuracy 0.7921875
precision 0.7777777777777778
recall 0.6920152091254753
accuracy_count 507

Test data
accuracy 0.8194444444444444
precision 0.7142857142857143
recall 0.8
accuracy_count 59

Classification:  survived - linear_discriminant

Training data
accuracy 0.7984375
precision 0.7586206896551724
recall 0.7068273092369478
accuracy_count 511

Test data
accuracy 0.7916666666666666
precision 0.9
recall 0.6923076923076923
accuracy_count 57

Classification:  survived - quadratic_discriminant

Training data
accuracy 0.7921875
precision 0.7735042735042735
recall 0.6934865900383141
accuracy_count 507

Test data
accuracy 0.8611111111111112
precision 0.8148148148148148
recall 0.8148148148148148
accuracy_count 62

Classification:  survived - SGD

Training data
accuracy 0.734375
precision 0.6282420749279539
recall 0.8416988416988417
accuracy_count 470

Test data
accuracy 0.7777777777777778
precision 0.696969696969697
recall 0.7931034482758621

In [30]:
def linear_svc_fn(x_train, y_train, C=1.0, max_iter = 1000, tol=1e-3):
    """Fit a ``LinearSVC`` (``dual=False``) on the training data and return it.

    ``C``, ``max_iter`` and ``tol`` are forwarded to ``LinearSVC`` unchanged.
    """
    svc_settings = {"C": C, "max_iter": max_iter, "tol": tol, "dual": False}
    return LinearSVC(**svc_settings).fit(x_train, y_train)

In [31]:
result_dict["survived - linear_svc"] = build_model(
    classifier_fn=linear_svc_fn,
    name_of_y_col="Survived",
    # drop the last one-hot encoded column to avoid the dummy variable trap
    name_of_x_cols=FEATURES[:-1],
    dataset=titanic_df,
)

compare_results()

Classification:  survived - logistic

Training data
accuracy 0.7921875
precision 0.7777777777777778
recall 0.6920152091254753
accuracy_count 507

Test data
accuracy 0.8194444444444444
precision 0.7142857142857143
recall 0.8
accuracy_count 59

Classification:  survived - linear_discriminant

Training data
accuracy 0.7984375
precision 0.7586206896551724
recall 0.7068273092369478
accuracy_count 511

Test data
accuracy 0.7916666666666666
precision 0.9
recall 0.6923076923076923
accuracy_count 57

Classification:  survived - quadratic_discriminant

Training data
accuracy 0.7921875
precision 0.7735042735042735
recall 0.6934865900383141
accuracy_count 507

Test data
accuracy 0.8611111111111112
precision 0.8148148148148148
recall 0.8148148148148148
accuracy_count 62

Classification:  survived - SGD

Training data
accuracy 0.734375
precision 0.6282420749279539
recall 0.8416988416988417
accuracy_count 470

Test data
accuracy 0.7777777777777778
precision 0.696969696969697
recall 0.7931034482758621

In [34]:
# Radius neighbours classifier
def radius_neighbor_fn(x_train, y_train, radius = 30.0):
    """Fit a ``RadiusNeighborsClassifier`` with the given ``radius`` and return it.

    NOTE(review): no outlier label is configured here; check how scikit-learn
    handles prediction for samples with no training neighbour within
    ``radius`` -- TODO confirm against the RadiusNeighborsClassifier docs.
    """
    neighbours_clf = RadiusNeighborsClassifier(radius=radius)
    return neighbours_clf.fit(x_train, y_train)

In [35]:
result_dict["survived - radius_neighbors"] = build_model(
    classifier_fn=radius_neighbor_fn,
    name_of_y_col="Survived",
    # drop the last one-hot encoded column to avoid the dummy variable trap
    name_of_x_cols=FEATURES[:-1],
    dataset=titanic_df,
)

compare_results()

Classification:  survived - logistic

Training data
accuracy 0.7921875
precision 0.7777777777777778
recall 0.6920152091254753
accuracy_count 507

Test data
accuracy 0.8194444444444444
precision 0.7142857142857143
recall 0.8
accuracy_count 59

Classification:  survived - linear_discriminant

Training data
accuracy 0.7984375
precision 0.7586206896551724
recall 0.7068273092369478
accuracy_count 511

Test data
accuracy 0.7916666666666666
precision 0.9
recall 0.6923076923076923
accuracy_count 57

Classification:  survived - quadratic_discriminant

Training data
accuracy 0.7921875
precision 0.7735042735042735
recall 0.6934865900383141
accuracy_count 507

Test data
accuracy 0.8611111111111112
precision 0.8148148148148148
recall 0.8148148148148148
accuracy_count 62

Classification:  survived - SGD

Training data
accuracy 0.734375
precision 0.6282420749279539
recall 0.8416988416988417
accuracy_count 470

Test data
accuracy 0.7777777777777778
precision 0.696969696969697
recall 0.7931034482758621

In [40]:
def decision_tree_fn(x_train, y_train, max_depth=3, max_features=None):
    """Fit a ``DecisionTreeClassifier`` on the training data and return it.

    ``max_depth`` and ``max_features`` are forwarded to the classifier unchanged.
    """
    tree = DecisionTreeClassifier(max_depth=max_depth, max_features=max_features)
    return tree.fit(x_train, y_train)

In [41]:
result_dict["survived - decision tree"] = build_model(
    classifier_fn=decision_tree_fn,
    name_of_y_col="Survived",
    # drop the last one-hot encoded column to avoid the dummy variable trap
    name_of_x_cols=FEATURES[:-1],
    dataset=titanic_df,
)

compare_results()

Classification:  survived - logistic

Training data
accuracy 0.7921875
precision 0.7777777777777778
recall 0.6920152091254753
accuracy_count 507

Test data
accuracy 0.8194444444444444
precision 0.7142857142857143
recall 0.8
accuracy_count 59

Classification:  survived - linear_discriminant

Training data
accuracy 0.7984375
precision 0.7586206896551724
recall 0.7068273092369478
accuracy_count 511

Test data
accuracy 0.7916666666666666
precision 0.9
recall 0.6923076923076923
accuracy_count 57

Classification:  survived - quadratic_discriminant

Training data
accuracy 0.7921875
precision 0.7735042735042735
recall 0.6934865900383141
accuracy_count 507

Test data
accuracy 0.8611111111111112
precision 0.8148148148148148
recall 0.8148148148148148
accuracy_count 62

Classification:  survived - SGD

Training data
accuracy 0.734375
precision 0.6282420749279539
recall 0.8416988416988417
accuracy_count 470

Test data
accuracy 0.7777777777777778
precision 0.696969696969697
recall 0.7931034482758621

In [42]:
# Gaussian naive Bayes classifier
def naive_bayes_fn(x_train, y_train, priors = None):
    """Fit a ``GaussianNB`` model on the training data and return it.

    ``priors`` is forwarded to ``GaussianNB`` unchanged.
    """
    bayes_clf = GaussianNB(priors=priors)
    return bayes_clf.fit(x_train, y_train)

In [43]:
result_dict["survived - naive bayes"] = build_model(
    classifier_fn=naive_bayes_fn,
    name_of_y_col="Survived",
    # drop the last one-hot encoded column to avoid the dummy variable trap
    name_of_x_cols=FEATURES[:-1],
    dataset=titanic_df,
)

compare_results()

Classification:  survived - logistic

Training data
accuracy 0.7921875
precision 0.7777777777777778
recall 0.6920152091254753
accuracy_count 507

Test data
accuracy 0.8194444444444444
precision 0.7142857142857143
recall 0.8
accuracy_count 59

Classification:  survived - linear_discriminant

Training data
accuracy 0.7984375
precision 0.7586206896551724
recall 0.7068273092369478
accuracy_count 511

Test data
accuracy 0.7916666666666666
precision 0.9
recall 0.6923076923076923
accuracy_count 57

Classification:  survived - quadratic_discriminant

Training data
accuracy 0.7921875
precision 0.7735042735042735
recall 0.6934865900383141
accuracy_count 507

Test data
accuracy 0.8611111111111112
precision 0.8148148148148148
recall 0.8148148148148148
accuracy_count 62

Classification:  survived - SGD

Training data
accuracy 0.734375
precision 0.6282420749279539
recall 0.8416988416988417
accuracy_count 470

Test data
accuracy 0.7777777777777778
precision 0.696969696969697
recall 0.7931034482758621