# In this notebook we convert the categorical data into continuous (numeric) data, which should improve model performance.

In [None]:
import numpy as np
import pandas as pd
from Binary_Logistic_Regression import BinaryLogisticRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

In [None]:
# Load the mushroom dataset from 'mushrooms.csv' (path is relative to the
# working directory). Later cells read a 'class' column holding 'p'/'e' labels.
data = pd.read_csv('mushrooms.csv')

In [None]:
# Bare expression: shows the loaded DataFrame as the notebook cell output.
data

In [None]:
def one_hot_encoding(df):
    """Return a one-hot encoded copy of every column in ``df``.

    Each column is expanded into one 0.0/1.0 indicator column per distinct
    value, named ``"<column>_<value>"``. Values are numbered in order of first
    appearance, and the groups of indicator columns follow the column order
    of ``df``. Missing values (NaN/None) get their own indicator column,
    suffixed ``nan`` and placed last within the column's group.

    The input frame is never modified. The result uses a fresh 0..n-1 row
    index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are all treated as categorical.

    Returns
    -------
    pandas.DataFrame
        Float indicator matrix with one row per row of ``df``.
    """
    encoded_blocks = []
    column_name = []
    for column in df.columns:
        # factorize numbers the values by first appearance without touching df.
        codes, categories = pd.factorize(df[column])
        labels = list(categories)
        if (codes == -1).any():
            # factorize marks missing values with -1, which would silently
            # select the *last* identity row; give them a dedicated slot.
            codes = np.where(codes == -1, len(labels), codes)
            labels.append(np.nan)
        # Row i of the identity matrix is the one-hot vector for code i.
        encoded_blocks.append(np.eye(len(labels))[codes])
        column_name.extend(f"{column}_{label}" for label in labels)

    if not encoded_blocks:
        # np.concatenate rejects an empty list; a frame without columns
        # encodes to a frame without columns (same number of rows).
        return pd.DataFrame(index=range(len(df)))
    return pd.DataFrame(np.concatenate(encoded_blocks, axis=1), columns=column_name)

In [None]:
# One-hot encode every column except the first (iloc[:, 1:]).
# NOTE(review): the first column is presumably the 'class' label - confirm.
new_data = one_hot_encoding(data.iloc[:,1:])

In [None]:
# Standardize each column: subtract its mean and divide by its standard
# deviation. A constant column has std 0, so it becomes all-NaN (0/0) here.
new_data = (new_data - new_data.mean(axis=0))/new_data.std(axis=0)

In [None]:
# Drop, in place, every column that contains at least one NaN.
new_data.dropna(axis=1, inplace=True)

In [None]:
# Bare expression: shows the encoded, standardized features as the cell output.
new_data

In [None]:
# Encode the target column: 'p' -> 1, 'e' -> 0.
# The result is assigned back instead of using a chained `inplace=True` call:
# under pandas Copy-on-Write, `data['class'].replace(..., inplace=True)` only
# changes a temporary object and leaves `data` untouched. Re-running the cell
# is harmless because values that are already 1/0 are not affected.
data['class'] = data['class'].replace(to_replace=['p', 'e'], value=[1, 0])

In [None]:
class BinaryLR_Mushroom(BinaryLogisticRegression):
    """Binary logistic regression wired to PCA-reduced data and fixed splits.

    On construction the features are projected with PCA, the labels are
    attached as a trailing ``'class'`` column, and the rows are divided into
    three frames stored on the instance:

    * ``train`` - the first ``int(0.7 * n_rows) // 2`` rows of each class
      (fewer if a class has fewer rows than that),
    * ``cv`` and ``test`` - a random 50/50 split of all remaining rows.

    ``data_after_pca`` keeps the full projected frame. In every frame the
    last column is ``'class'`` and the preceding columns are the PCA
    components.
    """

    def __init__(self, X, Y, n_comp, LR=10**(-6), ET=10**(-5)):
        """Project ``X``, build the splits and initialise the base model.

        Parameters
        ----------
        X : array-like with ``.shape``
            Feature matrix, one row per sample.
        Y : array-like
            Labels (1 or 0), one per row of ``X`` in the same order.
        n_comp : int
            Number of PCA components to keep.
        LR : float
            Forwarded to the base class as ``LearningRate``.
        ET : float
            Forwarded to the base class as ``ErrorTolerance``.
        """
        pca = PCA(n_components=n_comp)
        self.data_after_pca = pd.DataFrame(pca.fit_transform(X))

        # The PCA output carries a fresh 0..n-1 index, so labels must be
        # attached by position; assigning a Series would align on its index
        # and mislabel rows (or insert NaN) when that index is not 0..n-1.
        self.data_after_pca['class'] = np.asarray(Y)

        p = self.data_after_pca[self.data_after_pca['class'] == 1]
        e = self.data_after_pca[self.data_after_pca['class'] == 0]

        # Same number of rows is requested from each class for training.
        per_class = int(X.shape[0] * 0.7) // 2

        self.train = pd.concat([p.iloc[:per_class, :], e.iloc[:per_class, :]], axis=0)
        remaining = pd.concat([p.iloc[per_class:, :], e.iloc[per_class:, :]])

        # Split whole rows (features + 'class') in one call; this keeps each
        # label with its features without writing into a slice afterwards.
        self.cv, self.test = train_test_split(remaining, test_size=1/2)

        # NOTE(review): the meaning of the leading positional 1 is defined by
        # BinaryLogisticRegression, which is not visible here - confirm.
        super().__init__(1, LearningRate=LR, ErrorTolerance=ET)

In [None]:
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0 when the denominator is 0."""
    return numerator / denominator if denominator else 0


def evaluate(predicted, actual):
    """Compute accuracy, precision, recall and F1 for binary 0/1 labels.

    Label 1 is the positive class. Both inputs may be lists, NumPy arrays or
    pandas Series; they are flattened and compared position by position, so
    a Series' index is ignored.

    Parameters
    ----------
    predicted : array-like
        Predicted labels.
    actual : array-like
        Ground-truth labels, in the same order as ``predicted``.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1_score)``. Any metric whose
        denominator is zero is reported as 0.
    """
    # Without this coercion a plain list compares as a whole (`[1, 0] == 1`
    # is False) and every count silently comes out as zero.
    predicted = np.asarray(predicted).ravel()
    actual = np.asarray(actual).ravel()

    predicted_pos = predicted == 1
    predicted_neg = predicted == 0
    actual_pos = actual == 1
    actual_neg = actual == 0

    TP = np.count_nonzero(predicted_pos & actual_pos)
    TN = np.count_nonzero(predicted_neg & actual_neg)
    FP = np.count_nonzero(predicted_pos & actual_neg)
    FN = np.count_nonzero(predicted_neg & actual_pos)

    accuracy = _safe_ratio(TP + TN, TP + TN + FP + FN)
    precision = _safe_ratio(TP, TP + FP)
    recall = _safe_ratio(TP, TP + FN)
    f1_score = _safe_ratio(2 * precision * recall, precision + recall)

    return (accuracy, precision, recall, f1_score)

In [None]:
# Results maps each (n_comp, LR, ET) combination to its metric tuple.
Results = {}

# Running record of the best combination seen so far: the four metrics start
# at 0, with placeholder values for the model object and its parameters.
best = dict.fromkeys(('Accuracy', 'Precision', 'Recall', 'F1 Score'), 0)
best.update({'Best Obj': 'obj', 'Parameter': ()})

# Trying Different Combinations of Hyperparameters

In [None]:
# Grid search over the number of PCA components and the base-10 exponents of
# the learning rate (LR) and error tolerance (ET). Each model is fitted on its
# training split and scored on its validation split; the model with the
# highest accuracy + precision + recall is kept in `best`.
for n_comp in range(2, 36):
    for LR in range(5, 11):
        for ET in range(5, 11):
            params = (n_comp, LR, ET)

            obj = BinaryLR_Mushroom(new_data, data['class'], n_comp, LR=10**(-LR), ET=10**(-ET))
            obj.fit(obj.train.iloc[:, :-1], obj.train['class'])
            print('---------------------------------------------------------------------------')
            obj.predict(obj.cv.iloc[:, :-1])

            metrics = evaluate(obj.predicted_labels, obj.cv['class'])
            Results[params] = metrics

            candidate_score = metrics[0] + metrics[1] + metrics[2]
            best_score = best['Accuracy'] + best['Precision'] + best['Recall']
            if not (candidate_score > best_score):
                # Not an improvement: drop the model and try the next combination.
                del obj
                continue

            best['Accuracy'], best['Precision'], best['Recall'], best['F1 Score'] = metrics
            best['Best Obj'] = obj
            best['Parameter'] = params

# From 1,224 hyperparameter combinations (34 × 6 × 6) we find our best model

In [None]:
# Take the best-scoring model recorded by the search. If no combination ever
# scored above 0, this is still the placeholder string 'obj'.
best_obj = best['Best Obj']

In [None]:
# Predict on the held-out test split, using every column except the trailing
# 'class' label. BinaryLR_Mushroom.__init__ stores this split as `test`; it
# never sets a `test_data` attribute, so that name is used here.
best_obj.predict(best_obj.test.iloc[:,:-1])

In [None]:
# Score the test-split predictions against the true labels. The labels are
# read from `best_obj.test`, the attribute BinaryLR_Mushroom.__init__ sets
# (it never sets `test_data`).
evaluate(best_obj.predicted_labels, best_obj.test['class'])