# import libraries

In [1]:
import pandas as pd
import numpy as np
from datasist.structdata import detect_outliers

from imblearn.over_sampling import SMOTE

from sklearn.impute import SimpleImputer

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

from sklearn.preprocessing import StandardScaler

from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import itertools
import warnings
warnings.filterwarnings('ignore')

# Helper functions used throughout the notebook

In [2]:
def plot_confusion_matrix(cm, classes,normalize=False,title='Confusion matrix',cmap=plt.cm.Blues):
    """
    Plot a confusion matrix as an annotated heat map and show the figure.

    Parameters
    ----------
    cm : 2-D array-like
        Confusion matrix; rows are drawn on the "True label" axis and
        columns on the "Predicted label" axis.
    classes : sequence
        Tick labels, used for both axes.
    normalize : bool, default False
        When true, every row is divided by its sum before plotting, so both
        the colours and the cell annotations show per-row fractions.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colormap passed to ``plt.imshow``.
    """
    cm = np.asarray(cm)
    if normalize:
        # Normalise BEFORE drawing so the colours, the annotations and the
        # text-colour threshold all refer to the same values.
        row_totals = cm.sum(axis=1, keepdims=True)
        # Rows without any sample stay at 0 instead of turning into NaN.
        cm = np.divide(
            cm.astype('float'),
            row_totals,
            out=np.zeros(cm.shape, dtype=float),
            where=row_totals != 0,
        )

    plt.figure(figsize=(6, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90)
    plt.yticks(tick_marks, classes)

    # Cells darker than the mid-point get white text, the rest black text.
    thresh = cm.max() / 2.
    cm = np.round(cm, 2)
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Called after the labels are set so the layout reserves room for them.
    plt.tight_layout()
    plt.show()

In [3]:
def MLPredictAcc(X, y, classes , scale = False , smote = False):
    """
    Train a fixed set of classifiers on one train/test split and rank them.

    The data is split 80/20 with a fixed seed. Each classifier is fitted on
    the training part, its accuracy on the test part is recorded, and its
    confusion matrix is plotted with ``plot_confusion_matrix``.

    Parameters
    ----------
    X, y :
        Features and target, passed straight to ``train_test_split``.
    classes : sequence
        Class labels forwarded to ``plot_confusion_matrix`` as tick labels.
    scale : bool, default False
        Standardise the features with a ``StandardScaler`` fitted on the
        training part only.
    smote : bool, default False
        Oversample the training part with SMOTE (the test part is untouched).

    Returns
    -------
    tuple
        ``(ranking, fitted_models)`` or, when ``scale`` is set,
        ``(ranking, fitted_models, scaler)``. ``ranking`` is a DataFrame
        with the columns ``"index "`` (rank starting at 1), ``"model"`` and
        ``"Score"`` (accuracy in percent, formatted as text), ordered from
        best to worst. ``fitted_models`` maps model name to fitted estimator.
    """
    # One seed for the split, the resampling and the stochastic estimators,
    # so that two runs on the same data report the same scores.
    seed = 42
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
    if smote:
        X_train, y_train = SMOTE(random_state=seed).fit_resample(X_train, y_train)
    scaler = None
    if scale:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    candidates = {
        "XGB": XGBClassifier(),
        "KNN": KNeighborsClassifier(),
        "SVC": SVC(),
        "DT": DecisionTreeClassifier(random_state=seed),
        "RF": RandomForestClassifier(random_state=seed),
        "GaussianNB" : GaussianNB(),
        "Perceptron" : Perceptron(),
        "LinearSVC" : LinearSVC(random_state=seed),
        "SGDClassifier" : SGDClassifier(random_state=seed),
        "LogisticRegression" : LogisticRegression()
    }

    names = []
    scores = []
    fitted_models = {}
    for name, model in candidates.items():
        print(f'Training Model {name} \n--------------')
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        matrix = confusion_matrix(y_test, y_pred)
        accuracy = round(accuracy_score(y_test, y_pred) * 100,2)
        names.append(name)
        scores.append(accuracy)
        fitted_models[name] = model
        plot_confusion_matrix(matrix, classes, title='{} cf with acc = {} %'.format(name,accuracy))
        print('-' * 30)

    ranked = pd.DataFrame({'Model': names, 'Score': scores})
    ranked = ranked.sort_values(by='Score', ascending=False)
    # Build the report from plain lists and pass the index explicitly, so the
    # rank column cannot be misaligned with the sorted rows.
    ranking = pd.DataFrame(
        {
            "index ": list(range(1, len(ranked) + 1)),
            "model": ranked['Model'].tolist(),
            'Score': [str(score) + " %" for score in ranked['Score'].tolist()],
        },
        index=ranked.index,
    )

    if scale:
        return ranking, fitted_models, scaler
    return ranking, fitted_models


In [4]:
def define_column_type(df):
    """
    Split the column names of ``df`` by dtype.

    Returns a pair ``(numerical, categorical)``: the names of all columns
    whose dtype is not ``object`` and the names of all ``object`` columns,
    each as a list in the frame's column order.
    """
    object_part = df.select_dtypes(include='O')
    other_part = df.select_dtypes(exclude='O')
    return other_part.columns.to_list(), object_part.columns.to_list()

In [5]:
def show_value_count_category_column(df ):
    """
    Print the value counts of every object-dtype column of ``df``.

    The columns are taken from the second element returned by
    ``define_column_type``; a line of 50 asterisks follows each table.
    """
    _, categorical_columns = define_column_type(df)
    separator = "*" * 50
    for column in categorical_columns:
        counts = df[column].value_counts()
        print(pd.DataFrame(counts))
        print(separator)

In [6]:
def visualize_null_count(df):
    """
    Report missing values in ``df``.

    Opens a 12x8 figure, prints the number of nulls per column and draws
    the null mask of the frame as a seaborn heat map.
    """
    plt.figure(figsize=(12,8))
    null_mask = df.isnull()
    print(null_mask.sum())
    sns.heatmap(null_mask)


In [7]:
def make_encoding_dict(df):
    """
    Map every distinct value to an integer code ordered by frequency.

    ``df.value_counts()`` supplies the values (most frequent first by
    default), and the codes are assigned ``0, 1, 2, ...`` in that order.
    Every distinct value gets a code, however many there are.

    NOTE(review): despite the parameter name this looks intended for a
    single column (a Series) - confirm against the callers.

    Returns
    -------
    dict
        ``{value: code}``.
    """
    ordered_values = df.value_counts().index.tolist()
    # enumerate instead of a fixed range(100): the old cap silently left
    # every value after the 100th out of the mapping.
    return {value: code for code, value in enumerate(ordered_values)}

In [8]:
def detect_outlier(df,numerical):
    """
    Drop outlier rows from ``df`` in place, one column at a time.

    For each name in ``numerical`` the imported ``detect_outliers`` helper
    is called as ``detect_outliers(df, 0, [column])``; the row labels it
    returns are dropped from ``df`` and their number is printed. Because
    rows are removed as the loop goes, later columns are checked against
    the already reduced frame.

    NOTE(review): the outlier criterion itself lives in the datasist
    helper and is not visible here.
    """
    for column in numerical:
        flagged_rows = detect_outliers(df, 0, [column])
        df.drop(flagged_rows, inplace=True)
        print(f"len outliner in {column} = {len(flagged_rows)}")

In [9]:
def fill_missing_numerical_data(df_rows_values):
    """
    Replace NaN entries with the mean of their column.

    A ``SimpleImputer`` with ``strategy='mean'`` is fitted on the given
    values and applied to those same values; the imputed result is returned.
    """
    mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    return mean_imputer.fit_transform(df_rows_values)

In [10]:
def EncodingIndependentVariable(df_rows_values):
    """
    One-hot encode the first column of a feature matrix.

    Column 0 is passed through a ``OneHotEncoder``; all other columns are
    kept unchanged (``remainder='passthrough'``).

    Returns
    -------
    numpy.ndarray
        The transformed matrix as a dense array.
    """
    # sparse_threshold=0 makes the transformer always return a dense result.
    # With the default threshold a mostly-zero output comes back as a scipy
    # sparse matrix, which np.array() would wrap in a useless 0-d object array.
    ct = ColumnTransformer(
        transformers=[('encoder', OneHotEncoder(), [0])],
        remainder='passthrough',
        sparse_threshold=0,
    )
    return np.array(ct.fit_transform(df_rows_values))

In [11]:
def EncodingDependentVariable(y_values):
    """
    Encode target labels as integers.

    A fresh ``LabelEncoder`` is fitted on ``y_values`` and the encoded
    values are returned. The encoder itself is discarded.
    """
    from sklearn.preprocessing import LabelEncoder
    return LabelEncoder().fit_transform(y_values)

# Importing the dataset