# Import libraries 

In [2]:
#pip install -U scikit-learn

In [3]:
import numpy as np
import pandas as pd 

# Read the Data

In [4]:
# Load the classification data set from a CSV file into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook only runs
# where this exact file exists. Consider a path relative to a configurable data folder.
file_path = 'C:/Users/Maryam/Desktop/Job applications/Portfolio/DataSets/classification_data.csv'
dataset = pd.read_csv(file_path)
# Features are every column except the last; the last column is taken as the target.
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Hold out 20% of the rows as a test set; random_state is fixed so the split is repeatable.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
 

# k-Fold Cross Validation 

In [5]:
def cross_validation(classifier, x_train, y_train):
    """Score ``classifier`` with 10-fold cross-validation on the training data.

    Parameters
    ----------
    classifier : estimator handed to ``cross_val_score`` as ``estimator``.
    x_train : training features, passed as ``X``.
    y_train : training labels, passed as ``y``.

    Returns
    -------
    tuple
        ``(mean, std)`` of the per-fold scores returned by ``cross_val_score``.
    """
    from sklearn.model_selection import cross_val_score

    # cv=10 yields one score per fold; no ``scoring`` argument is passed here.
    fold_scores = cross_val_score(estimator=classifier, X=x_train, y=y_train, cv=10)
    return fold_scores.mean(), fold_scores.std()

# Grid Search 

In [6]:
def parameters_estimation(classifier, parameters, accuracy, x_train, y_train):
    """Run a 10-fold grid search and return the best hyperparameter combination.

    Parameters
    ----------
    classifier : estimator handed to ``GridSearchCV`` as ``estimator``.
    parameters : dict mapping parameter names to the candidate values to try
        (passed as ``param_grid``).
    accuracy : value passed to ``GridSearchCV`` as ``scoring`` (callers in this
        notebook pass the string ``'accuracy'``).
    x_train : training features used to fit the search.
    y_train : training labels used to fit the search.

    Returns
    -------
    dict
        ``grid_search.best_params_``: the winning value for each searched parameter.

    Side effects
    ------------
    Prints the best hyperparameters.
    """
    from sklearn.model_selection import GridSearchCV

    # n_jobs=-1 lets scikit-learn use all available CPU cores for the search.
    grid_search = GridSearchCV(estimator=classifier, param_grid=parameters, scoring=accuracy, cv=10, n_jobs=-1)
    grid_search.fit(x_train, y_train)

    # Only the parameters are needed by callers, which rebuild and refit their own
    # estimator; the fitted best estimator is therefore not read here.
    best_param = grid_search.best_params_
    print("Best Hyperparameters:", best_param)

    return best_param

# Model Evaluation 

In [7]:
def model_evaluation(updated_classifier, x_test, y_test):
    """Evaluate a fitted classifier on the held-out test data.

    Parameters
    ----------
    updated_classifier : fitted object exposing ``predict``.
    x_test : test features passed to ``predict``.
    y_test : true test labels the predictions are compared against.

    Returns
    -------
    tuple
        ``(cm, accuracy)``: the confusion matrix (counts of correct and incorrect
        predictions per class) and the overall accuracy score.
    """
    from sklearn.metrics import accuracy_score, confusion_matrix

    predictions = updated_classifier.predict(x_test)
    return confusion_matrix(y_test, predictions), accuracy_score(y_test, predictions)

# Methods

## Logistic Regression 

In [8]:
# define a function for a Logistic Regression (LR) method
def Logistic_Regression():
    """Train and evaluate a logistic-regression classifier.

    Reads the notebook-level ``x_train``, ``y_train``, ``x_test`` and ``y_test``.

    Returns
    -------
    tuple
        ``(accuracies_mean, accuracies_std, cm, accuracy)``: mean and standard
        deviation of the cross-validation scores on the training set, then the
        confusion matrix and overall accuracy on the test set.
    """
    from sklearn.linear_model import LogisticRegression

    # Fixed random_state keeps the fit repeatable between runs.
    lr_model = LogisticRegression(random_state=0)
    lr_model.fit(x_train, y_train)

    # Cross-validation statistics on the training set.
    cv_mean, cv_std = cross_validation(lr_model, x_train, y_train)

    # Confusion matrix and overall accuracy of the logistic-regression model on the test set.
    confusion, test_accuracy = model_evaluation(lr_model, x_test, y_test)

    return cv_mean, cv_std, confusion, test_accuracy

accuracies_mean_LR, accuracies_std_LR, cm_LR, accuracy_LR = Logistic_Regression()




## KNN

In [9]:
# define a function for the K-Nearest Neighbours (KNN) method
def KNN():
    """Train, tune and evaluate a K-Nearest-Neighbours classifier.

    Reads the notebook-level ``x_train``, ``y_train``, ``x_test`` and ``y_test``.
    The cross-validation statistics describe the baseline model (5 neighbours);
    the confusion matrix and accuracy describe the model refitted with the
    ``n_neighbors`` value chosen by the grid search.

    Returns
    -------
    tuple
        ``(accuracies_mean, accuracies_std, cm, accuracy)``.
    """
    from sklearn.neighbors import KNeighborsClassifier

    # Baseline: 5 neighbours, Minkowski metric with p=2.
    baseline = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
    baseline.fit(x_train, y_train)

    # Cross-validation statistics of the baseline on the training set.
    cv_mean, cv_std = cross_validation(baseline, x_train, y_train)

    # Candidate neighbour counts for the grid search.
    search_space = {'n_neighbors': [1, 5, 10, 20]}
    tuned_k = parameters_estimation(baseline, search_space, 'accuracy', x_train, y_train)['n_neighbors']

    # Rebuild the classifier with the selected neighbour count and refit it.
    tuned = KNeighborsClassifier(n_neighbors=tuned_k, metric='minkowski', p=2)
    tuned.fit(x_train, y_train)

    # Confusion matrix and overall accuracy of the tuned KNN model on the test set.
    confusion, test_accuracy = model_evaluation(tuned, x_test, y_test)

    return cv_mean, cv_std, confusion, test_accuracy

accuracies_mean_KNN, accuracies_std_KNN, cm_KNN, accuracy_KNN = KNN()


Best Hyperparameters: {'n_neighbors': 20}
0.6133670033670034
0.05355801777628228
[[75 12]
 [34 16]]
0.6642335766423357


## Support Vector Classification (SVC)

In [10]:
# define a function for a Support Vector Classification (SVC) method (the function keeps the name SVR)
def SVR():
    """Train, tune and evaluate a support-vector classifier.

    Despite its name, this function builds ``sklearn.svm.SVC`` (a classifier),
    not a support-vector regressor. It reads the notebook-level ``x_train``,
    ``y_train``, ``x_test`` and ``y_test``.

    The cross-validation statistics describe the baseline linear-kernel model;
    the confusion matrix and accuracy describe the model refitted with the
    kernel chosen by the grid search.

    Returns
    -------
    tuple
        ``(accuracies_mean, accuracies_std, cm, accuracy)``.
    """
    from sklearn.svm import SVC

    # Baseline: linear kernel.
    baseline = SVC(kernel='linear', random_state=0)
    baseline.fit(x_train, y_train)

    # Cross-validation statistics of the baseline on the training set.
    cv_mean, cv_std = cross_validation(baseline, x_train, y_train)

    # Kernels compared by the grid search.
    search_space = {'kernel': ['linear', 'rbf']}
    chosen_kernel = parameters_estimation(baseline, search_space, 'accuracy', x_train, y_train)['kernel']

    # Rebuild the classifier with the selected kernel and refit it.
    tuned = SVC(kernel=chosen_kernel, random_state=0)
    tuned.fit(x_train, y_train)

    # Confusion matrix and overall accuracy of the tuned model on the test set.
    confusion, test_accuracy = model_evaluation(tuned, x_test, y_test)

    return cv_mean, cv_std, confusion, test_accuracy

accuracies_mean_SVR, accuracies_std_SVR, cm_SVR, accuracy_SVR = SVR()


## Decision Tree 

In [None]:
# define a function for a Decision Tree method
def Decision_Tree():
    """Train, tune and evaluate a decision-tree classifier.

    Reads the notebook-level ``x_train``, ``y_train``, ``x_test`` and ``y_test``.
    The cross-validation statistics describe the baseline model (entropy
    criterion); the confusion matrix and accuracy describe the model refitted
    with the split criterion chosen by the grid search.

    Returns
    -------
    tuple
        ``(accuracies_mean, accuracies_std, cm, accuracy)``.
    """
    from sklearn.tree import DecisionTreeClassifier

    # Baseline: entropy split criterion, fixed seed for repeatability.
    baseline = DecisionTreeClassifier(criterion='entropy', random_state=0)
    baseline.fit(x_train, y_train)

    # Cross-validation statistics of the baseline on the training set.
    cv_mean, cv_std = cross_validation(baseline, x_train, y_train)

    # Split criteria compared by the grid search.
    search_space = {'criterion': ['gini', 'entropy']}
    chosen = parameters_estimation(baseline, search_space, 'accuracy', x_train, y_train)

    # Rebuild the tree with the selected criterion and refit it.
    tuned = DecisionTreeClassifier(criterion=chosen['criterion'], random_state=0)
    tuned.fit(x_train, y_train)

    # Confusion matrix and overall accuracy of the tuned tree on the test set.
    confusion, test_accuracy = model_evaluation(tuned, x_test, y_test)

    return cv_mean, cv_std, confusion, test_accuracy

accuracies_mean_DT, accuracies_std_DT, cm_DT, accuracy_DT = Decision_Tree()


NameError: name 'x_train' is not defined

## Random Forest Model

In [None]:
# define a function for a Random Forest method
def Random_Forest():
    """Train, tune and evaluate a random-forest classifier.

    Reads the notebook-level ``x_train``, ``y_train``, ``x_test`` and ``y_test``.
    The cross-validation statistics describe the baseline model (10 trees,
    entropy criterion); the confusion matrix and accuracy describe the model
    refitted with the split criterion chosen by the grid search.

    Returns
    -------
    tuple
        ``(accuracies_mean, accuracies_std, cm, accuracy)``.
    """
    from sklearn.ensemble import RandomForestClassifier

    # Baseline: 10 trees, entropy split criterion, fixed seed for repeatability.
    baseline = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
    baseline.fit(x_train, y_train)

    # Cross-validation statistics of the baseline on the training set.
    cv_mean, cv_std = cross_validation(baseline, x_train, y_train)

    # Split criteria compared by the grid search (the number of trees is not tuned).
    search_space = {'criterion': ['gini', 'entropy']}
    chosen = parameters_estimation(baseline, search_space, 'accuracy', x_train, y_train)

    # Rebuild the forest with the selected criterion and refit it.
    tuned = RandomForestClassifier(n_estimators=10, criterion=chosen['criterion'], random_state=0)
    tuned.fit(x_train, y_train)

    # Confusion matrix and overall accuracy of the tuned forest on the test set.
    confusion, test_accuracy = model_evaluation(tuned, x_test, y_test)

    return cv_mean, cv_std, confusion, test_accuracy

accuracies_mean_RF, accuracies_std_RF, cm_RF, accuracy_RF = Random_Forest()

## XGBoost 

In [None]:
# define a function for XGBoost method 
def XGBoost_method():
    """Train and evaluate an XGBoost classifier with default hyperparameters.

    Reads the notebook-level ``x_train``, ``y_train``, ``x_test`` and ``y_test``.

    Returns
    -------
    tuple
        ``(accuracies_mean, accuracies_std, cm, accuracy)``: mean and standard
        deviation of the cross-validation scores on the training set, then the
        confusion matrix and overall accuracy on the test set.
    """
    # Import the class that is actually instantiated below; importing
    # XGBRegressor here left XGBClassifier undefined and raised NameError.
    from xgboost import XGBClassifier

    classifier = XGBClassifier()
    classifier.fit(x_train, y_train)

    # Cross-validation statistics on the training set.
    accuracies_mean, accuracies_std = cross_validation(classifier, x_train, y_train)

    # Confusion matrix and overall accuracy of the XGBoost model on the test set.
    cm, accuracy = model_evaluation(classifier, x_test, y_test)

    return accuracies_mean, accuracies_std, cm, accuracy

accuracies_mean_XG, accuracies_std_XG, cm_XG, accuracy_XG = XGBoost_method()

# Models Ranking and Selection

In [None]:
# To select the best model, every model function is called and its performance measures
# are collected so the models can be ranked.
# The measures are: the mean of the cross-validation accuracy, the standard deviation of the
# cross-validation accuracy, the confusion matrix and the overall test accuracy.
accuracies_mean_LR, accuracies_std_LR, cm_LR, accuracy_LR = Logistic_Regression()
accuracies_mean_KNN, accuracies_std_KNN, cm_KNN, accuracy_KNN = KNN()
accuracies_mean_SVR, accuracies_std_SVR, cm_SVR, accuracy_SVR = SVR()
accuracies_mean_DT, accuracies_std_DT, cm_DT, accuracy_DT = Decision_Tree()
accuracies_mean_RF, accuracies_std_RF, cm_RF, accuracy_RF = Random_Forest()
# The first name was previously misspelled, so the XGBoost mean was never refreshed here.
accuracies_mean_XG, accuracies_std_XG, cm_XG, accuracy_XG = XGBoost_method()

# After running all the models, the performance measures are stored in dictionaries keyed by model name.
accuracies_mean_dict = {'LR': accuracies_mean_LR, 'KNN': accuracies_mean_KNN, 'SVR': accuracies_mean_SVR, 'DT': accuracies_mean_DT, 'RF': accuracies_mean_RF, 'XG': accuracies_mean_XG}
accuracies_std_dict = {'LR': accuracies_std_LR, 'KNN': accuracies_std_KNN, 'SVR': accuracies_std_SVR, 'DT': accuracies_std_DT, 'RF': accuracies_std_RF, 'XG': accuracies_std_XG}
cm_dict = {'LR': cm_LR, 'KNN': cm_KNN, 'SVR': cm_SVR, 'DT': cm_DT, 'RF': cm_RF, 'XG': cm_XG}
accuracy_dict = {'LR': accuracy_LR, 'KNN': accuracy_KNN, 'SVR': accuracy_SVR, 'DT': accuracy_DT, 'RF': accuracy_RF, 'XG': accuracy_XG}

# dict has no .sort() method; rebuild each dictionary from its items sorted by value
# (dictionaries keep insertion order, so the result prints in ranked order).
# Accuracies are ranked from highest to lowest.
accuracies_mean_dict = dict(sorted(accuracies_mean_dict.items(), key=lambda item: item[1], reverse=True))
accuracy_dict = dict(sorted(accuracy_dict.items(), key=lambda item: item[1], reverse=True))
# A smaller spread across folds is better, so the standard deviation is ranked from
# lowest to highest to keep "first entry = best model" true for this dictionary too.
accuracies_std_dict = dict(sorted(accuracies_std_dict.items(), key=lambda item: item[1]))
# Confusion matrices have no natural ordering; list them in the overall-accuracy ranking order.
cm_dict = {model_name: cm_dict[model_name] for model_name in accuracy_dict}

# The first model in each dictionary is the best one for that measure.
print("Sorted cross-validation mean accuracies are:", accuracies_mean_dict)
print("Sorted standard deviation of the cross-validation accuracies are:", accuracies_std_dict)
print("Sorted Confusion Matrix are:", cm_dict)
print("Sorted overall accuracy are:", accuracy_dict)