<a href="https://colab.research.google.com/github/KevinHern/AI-Crash-Course/blob/main/AI_Crash_Course_03.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Classification Algorithms

[Presentation: AI Crash Course 03](https://view.genial.ly/6197f155542e220dab592309/presentation-ai-crashcourse03)

## 0) Preparations

In [19]:
# ----- Libraries ----- #

# For graph plotting
import matplotlib.pyplot as plt

# For dataset manipulation
import pandas as pd
from sklearn.model_selection import train_test_split

# For visualizing more complex maps
import seaborn as sns

# For statistical analysis and Models
import statsmodels.api as sm
import statsmodels.formula.api as smapi
import numpy as np


In [None]:
'''
All the information regarding the dataset used for this demo can be found in the following link:
https://www.kaggle.com/uciml/pima-indians-diabetes-database
'''

# Getting Dataset
# NOTE(review): this URL points at a Google Drive *folder*, not at a file, so wget
# presumably saves the folder's HTML page rather than the "diabetes.csv" that the
# loading cell reads -- confirm, and replace it with a direct link to the CSV.
!wget https://drive.google.com/drive/u/0/folders/1fY_cIKVli5NbKI0mcBJGsti3Li8JvDqh

In [None]:
# Load the dataset from "diabetes.csv" (path is relative to the working directory)
raw_dataset = pd.read_csv("diabetes.csv")

# Brief statistical summary of the dataset, shown as the cell output
raw_dataset.describe()

In [None]:
# Let's check the column names
raw_dataset.columns

In [None]:
# Preview the first rows of the dataset
raw_dataset.head()

In [None]:
# Dimensions of the dataset as a (number of rows, number of columns) tuple
raw_dataset.shape

In [None]:
# Count the missing values in every column of the raw data
null_counts = raw_dataset.isna().sum()
print(null_counts)

# dropna() returns a new frame, so raw_dataset itself is left untouched
new_dataset = raw_dataset.dropna()

# Preview the cleaned dataset
new_dataset.head()

In [None]:
# Pairwise plots for a subset of the features, with KDE curves on the diagonal
pairplot_columns = [
    "Pregnancies", "Glucose", "BloodPressure",
    "SkinThickness", "Insulin", "BMI",
]
sns.pairplot(new_dataset[pairplot_columns], diag_kind="kde")

In [39]:
# Split the dataset into training (80%) and testing (20%) subsets.
# A fixed random_state makes the split -- and every model/metric derived from it --
# reproducible across "Restart & Run All".
train, test = train_test_split(new_dataset, test_size=0.2, random_state=42)

# Feature columns: every column of the split dataset except the 'Outcome' target
variables = list(new_dataset.columns)
variables.remove('Outcome')
train_vars = train[variables]

# Target of the training set, kept as a single-column DataFrame
train_target = train[['Outcome']]

## 1) Logistic Regression

In [21]:
def stepwise_selection(dataset, target, significance_level = 0.05, debug=False, estimator=None):
  """Forward stepwise selection of the explanatory variables for `target`.

  Starting from an empty model, every remaining column is tried in turn as an
  additional term. Among the candidates whose p-value is at most
  `significance_level`, the one with the largest absolute t-value is added to
  the formula. The search stops when no candidate is significant or when every
  column has been added.

  Parameters
  ----------
  dataset : DataFrame holding the target column and all candidate columns.
  target : name of the dependent-variable column.
  significance_level : maximum p-value for a candidate to be considered.
  debug : when True, print the summary of every intermediate fit.
  estimator : formula-API model constructor called as
    `estimator(formula=..., data=...)` whose `.fit()` result exposes `pvalues`
    and `tvalues` indexed by term name. Defaults to `smapi.ols`, which is what
    this function always used; pass e.g. `smapi.logit` for a logistic model.

  Returns
  -------
  The fitted result of `estimator` for the selected formula. If no variable is
  significant, an intercept-only model (`target ~ 1`) is fitted instead of
  failing on a formula with an empty right-hand side.

  Raises
  ------
  ValueError if `target` is not one of the dataset's columns.
  """
  if estimator is None:
    # Resolved lazily so the historical OLS default stays the behavior for existing callers
    estimator = smapi.ols

  candidates = list(dataset.columns)
  candidates.remove(target)  # ValueError when the target column is missing
  selected = []

  def build_formula(terms):
    # An empty term list maps to the intercept-only model
    return target + " ~ " + (" + ".join(terms) if terms else "1")

  # Each pass either moves one variable from `candidates` to `selected` or stops
  while candidates:
    best_var = None
    max_t_value = 0
    for candidate in candidates:
      model = estimator(formula=build_formula(selected + [candidate]), data=dataset).fit()

      if debug:
        print(model.summary())

      # Keep the significant candidate with the strongest t-statistic
      t_value = abs(model.tvalues[candidate])
      if model.pvalues[candidate] <= significance_level and t_value > max_t_value:
        best_var = candidate
        max_t_value = t_value

    # No significant variable left: the current formula is final
    if best_var is None:
      break
    selected.append(best_var)
    candidates.remove(best_var)

  return estimator(formula=build_formula(selected), data=dataset).fit()

In [None]:
# Fit the stepwise-selected model for 'Outcome' on the training split and print its summary.
# NOTE(review): despite the variable name, stepwise_selection fits smapi.ols by default,
# i.e. a linear regression on the target rather than a logit model.
logistic_regression = stepwise_selection(dataset = train, target = 'Outcome')
print(logistic_regression.summary())

## 2) Decision Tree

In [57]:
from sklearn import tree

# A fixed random_state makes the fitted tree reproducible between runs
dt_model = tree.DecisionTreeClassifier(random_state=42)
# Flatten the single-column target frame into the 1-D label array scikit-learn expects
dt_model = dt_model.fit(train_vars, train_target.to_numpy().ravel())

## 3) Support Vector Machine

In [None]:
from sklearn import svm

# NOTE(review): the features are fed to the SVC unscaled; consider standardizing them -- TODO confirm
svm_model = svm.SVC()
# Flatten the single-column target frame to 1-D; passing the column vector directly
# makes scikit-learn emit a DataConversionWarning
svm_model = svm_model.fit(train_vars, train_target.to_numpy().ravel())

## 4) Confusion Matrices

In [47]:
# Importing confusion matrix
from tensorflow.math import confusion_matrix

# Function that plots confusion matrix
def plot_confusion_matrix(labels, predictions):
  """Show the confusion matrix of `predictions` against `labels` as an annotated heatmap.

  Parameters
  ----------
  labels : true class labels, forwarded to `confusion_matrix(labels=...)`.
  predictions : predicted class labels, forwarded to `confusion_matrix(predictions=...)`.

  Returns
  -------
  None. The figure is displayed with `plt.show()`.
  """
  plt.figure(figsize=(4, 4))
  # fmt="d": the cells are integer counts; seaborn's default ".2g" format would
  # render counts of 100 or more in scientific notation (e.g. "1e+02")
  sns.heatmap(confusion_matrix(labels=labels, predictions=predictions), annot=True, fmt="d", cmap=plt.cm.Blues)
  plt.ylabel('True label')
  plt.xlabel('Predicted label')
  # Compute the layout only once the axis labels exist, so they are taken into account
  plt.tight_layout()
  plt.show()

### Logistic Regression

In [None]:
threshold = 0.5

# Continuous model outputs for the test rows
lr_scores = logistic_regression.predict(test)
# Label a row 1 when its score is strictly above the threshold, otherwise 0
lr_predictions = [1 if score > threshold else 0 for score in lr_scores]

plot_confusion_matrix(labels=test[['Outcome']], predictions=lr_predictions)

### Decision Tree

In [None]:
# Predict with the decision tree on the test features, then compare with the true labels
test_features = test[variables]
test_labels = test[['Outcome']]
dt_predictions = dt_model.predict(test_features)

plot_confusion_matrix(labels=test_labels, predictions=dt_predictions)

### Support Vector Machine

In [None]:
# Predict with the SVM on the test features, then compare with the true labels
test_features = test[variables]
test_labels = test[['Outcome']]
svm_predictions = svm_model.predict(test_features)

plot_confusion_matrix(labels=test_labels, predictions=svm_predictions)