## 1) Importing the Libraries

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

## 2) Data Analysis

In [None]:
# NOTE(review): the path below is an empty placeholder — fill in the dataset's CSV path/URL before running.
data = pd.read_csv("")

In [None]:
# Quick overview of the raw data: first rows, column dtypes, column labels, (rows, columns)
for overview in (data.head(), data.dtypes, data.columns, data.shape):
    print(overview)

In [None]:
# Remove duplicate rows so the same sample cannot end up in both the train and test sets (avoids "peeking")
data.drop_duplicates(inplace = True)

In [None]:
print(data.shape)

In [None]:
# For every column, report the number of distinct values (nunique)
# followed by the distinct values themselves (unique)
for name in data.columns:
    column = data[name]
    print(f"Column: {name} - {column.nunique()} unique values")
    print(column.unique())
    print("-----------------")

### Operation on Pandas Dataframe

In [None]:
"""
axis=0 -> row(default), axis=1 -> column

### Access all rows of a particular column
data.Outcome OR data['Outcome']

### Using loc[row,col] and iloc[row,col]

a) Print single value 
print(data.loc[2, 'BloodPressure'])
print(data.iloc[2, 2])

b) Print multiple rows
print(data.loc[1:4, 'Glucose'])
print(data.iloc[1:5,1]) # Last value is excluded during range in iloc

c) Print all rows
print(data.loc[:, 'Glucose'])

d) Print multiple rows of multiple columns 
print(data.loc[1:4 ,['Glucose','BloodPressure','SkinThickness']])
print(data.iloc[1:5, 1:4]) # Last value is excluded during range in iloc

e) Count the occurrences of each unique value
print(data['Pregnancies'].value_counts())
"""

## 3) Data Preprocessing 

In [None]:
# Data preprocessing is a large topic with many methods,
# so it is better kept in a separate template dedicated to preprocessing.

## 4) Split the dataset into Features and Label 

In [None]:
# X -> features: all rows of every column except the last one
# y -> label:    all rows of the last column
# .values converts each selection to a NumPy array
X, y = data.iloc[:, :-1].values, data.iloc[:, -1].values

In [None]:
"""
### Other methods to split data into features and label

In methods b-e the split stays as pandas objects (X is a DataFrame, y is a Series);
you can convert them to NumPy arrays later with np.array().

b)
X = data.drop("Outcome", axis=1)
y = data["Outcome"] # Outcome for e.g. is name of Label column

c)
ncol = data.shape[1]
X = data.iloc[:, 0:ncol-1]
y = data["Outcome"]

d)
X = data.iloc[:, 0:8]
y = data["Outcome"]

e)
X = data.iloc[:, data.columns!="Outcome"]
y = data["Outcome"]
"""

## 5) Train Test Split of dataset

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; adjust test_size (or pass train_size
# instead) to suit the dataset. random_state pins the shuffle so the split is
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

### with Stratification 

In [None]:
from sklearn.model_selection import train_test_split

# Same 20% hold-out, but stratify=y asks scikit-learn to split in a stratified
# fashion using y as the class labels. Adjust test_size (or pass train_size
# instead) to suit the dataset.
S_X_train, S_X_test, S_y_train, S_y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

In [None]:
S_X_train.shape, S_y_train.shape, S_X_test.shape, S_y_test.shape

## 6) Feature Scaling

In [None]:
"""
### Algorithms that Require Feature Scaling:
1)Logistic Regression --> scaling required (Gradient Descent Based Algorithm)
2)SVM  ---> scaling required (Distance based algorithm)

### Algorithms that DO NOT Require Feature Scaling
1)Decision Tree  --> no scaling required (Tree based algorithm)
2)Naive Bayes ---> no scaling required (due to its assumption of independent features)
3)Random Forest  --> no scaling required (Ensemble method)
"""

### Standardization 

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same fitted
# transformation to both the training and the test features.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [None]:
print(X_train)
print(X_test)

### Min-Max Normalization - MinMaxScaler

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the stratified training features only, then apply that
# same fitted transformation to both stratified sets.
ms = MinMaxScaler().fit(S_X_train)
S_X_train, S_X_test = ms.transform(S_X_train), ms.transform(S_X_test)

In [None]:
print(S_X_train)
print(S_X_test)

## 7) Train Model 

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default hyperparameters, fitted on the training split
classifier = LogisticRegression()
classifier.fit(X=X_train, y=y_train)

### SVM - (SVC)

In [None]:
from sklearn.svm import SVC

# Support vector classifier; kernel="rbf" selects the Gaussian (RBF) kernel
classifier = SVC(random_state=0, kernel="rbf")
classifier.fit(X=X_train, y=y_train)

###  Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with default settings, fitted on the training split
classifier = GaussianNB()
classifier.fit(X=X_train, y=y_train)

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree using the entropy split criterion; random_state pinned to 0
classifier = DecisionTreeClassifier(random_state=0, criterion="entropy")
classifier.fit(X=X_train, y=y_train)

### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest using the entropy split criterion; random_state pinned to 0
classifier = RandomForestClassifier(random_state=0, criterion="entropy")
classifier.fit(X=X_train, y=y_train)

## 8) Test and Evaluate Model

In [None]:
# Make predictions on the test features with the most recently fitted `classifier`
# (every model cell in section 7 rebinds this same variable, so run only the one you want).
y_pred = classifier.predict(X_test)

### Confusion Matrix

In [None]:
def plot_confusion_matrix(confuse_matrix):
    """Plot a confusion matrix as an annotated Seaborn heatmap.

    Args:
        confuse_matrix: 2-D array-like to display. The axes are labelled so
            that rows read as true labels and columns as predicted labels.

    Returns:
        None. The heatmap is drawn on a newly created 8x6 figure.
    """
    # Apply the font scale before the figure exists so the figure, axes and
    # annotations are all created under the same settings on every call.
    sns.set(font_scale=1.5)

    _, ax = plt.subplots(figsize=(8, 6))

    # Integer counts are annotated in full with "d"; seaborn's default ".2g"
    # would render e.g. 120 as 1.2e+02. Non-integer input (such as a
    # normalised matrix) keeps the default format.
    holds_counts = np.issubdtype(np.asarray(confuse_matrix).dtype, np.integer)

    sns.heatmap(
        confuse_matrix,
        annot=True,  # Annotate each box with its value
        fmt="d" if holds_counts else ".2g",
        cbar=False,
        ax=ax,  # Draw on the axes created above rather than the implicit current axes
    )

    ax.set_ylabel("True label")
    ax.set_xlabel("Predicted label")

In [None]:
from sklearn.metrics import confusion_matrix

# Compare the actual test labels with the predicted labels
confuse_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confuse_matrix)

In [None]:
plot_confusion_matrix(confuse_matrix)

### Classification Metrics

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_preds(y_test, y_pred, average="binary"):
    """Print and return accuracy, precision, recall and F1 for a set of predictions.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, aligned with ``y_test``.
        average: Averaging strategy forwarded to ``precision_score``,
            ``recall_score`` and ``f1_score``. The default ``"binary"``
            matches the previous behaviour; for multiclass labels pass
            ``"macro"``, ``"micro"`` or ``"weighted"``. The value must make
            each metric a single number (so not ``None``), because the
            scores are formatted and rounded as scalars.

    Returns:
        dict with keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``, each rounded to 2 decimal places.
    """
    scores = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, average=average),
        "recall": recall_score(y_test, y_pred, average=average),
        "f1": f1_score(y_test, y_pred, average=average),
    }

    # Accuracy is reported as a percentage, the other metrics as fractions.
    print(f"Acc: {scores['accuracy'] * 100:.2f}%")
    print(f"Precision: {scores['precision']:.2f}")
    print(f"Recall: {scores['recall']:.2f}")
    print(f"F1 score: {scores['f1']:.2f}")

    # The returned dictionary stores the rounded results of the evaluation metrics.
    return {name: round(value, 2) for name, value in scores.items()}

In [None]:
model_metrics = evaluate_preds(y_test, y_pred)

In [None]:
"""
acc = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(acc, precision, recall, f1)
"""