## Download and import packages

In [None]:
# !pip install numpy
# !pip install pandas
# !pip install scikit-learn
# !pip install matplotlib

#import required packages

from numpy import mean, std
import numpy as np
import pandas as pd
from pandas import read_csv
import matplotlib.pyplot as plt
import gdown

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,precision_score,recall_score,accuracy_score, roc_curve, auc
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

## Download Data and Load Data

In [None]:
# Download the Iris CSV from Google Drive into a project-relative folder.
# A relative path keeps the notebook runnable on any machine; the previous
# absolute path under /home/kiran/ existed only on the original author's computer.
FILE_ID = '1BmICPGpdRg1dPmXi0G3Fe5IWs1MobCu8'  # Google Drive file ID
url = 'https://drive.google.com/uc?id=' + FILE_ID
DATA_DIR = Path('data')  # Destination directory, relative to the notebook's working directory
DATA_DIR.mkdir(parents=True, exist_ok=True)  # make sure the destination folder exists before downloading
output = str(DATA_DIR / 'iris.data')  # Destination file; kept as a plain str for the cells below
gdown.download(url, output, quiet=False)

In [None]:
# Read the downloaded comma-separated file into a DataFrame.
# header=None: the first line is treated as data, so columns get integer labels.
df = read_csv(
    output,
    header=None,
    sep=",",
)

In [None]:
# Fetch the Iris dataset bundled with scikit-learn
iris = datasets.load_iris()

# Build a DataFrame from the feature matrix and attach the target as a 'class' column
df = pd.DataFrame(iris.data)
df['class'] = iris.target

# Split into feature matrix (x) and label vector (y) as plain NumPy arrays
x = df.drop(columns=['class']).values
y = df['class'].values

In [None]:
# Task1 Display the first few rows of the data

## Visualize Data

In [None]:
# Print a concise summary of the DataFrame: index range, column names,
# non-null counts, dtypes and memory usage.
df.info()

In [None]:
#Task2: Visualize the histogram of data classes

In [None]:
#Task2: Plot the histogram of values from any other attribute of choice

## Prepare data

To assess our model’s performance later, we will need to divide the data set into two parts: a training set and a
test set. The first is used to train the system, while the second is used to evaluate the learned or trained system.

Sklearn provides us with an easy way to randomly break up our data. We have decided to split the data with 20% as test and 80% as training.

In [None]:
# Hold out 20% of the samples for testing and train on the remaining 80%.
# A fixed random_state makes the shuffle reproducible, so the metrics reported
# later in the notebook do not change every time it is re-run.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

## Train the Model

Hyper Parameters for Logistic Regression
1. penalty: Used to specify the norm used in the penalization. 
2. C: Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization.
3. max_iter: Maximum number of iterations taken for the solvers to converge.

Epoch: One Epoch is when an entire dataset is passed forward and backward through the classifier / neural network only once.

Iterations: The number of batches passed forward and backward through the classifier.

In [None]:
# Declare a Logistic Regression classifier with L2 regularisation
clf = LogisticRegression(
    penalty='l2',
    C=1.0,
    max_iter=10000,
)
# Fit on the training split; the solver stops at convergence or after max_iter iterations
clf.fit(X=X_train, y=y_train)

Hyper Parameters for Decision Trees
1. criterion: The function to measure the quality of a split. Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain.
 - Gini: The gini impurity measures the frequency at which any element of the dataset will be mislabelled when it is randomly labeled.
 - Entropy is a measure of information that indicates the disorder of the features with the target.
2. splitter: The strategy used to choose the split at each node. Supported strategies are “best” to choose the best split and “random” to choose the best random split.
3. max_depth: The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples
4. min_samples_split: The minimum number of samples required to split an internal node

In [None]:
# Declare a Decision Tree classifier.
# random_state pins the feature permutation scikit-learn performs at every split
# (it happens even with splitter='best'), so ties between equally good splits
# are broken the same way on every run and the tree is reproducible.
clf = DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    max_depth=None,
    min_samples_split=2,
    random_state=42,
)
# Build the Decision Tree on the training data
clf.fit(X_train, y_train)

Hyper Parameters for SVM
1. kernel: (‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’) Specifies the kernel type to be used in the algorithm.
2. degree: Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels.
3. gamma: Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’.
4. max_iter: Hard limit on iterations within solver, or -1 for no limit.

In [None]:
# Declare the SVM classifier: cubic polynomial kernel with a hard cap on solver iterations
svm_params = dict(kernel='poly', degree=3, max_iter=300000)
clf = SVC(**svm_params)
# Fit on the training split (the solver stops at max_iter if it has not converged)
clf.fit(X_train, y_train)

Hyper Parameters for LinearRegression:
1. fit_intercept: Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered)
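2. normalize: If True, the regressors X will be normalized before regression by subtracting the mean and dividing by the l2-norm. Note: this parameter was deprecated in scikit-learn 1.0 and removed in 1.2; on newer versions, scale the features with `StandardScaler` before fitting instead.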

In [None]:
# Ordinary least-squares linear regression fitted on the integer class labels.
# Note: this is a regressor (continuous output), not a classifier.
reg = LinearRegression()
reg.fit(X=X_train, y=y_train)

Gaussian Naive Bayes:
The likelihood of the features is assumed to be Gaussian. Parameters are:
1. priors: Prior probabilities of the classes. If specified the priors are not adjusted according to the data.
2. var_smoothing: Portion of the largest variance of all features that is added to variances for calculation stability.

In [None]:
# Gaussian Naive Bayes classifier with default hyper-parameters
nb_model = GaussianNB()
clf = nb_model
# Fit on the training split
clf.fit(X=X_train, y=y_train)

In [None]:
# Task 3 Implement KNN using sklearn
# Show results for using both Euclidean distance and Manhattan Distance metric for the KNN classifier.

In [None]:
# Task 4 Implement two other classifiers of your choice

## Evaluate the Model

Now we predict using our trained model on the test set we created and evaluate our model on unseen data.
The performance will be reflected in various standard metrics.

In [None]:
# Predict class labels for the held-out test samples.
# `clf` is whichever classifier cell was executed most recently, because every
# classifier training cell above assigns its model to the same `clf` name.
y_pred = clf.predict(X_test)

In [None]:
# Score the predictions against the held-out labels.
# Precision, recall and F1 all use the same 'macro' averaging (unweighted mean
# over classes) so the three numbers are directly comparable. With 'micro'
# averaging, single-label multiclass precision and recall both equal accuracy,
# so they added no information beyond the accuracy score.
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='macro')

In [None]:
# Report each metric on its own line as "<name>: <value>"
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Recall", recall),
    ("Precision", precision),
    ("F1-score", f1),
):
    print(f"{metric_name}: {metric_value}")

## Visualization of Results

In [None]:
## Task 5: Insert the code for visualizing the confusion matrix here
# store the confusion matrix in the variable cm (shape: n_classes x n_classes, i.e. 3x3 for the Iris data)

In [None]:
# Draw the confusion matrix `cm` as an annotated heatmap.
# seaborn is never imported in this notebook, so the former `sns.*` calls
# failed with NameError; matplotlib (already imported as plt) is used instead.
# NOTE: `cm` must be created in the Task 5 cell before running this cell.
df_cm = pd.DataFrame(cm, range(cm.shape[0]), range(cm.shape[1]))

fig, ax = plt.subplots()
im = ax.imshow(df_cm.values, cmap='viridis')
fig.colorbar(im, ax=ax)

# One tick per class index on both axes
ax.set_xticks(range(df_cm.shape[1]))
ax.set_yticks(range(df_cm.shape[0]))
ax.set_xticklabels(df_cm.columns)
ax.set_yticklabels(df_cm.index)
# Assumes cm follows the sklearn.metrics.confusion_matrix layout
# (rows = true class, columns = predicted class) - adjust the labels if it was built differently.
ax.set_xlabel('Predicted class')
ax.set_ylabel('True class')
ax.set_title('Confusion matrix')

# Write each count inside its cell; pick a text colour that contrasts with the cell colour
midpoint = df_cm.values.max() / 2
for row in range(df_cm.shape[0]):
    for col in range(df_cm.shape[1]):
        count = df_cm.iat[row, col]
        ax.text(col, row, str(count), ha='center', va='center', fontsize=16,
                color='black' if count > midpoint else 'white')
plt.show()

## Comparing results with k-fold validation

In [None]:
# Implement K-fold validation and compare the performance with the 80-20 random split (using sklearn methods)