In [None]:
# Mount Google Drive at /content/drive; the dataset CSV paths and the model
# output paths used in this notebook all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Locations of the training and validation CSV files on the mounted Drive.
# Both files are read with pandas in the next cell and are expected to contain
# a 'label' column (it is read by name further down).
TRAINING_PATH = "/content/drive/MyDrive/1:1_Hanish_Acharla/Dataset/Featurization_Vani/train_augment.csv"
VALIDATION_PATH = "/content/drive/MyDrive/1:1_Hanish_Acharla/Dataset/Featurization_Vani/validation_augment.csv"

In [None]:
import pandas as pd

# Read both splits into DataFrames with the same loader, training first.
training_data, validation_data = map(pd.read_csv, (TRAINING_PATH, VALIDATION_PATH))

In [None]:
# Collect the distinct values of the 'label' column in each split, sort them
# in place and print them so the two class sets can be compared by eye.
# `labels` (training classes) is reused later as the confusion-matrix ticks.
labels = training_data['label'].unique()
valid_labels = validation_data['label'].unique()
for found in (labels, valid_labels):
  found.sort()
  print(found)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart: number of training rows per class.
fig, ax = plt.subplots(figsize=(15, 5))
sns.countplot(data=training_data, x="label", ax=ax)
ax.set_title("Label Distribution for Training Data")
ax.set_xlabel("Category")
ax.set_ylabel("Label count")
plt.show()

In [None]:
# Bar chart: number of validation rows per class.
fig, ax = plt.subplots(figsize=(15, 5))
sns.countplot(data=validation_data, x="label", ax=ax)
ax.set_title("Label Distribution for Validation Data")
ax.set_xlabel("Category")
ax.set_ylabel("Label count")
plt.show()

In [None]:
# Swap each training class name for an integer code. sort=True numbers the
# classes in sorted order; only the codes are written back to the frame.
train_codes, train_classes = pd.factorize(training_data['label'], sort=True)
training_data['label'] = train_codes
training_data.head()

In [None]:
# Encode the validation labels with the *training* class order held in `labels`
# (the sorted unique training labels computed earlier in the notebook) rather
# than factorizing the validation split on its own. Independent factorization
# assigns different integers to the same class whenever the two splits do not
# contain exactly the same set of classes, which silently corrupts every
# validation metric computed afterwards.
# NOTE: run once per data load - after this cell the column holds integer codes,
# not class names, so a second run is rejected by the check below.
valid_codes = pd.Categorical(validation_data['label'], categories=labels).codes
if (valid_codes < 0).any():
  # Categorical uses -1 for values that are missing or not in `categories`.
  raise ValueError(
      "validation_data['label'] contains values that are not training labels "
      "(or this cell was already run on encoded data)")
# Use int64 so the dtype matches what pd.factorize produces for the training codes.
validation_data['label'] = valid_codes.astype('int64')
validation_data.head()

In [None]:
from sklearn.model_selection import train_test_split

# Split each frame into a feature matrix and a target vector. The target is
# selected by its column name instead of "last column", so a change in CSV
# column order cannot leak the label into the features or pick the wrong
# target. When 'label' is the last column the result is unchanged.
x_train = training_data.drop(columns='label').values
x_valid = validation_data.drop(columns='label').values
y_train = training_data['label'].values
y_valid = validation_data['label'].values

# Quick sanity check of the resulting array shapes.
for split_array in (x_train, x_valid, y_train, y_valid):
  print(split_array.shape)

In [None]:
# result/confusion matrix function
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
def model_evaluations(y_true, y_pred, class_names=None):
  """Print accuracy and a classification report, then plot the confusion matrix.

  Args:
    y_true: true class codes; integer codes 0..n_classes-1 are expected, where
      code i corresponds to class_names[i].
    y_pred: predicted class codes, same encoding and length as y_true.
    class_names: optional sequence of display names, one per class, used as
      heatmap tick labels. Defaults to the notebook-level `labels` array.

  Returns:
    None. Output is printed and shown as a figure.
  """
  # Local imports kept from the original so the function does not rely on an
  # earlier plotting cell having been run.
  import matplotlib.pyplot as plt
  import seaborn as sns
  if class_names is None:
    class_names = labels
  # Pin the matrix to one row/column per known class. Without this, a class
  # absent from both y_true and y_pred shrinks the matrix and the tick labels
  # no longer line up with the cells.
  class_ids = list(range(len(class_names)))
  acc_score = accuracy_score(y_true, y_pred)
  print("Accuracy score: {}\n".format(acc_score))
  print("Classification Report: {}".format(classification_report(y_true, y_pred)))
  plt.figure(figsize = (10,10))
  sns.heatmap(confusion_matrix(y_true, y_pred, labels = class_ids),
              annot = True, fmt="g", cmap = "Blues",
              xticklabels = class_names, yticklabels = class_names)
  plt.title("Confusion Matrix")
  # confusion_matrix puts true classes on rows and predictions on columns.
  plt.xlabel("Predicted label")
  plt.ylabel("True label")
  plt.show()

In [None]:
#KNN hyper-parameter sweep
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Neighbour counts to try: 2 through 14 inclusive.
K = list(range(2, 15))
accuracies = []
for k_value in K:
  # Fit on the training split, then score predictions on the validation split.
  main_model = KNeighborsClassifier(n_neighbors=k_value, n_jobs=-1).fit(x_train, y_train)
  y_pred = main_model.predict(x_valid)
  accuracies.append(accuracy_score(y_valid, y_pred))
  print("Neighbour {} experiment done".format(k_value))

In [None]:
# Validation accuracy for each neighbour count in K, in the same order as K.
print(accuracies)

In [None]:
#KNN accuracy graph (accuracies were measured on the validation split)

import numpy as np
import matplotlib.pyplot as plt

# Single curve: accuracy against the number of neighbours tried in the sweep.
fig, ax = plt.subplots()
ax.plot(K, accuracies)
ax.set_xlabel('Number of neighbors')
ax.set_ylabel('Accuracy')
ax.set_title('Accuracies vs K Neighbors')
# Integer ticks from 0 to 15 on the x axis.
ax.set_xticks(list(range(16)))
ax.grid(True)

plt.show()

In [None]:
#best KNN model
# Neighbour count is fixed by hand here; refit on the training split and
# report the validation metrics and confusion matrix.
BEST_K = 13
best_model_one = KNeighborsClassifier(n_jobs=-1, n_neighbors=BEST_K).fit(x_train, y_train)
best_ypred = best_model_one.predict(x_valid)
model_evaluations(y_valid, best_ypred)

In [None]:
#Save KNN
import pickle
# The context manager closes (and flushes) the file even if pickling raises,
# so a failed dump cannot leave an open handle on the Drive mount.
with open("/content/drive/MyDrive/1:1_Hanish_Acharla/Models/KNN_Best_Model", "wb") as fh:
  pickle.dump(best_model_one, fh)

Random Forest

In [None]:
#RF Hyper Parameter tuning
from sklearn.ensemble import RandomForestClassifier

# Fixed seed: random forests are stochastic, so without it the accuracy grid
# (and therefore the "best" setting read off it) changes on every run.
RANDOM_STATE = 42

max_depth = [1,2,3,4,5,6,7]
n_trees = list(range(10,110,10))  # 10, 20, ..., 100 trees
# all_acc[d][t] = validation accuracy for max_depth[d] and n_trees[t].
all_acc = []
for depth in max_depth:
  acc = []
  for tree in n_trees:
    model1 = RandomForestClassifier(n_estimators=tree, max_depth=depth,
                                    n_jobs = -1, random_state=RANDOM_STATE)
    model1.fit(x_train,y_train)
    y_pred = model1.predict(x_valid)
    accuracy = accuracy_score(y_valid, y_pred)
    acc.append(accuracy)
    print("Depth: {} and Tree: {} done".format(depth, tree))
  all_acc.append(acc)

In [None]:
# One inner list per max_depth value; each inner list holds the validation
# accuracy for every entry of n_trees, in order.
print(all_acc)

In [None]:
#RF accuracy plot (accuracies were measured on the validation split)
import numpy as np
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(15, 10))
# One curve per depth setting: accuracy against the number of trees.
for i, depth_value in enumerate(max_depth):
  ax.plot(n_trees, all_acc[i], "*-", label="Depth: {}".format(depth_value))
ax.set_xlabel('Number of Trees')
ax.set_ylabel('Accuracy')
ax.set_title('Accuracies with Respect to Maximum Depth and Number of Trees')
# Ticks at 10, 20, ..., 100.
ax.set_xticks(list(range(10, 110, 10)))
ax.grid(True)

# The legend entries come from the label= given to each curve above.
ax.legend()

plt.show()

In [None]:
#best model
# Refit the hand-picked random-forest setting (depth 7, 100 trees) on the
# training split and report validation metrics. The fixed seed makes the saved
# model and its reported scores reproducible; n_jobs=-1 matches the tuning
# loop and only affects speed.
RANDOM_STATE = 42
best_model_two = RandomForestClassifier(max_depth=7, n_estimators=100,
                                        n_jobs=-1, random_state=RANDOM_STATE)
best_model_two.fit(x_train, y_train)
best_ypred = best_model_two.predict(x_valid)
model_evaluations(y_valid, best_ypred)

In [None]:
#Save best RF Model
import pickle
# The context manager closes (and flushes) the file even if pickling raises,
# so a failed dump cannot leave an open handle on the Drive mount.
with open("/content/drive/MyDrive/1:1_Hanish_Acharla/Models/RF_Best_Model", "wb") as fh:
  pickle.dump(best_model_two, fh)

MLP

In [None]:
#MLP Hyper parameter Tuning
from sklearn.neural_network import MLPClassifier

# Fixed seed: weight initialisation and batch shuffling are random, so without
# it the accuracy grid changes on every run.
RANDOM_STATE = 42

learning_rate = [0.01, 0.05, 0.001, 0.0001, 0.00001]
epochs = list(range(100,200,10))  # 100, 110, ..., 190
# all_acc_mlp[l][e] = validation accuracy for learning_rate[l] and epochs[e].
all_acc_mlp = []
for lr in learning_rate:
  acc_mlp = []
  for epo in epochs:
    # With the default 'adam' solver, max_iter is the number of epochs.
    model1 = MLPClassifier(learning_rate_init=lr, max_iter=epo,
                           random_state=RANDOM_STATE)
    model1.fit(x_train,y_train)
    y_pred = model1.predict(x_valid)
    accuracy = accuracy_score(y_valid, y_pred)
    acc_mlp.append(accuracy)
    # The original message labelled these two values "Depth" and
    # "Learning Rate" (copied from the RF cell); name them correctly.
    print("Learning Rate: {} and Epochs: {} done".format(lr, epo))
  all_acc_mlp.append(acc_mlp)

In [None]:
#MLP accuracy graph (accuracies were measured on the validation split)
import numpy as np
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(15, 10))
# One curve per learning rate: accuracy against the epoch budget.
for i, rate in enumerate(learning_rate):
  ax.plot(epochs, all_acc_mlp[i], "*-", label="Learning Rate: {}".format(rate))
ax.set_xlabel('Number of Epochs')
ax.set_ylabel('Accuracy')
ax.set_title('Accuracies with Respect to Epochs and Learning Rate')
# Ticks at 100, 110, ..., 190.
ax.set_xticks(list(range(100, 200, 10)))
ax.grid(True)

# The legend entries come from the label= given to each curve above.
ax.legend()

plt.show()

In [None]:
#best model
# Refit the hand-picked MLP setting (learning rate 0.001, 160 epochs) on the
# training split and report validation metrics. The fixed seed makes the saved
# model and its reported scores reproducible.
RANDOM_STATE = 42
best_model_three = MLPClassifier(learning_rate_init=0.001, max_iter=160,
                                 random_state=RANDOM_STATE)
best_model_three.fit(x_train, y_train)
best_ypred = best_model_three.predict(x_valid)
model_evaluations(y_valid, best_ypred)

In [None]:
# One inner list per learning_rate value; each inner list holds the validation
# accuracy for every entry of epochs, in order.
print(all_acc_mlp)

In [None]:
#Save MLP
import pickle
# The context manager closes (and flushes) the file even if pickling raises,
# so a failed dump cannot leave an open handle on the Drive mount.
with open("/content/drive/MyDrive/1:1_Hanish_Acharla/Models/MLP_Best_Model", "wb") as fh:
  pickle.dump(best_model_three, fh)