<a href="https://colab.research.google.com/github/matloob-altaf/Machine-Learning-LUMS/blob/main/Programming_Assignment_1/21100164.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Load Datasets

In [None]:
# Mount Google Drive at /content/drive so that the dataset paths used by the
# cells below resolve. `google.colab` is supplied by the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [4]:
import re
import string
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist

In [None]:
# Load the train and test CSV files from the mounted Drive folder into DataFrames.
df_train = pd.read_csv("/content/drive/My Drive/Colab Notebooks/ML-PA1/train.csv")
df_test = pd.read_csv("/content/drive/My Drive/Colab Notebooks/ML-PA1/test.csv")

# Preview the first 10 training rows (last expression of the cell is displayed).
df_train.head(10)

In [None]:
# Sanity check: (rows, columns) of each loaded DataFrame.
print("Train data dimentions", df_train.shape)
print("Test data dimentions", df_test.shape)


## Preprocessing
*removing the stop words, punctuation marks and other unwanted characters from the tweets and converting them to lower case*

In [None]:
# Read the stop-word list (one word per line, trailing whitespace stripped)
stopWords = []
with open('/content/drive/My Drive/Colab Notebooks/ML-PA1/stop_words.txt', 'r') as fi:
    stopWords.extend(raw.rstrip() for raw in fi)

# Extra contractions and greetings appended to the words read from the file
stopWords.extend([
    "can't", 'cannot', 'could', "he'd", "he'll", "he's", "here's", "how's",
    "i'd", "i'll", "i've", "let's", 'ought', "she'd", "she'll", "that's",
    "there's", "they'd", "they'll", "they're", "they've", "we'd", "we'll",
    "we're", "we've", "what's", "when's", "where's", "who's", "why's",
    'would', 'hey', 'hi',
])

print(stopWords)
print(len(stopWords))

In [8]:
#Removes unwanted characters from the dataset
def removeUnwantedChars(tweet, stop_words=None):
  """Normalise a raw tweet into a lower-case, space-separated string of words.

  Steps, in order: drop URLs, drop @handles, lower-case, drop stop words,
  drop digits, drop punctuation, strip surrounding whitespace.

  Args:
    tweet: raw tweet text.
    stop_words: collection of lower-case words to remove. Defaults to the
      notebook-level ``stopWords`` list, looked up at call time.

  Returns:
    The cleaned tweet as a string (may be empty).

  Note:
    Stop words are matched on whitespace-separated tokens *before*
    punctuation is stripped, so a token such as ``"the,"`` is not recognised
    as the stop word ``"the"``. Runs of spaces left by removed digits or
    punctuation are not collapsed; later ``str.split()`` calls ignore them.
  """
  if stop_words is None:
    stop_words = stopWords
  # Removes urls
  result = re.sub(r'http\S+', '', tweet)
  # Removes the twitter handle (raw string: '\w' is not a valid str escape)
  result = re.sub(r'@\w+', '', result)
  #  Converts to lower
  result = result.lower()
  # Removes stop words
  result = ' '.join(word for word in result.split() if word not in stop_words)
  # Removes numbers
  result = re.sub(r'\d+', '', result)
  # Removes punctuation (anything that is neither a word character nor whitespace)
  result = re.sub(r'[^\w\s]', '', result)
  # Removes leading and trailing white spaces
  return result.strip()

In [None]:
# Clean every raw training tweet with removeUnwantedChars and keep the result
# in a new 'preprocessedTweets' column next to the original text.
df_train['preprocessedTweets'] = df_train['Tweet'].apply(removeUnwantedChars)
df_train.head()

In [10]:
# Same cleaning for the test tweets, stored in a new 'preprocessedTweets' column.
df_test['preprocessedTweets'] = df_test['Tweet'].apply(removeUnwantedChars)
df_test.head()

Unnamed: 0,Sentiment,Tweet,preprocessedTweets
0,neutral,@VirginAmerica to jump into the Dallas-Austin ...,jump dallasaustin market
1,positive,"@AmericanAir Chicago seen from seat 6A, AA 162...",chicago seen seat a aa far great ride pdx
2,negative,@united you need a bag bouncer. Get it together,need bag bouncer get together
3,negative,"@JetBlue Hey Jetblue, you stranded an entire p...",jetblue stranded entire plane supposed go jfk ...
4,negative,@USAirways Big fail on not having curbside bag...,big fail curbside baggage pittsburgh charge d...


## Feature Extraction
*representing each tweet as a bag-of-words (BoW), that is, an unordered set of words with their position ignored, keeping only their frequency in the tweet.*

In [11]:
# computes a vector containing the count of each vocabulary word in the tweet
def vectorize(tweet, vocab=None):
    """Turn a preprocessed tweet into its bag-of-words count vector.

    Args:
        tweet: whitespace-separated words.
        vocab: ordered sequence of words defining the vector positions.
            Defaults to the notebook-level ``vocabulary`` list, looked up at
            call time.

    Returns:
        1-D ``numpy`` array with one entry per vocabulary word holding how
        many times that word occurs in ``tweet`` (0 for absent words).
    """
    if vocab is None:
        vocab = vocabulary
    # Count the tokens once instead of rescanning the tweet for every
    # vocabulary word; a Counter returns 0 for words it has not seen.
    counts = Counter(tweet.split())
    return np.asarray([counts[w] for w in vocab])
# collects the not-yet-seen words of a tweet into the global vocabulary
def unique(tweet):
    """Append every word of ``tweet`` that is not yet in the global
    ``vocabulary`` list, preserving first-seen order. Returns ``None``."""
    for word in tweet.split():
        if word in vocabulary:
            continue
        vocabulary.append(word)

In [12]:
# Start from an empty vocabulary each time this cell runs.
vocabulary = [] 
# Build the vocabulary from the training tweets only. apply() is used purely
# for unique()'s side effect of appending to `vocabulary`; its result is discarded.
df_train['preprocessedTweets'].apply(unique)
# NOTE(review): this prints the whole word list, which makes for a very long output.
print(vocabulary)



In [13]:
# Number of distinct words, i.e. the length of every vector produced by vectorize.
len(vocabulary)

11106

## Part 1 - KNN from Scratch

In [14]:
# Apply vectorize to every preprocessed training tweet; the result is a Series
# whose elements are the per-tweet word-count arrays.
X_train = df_train['preprocessedTweets'].apply(vectorize)

In [15]:
# Same vectorization for the test tweets, against the vocabulary built from the training set.
X_test = df_test['preprocessedTweets'].apply(vectorize)

In [16]:
# Encode a sentiment string as an integer class id
def trans(sentiment):
  """Map "neutral" to 1 and "positive" to 2; every other value (including
  "negative") maps to 3."""
  return 1 if sentiment == "neutral" else (2 if sentiment == "positive" else 3)

In [17]:
# Stack the per-tweet count vectors into 2-D arrays (one row per tweet) so the
# distance computation can be vectorized.
X_train = np.stack(X_train)
X_test = np.stack(X_test)
# Encode the sentiment strings as integers with trans
# (neutral -> 1, positive -> 2, anything else -> 3).
Y_train = np.asarray(df_train['Sentiment'].apply(trans))
Y_test = np.asarray(df_test['Sentiment'].apply(trans))

In [None]:
# Expected to be (number of training tweets, vocabulary size).
print(X_train.shape)

In [19]:
# Calculates distances
def distances(train, test):
    """Euclidean distance between every test row and every train row.

    Uses the expansion ``||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b`` so the whole
    matrix is computed with one matrix product instead of nested loops.

    Args:
        train: 2-D array-like, one sample per row.
        test: 2-D array-like with the same number of columns as ``train``.

    Returns:
        Array of shape ``(len(test), len(train))`` where entry ``[i, j]`` is
        the distance from ``test[i]`` to ``train[j]``.
    """
    train = np.asarray(train)
    test = np.asarray(test)
    train_sq = np.sum(np.square(train), axis=1)
    test_sq = np.sum(np.square(test), axis=1)[:, np.newaxis]
    sq_dists = -2 * np.dot(test, train.T) + train_sq + test_sq
    # Floating-point cancellation can leave a tiny negative value for
    # (near-)identical rows; clamp it so sqrt never yields NaN.
    return np.sqrt(np.maximum(sq_dists, 0))

In [20]:
# Majority-vote k-NN prediction from a precomputed distance matrix
def predictLabels(y_train, dists, k=1):
    """Predict one label per row of ``dists``.

    ``dists[i, j]`` is the distance from test point ``i`` to training point
    ``j`` and ``y_train`` holds the non-negative integer labels of the
    training points. For each test point the labels of its ``k`` closest
    training points are tallied and the most frequent one wins; on a tie the
    smallest label is chosen. The result is a float array.
    """
    n_test = dists.shape[0]
    y_pred = np.zeros(n_test)
    for row, row_dists in enumerate(dists):
        nearest = np.argsort(row_dists)[:k]
        votes = np.bincount(y_train[nearest])
        y_pred[row] = votes.argmax()
    return y_pred

In [None]:
# Pairwise distances: row i holds the distance from test tweet i to every training tweet.
dists = distances(X_train, X_test)
print(dists)

In [22]:
# calculate a confusion matrix
def confusion_matrix(actual, predicted):
    """Count (predicted, actual) label pairs.

    Args:
        actual: sequence of true labels.
        predicted: sequence of predicted labels, same length as ``actual``.

    Returns:
        ``(unique, matrix)`` where ``unique`` is the set of labels seen in
        either sequence and ``matrix`` is a square list of lists in which
        ``matrix[p][a]`` is the number of samples whose predicted label has
        index ``p`` and whose actual label has index ``a``. Indices follow the
        sorted order of the labels. Note the orientation: rows are
        *predicted*, columns are *actual*.
    """
    # Include predicted labels too: a class that is predicted but never
    # occurs in `actual` would otherwise raise KeyError below.
    unique = set(actual) | set(predicted)
    # Sort so that index i always refers to the i-th smallest label instead of
    # depending on set iteration order.
    lookup = {value: i for i, value in enumerate(sorted(unique))}
    size = len(lookup)
    matrix = [[0] * size for _ in range(size)]
    for i in range(len(actual)):
        x = lookup[actual[i]]
        y = lookup[predicted[i]]
        matrix[y][x] += 1
    return unique, matrix
# calculate precision, recall, and f1 score
def precision_recall_f1score(matrix, y_test):
  """Per-class precision, recall and F1 from a confusion matrix.

  Args:
    matrix: square list of lists with ``matrix[p][a]`` = number of samples
      predicted as class index ``p`` whose actual class index is ``a``
      (rows are predicted, columns are actual).
    y_test: kept so existing callers keep working; not read, because the
      column sums of ``matrix`` already give the number of actual samples
      per class.

  Returns:
    ``(precision, recall, f1Score)``: three lists with one float per class,
    in matrix index order. A score whose denominator is zero (class never
    predicted, class never present, or precision + recall == 0) is reported
    as 0.0 instead of raising ZeroDivisionError.
  """
  precision = []
  recall = []
  f1Score = []
  for i in range(len(matrix)):
    truePositive = matrix[i][i]
    # Row sum: everything predicted as class i (works for any class count).
    totalPositive = sum(matrix[i])
    # Column sum: everything that actually is class i.
    actualPositive = sum(row[i] for row in matrix)
    p = truePositive / totalPositive if totalPositive else 0.0
    r = truePositive / actualPositive if actualPositive else 0.0
    precision.append(p)
    recall.append(r)
    # f1 Score
    f1Score.append(2 * (p * r) / (p + r) if (p + r) else 0.0)

  return precision, recall, f1Score


In [None]:
K = [1,3,5,7,10]
# One entry per value of K: accuracy in percent, and per-class lists for the other metrics
accuracies = []
precisions = []
recalls = []
f1scores = []
# predict labels and compute classification data for each value of K
for k in K:
  Y_pred = predictLabels(Y_train, dists, k)
  accuracy = (float(np.sum(Y_pred == Y_test)) / Y_test.shape[0])*100
  # The label set is not needed here. It is bound to `_labels` rather than
  # `unique` so that the unique() function used to build the vocabulary is not
  # overwritten (which would break re-running that cell).
  _labels, matrix = confusion_matrix(Y_test, Y_pred)
  prec, rec, f1Sco = precision_recall_f1score(matrix, Y_test)
  accuracies.append(accuracy)
  precisions.append(prec)
  recalls.append(rec)
  f1scores.append(f1Sco)
  # Print (precision / recall / F1 are averaged over the classes, unweighted)
  print("K = ", k)
  print('Accuracy: %f %%' % accuracy)
  print("Precision: ", np.average(prec))
  print("Recall:    ", np.average(rec))
  print("F1Score:   ", np.average(f1Sco))
  print("Confusion Matrix", matrix)

  print("_____________________________________________________________________________________________\n")

### Plots

In [None]:
# Bar chart of the accuracy (in percent) obtained for each value of K.
# Bars are placed at the K values themselves, so they are not evenly spaced.
plt.bar(K, accuracies, align='center', alpha=0.5)
plt.xticks(K)
plt.ylabel('Accuracy')
plt.xlabel('Value of K')
plt.title('Accuracy Comparison')
plt.show()

In [None]:
# Bar chart of precision per K; `precisions` holds one per-class list per K,
# so axis=1 averages over the classes (unweighted).
plt.bar(K, np.average(precisions, axis=1), align='center', alpha=0.5)
plt.xticks(K)
plt.ylabel('Precision')
plt.xlabel('Value of K')
plt.title('Precision Comparison')

plt.show()

In [None]:
# Bar chart of recall per K, averaged over the classes (unweighted).
plt.bar(K, np.average(recalls, axis=1), align='center', alpha=0.5)
plt.xticks(K)
plt.ylabel('Recall')
plt.xlabel('Value of K')
plt.title('Recall Comparison')

plt.show()

In [None]:
# Bar chart of F1 score per K, averaged over the classes (unweighted).
plt.bar(K, np.average(f1scores, axis=1), align='center', alpha=0.5)
plt.xticks(K)
plt.ylabel('F1 Score')
plt.xlabel('Value of K')
plt.title('F1 Score Comparison')

plt.show()

## Part 2 - scikit-learn’s kNN

In [28]:
# import libraries for scikit learn's knn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [31]:
# KNN returns classification data for each value of k
def KNN(k, x_train, x_test, y_train, y_test):
  """Fit scikit-learn's k-NN classifier and evaluate it.

  Args:
    k: number of neighbours.
    x_train, y_train: training features and labels.
    x_test, y_test: test features and labels.

  Returns:
    ``(train_accuracy, test_accuracy, sk_matrix, sk_prec, sk_rec, sk_f1sco)``.
    Both accuracies are fractions in [0, 1], not percentages. ``sk_matrix`` is
    the output of ``multilabel_confusion_matrix`` (one 2x2 matrix per class),
    and precision, recall and F1 are macro-averaged over the classes.
  """
  #Setup a knn classifier with k neighbors
  knn = KNeighborsClassifier(n_neighbors=k)
  #Fit the model
  knn.fit(x_train, y_train)
  #Compute accuracy on the training set
  train_accuracy = knn.score(x_train, y_train)
  #Make predictions once and reuse them for every test metric; calling
  #knn.score(x_test, y_test) would run the neighbour search a second time.
  y_pred = knn.predict(x_test)
  #Compute accuracy on the test set (fraction of matching labels)
  test_accuracy = float(np.mean(y_pred == np.asarray(y_test)))
  #Compute confusion matrix
  sk_matrix = multilabel_confusion_matrix(y_test, y_pred)
  #Compute Precision
  sk_prec = precision_score(y_test, y_pred, average='macro')
  #Compute recall
  sk_rec = recall_score(y_test, y_pred, average='macro')
  #Compute F1 Score
  sk_f1sco = f1_score(y_test, y_pred, average='macro')

  return train_accuracy, test_accuracy, sk_matrix, sk_prec, sk_rec, sk_f1sco

In [None]:
K = [1,3,5,7,10]
# store computed data (one slot per value of K; accuracies are fractions in [0, 1])
train_accuracies =np.empty(len(K))
test_accuracies = np.empty(len(K))
sk_precisions = np.empty(len(K))
sk_recalls = np.empty(len(K))
sk_f1scores = np.empty(len(K))
# compute knn for each k
for i ,k in enumerate(K):
    #Compute
    train_accuracies[i], test_accuracies[i], sk_matrix, sk_precisions[i], sk_recalls[i], sk_f1scores[i] = KNN(k ,X_train, X_test, Y_train, Y_test)

    print("K = ", k)
    # KNN returns fractions; scale by 100 so the number matches the '%' sign
    print('Train Accuracy: %f %%' % (train_accuracies[i] * 100))
    print('Test Accuracy: %f %%' % (test_accuracies[i] * 100))
    print("Precision: ", sk_precisions[i])
    print("Recall:    ", sk_recalls[i])
    print("F1Score:   ", sk_f1scores[i])
    print("Confusion Matrix\n", sk_matrix)

    print("_____________________________________________________________________________________________\n")

### Plots

In [None]:
# Bar chart of training-set accuracy per K. The values are the fractions
# (0 to 1) returned by KNN, not percentages.
plt.bar(K, train_accuracies, align='center', alpha=0.5)
plt.xticks(K)
plt.ylabel('Train Accuracy')
plt.xlabel('Value of K')
plt.title('Train Accuracy Comparison')
plt.show()

In [None]:
# Bar chart of test-set accuracy per K (fractions from 0 to 1, as returned by KNN).
plt.bar(K, test_accuracies, align='center', alpha=0.5)
plt.xticks(K)
plt.ylabel('Test Accuracy')
plt.xlabel('Value of K')
plt.title('Test Accuracy Comparison')
plt.show()

In [None]:
# Bar chart of macro-averaged precision per K (already averaged inside KNN).
plt.bar(K, sk_precisions, align='center', alpha=0.5)
plt.xticks(K)
plt.ylabel('Precision')
plt.xlabel('Value of K')
plt.title('Precision Comparison')
plt.show()

In [None]:
# Bar chart of macro-averaged recall per K (already averaged inside KNN).
plt.bar(K, sk_recalls, align='center', alpha=0.5)
plt.xticks(K)
plt.ylabel('Recall')
plt.xlabel('Value of K')
plt.title('Recall Comparison')
plt.show()

In [None]:
# Bar chart of macro-averaged F1 score per K (already averaged inside KNN).
plt.bar(K, sk_f1scores, align='center', alpha=0.5)
plt.xticks(K)
plt.ylabel('F1 Score')
plt.xlabel('Value of K')
plt.title('F1 Score Comparison')
plt.show()

## Part 3 - Word2Vec

In [None]:
# import genism for part 3
from gensim.models import KeyedVectors

In [None]:
# Load the pre-trained GoogleNews vectors (binary word2vec format, gzip-compressed)
# from Drive into a gensim KeyedVectors object.
word2vec = KeyedVectors.load_word2vec_format("/content/drive/My Drive/Colab Notebooks/ML-PA1/GoogleNews-vectors-negative300.bin.gz", binary=True)

In [None]:
# Feature extractor
def extract_features(text, model=None):
  """Represent ``text`` by the mean embedding of its known words.

  Args:
    text: whitespace-separated words.
    model: embedding lookup supporting ``word in model`` and
      ``model[list_of_words]`` (e.g. gensim ``KeyedVectors``). Defaults to
      the notebook-level ``word2vec`` object, looked up at call time.

  Returns:
    1-D array holding the element-wise mean of the vectors of all words found
    in ``model``, or an empty array when no word of ``text`` is known.
  """
  if model is None:
    model = word2vec
  # `word in model` instead of `word in model.vocab`: the `.vocab` attribute was
  # removed in gensim 4, while membership tests work in gensim 3 and 4 alike.
  words = [word for word in text.split() if word in model]
  if len(words) == 0:
    return np.asarray([])
  return np.asarray(np.mean(model[words], axis=0))

In [None]:
# Represent each tweet by the mean word2vec vector of its in-vocabulary words.
# Tweets with no known word yield an empty array; those entries are dropped by
# cleanData in the next cells.
# NOTE(review): features are extracted from the raw 'Tweet' column rather than
# 'preprocessedTweets' - confirm this is intended.
# NOTE: this rebinds X_train / X_test / Y_train / Y_test, which held the
# bag-of-words data used in Parts 1 and 2.
X_train = np.asarray(df_train['Tweet'].apply(extract_features))
X_test = np.asarray(df_test['Tweet'].apply(extract_features))
# apply trans to convert classes from string to int
Y_train = np.asarray(df_train['Sentiment'].apply(trans))
Y_test = np.asarray(df_test['Sentiment'].apply(trans))

In [None]:
# Removes empty entries
def cleanData(x, y):
  """Drop every position whose feature array in ``x`` is empty.

  ``x`` is a 1-D array whose elements are arrays and ``y`` is the matching
  label array. Returns the pair ``(x, y)`` with the entries at the positions
  where ``x[i].size == 0`` removed from both.
  """
  emptyPositions = [idx for idx in range(x.size) if x[idx].size == 0]
  return np.delete(x, emptyPositions), np.delete(y, emptyPositions)

In [None]:
# Remove the tweets (and their labels) for which no word was found in the
# word2vec vocabulary, i.e. whose feature array is empty.
X_train, Y_train = cleanData(X_train, Y_train)
X_test, Y_test = cleanData(X_test, Y_test)   

In [None]:
# Stack the per-tweet mean vectors into 2-D arrays (one row per tweet) so the
# distance computation can be vectorized.
X_train = np.stack(X_train)
# Stack the test *features*; the original stacked Y_train (the training
# labels) here, so every later evaluation ran on labels instead of test vectors.
X_test  = np.stack(X_test)

### Part 1 with Word2Vec

In [None]:
# Pairwise distances on the word2vec features: row i holds the distance from
# test tweet i to every training tweet. Overwrites the `dists` computed in Part 1.
dists = distances(X_train, X_test)
print(dists)

In [None]:
K = [1,3,5,7,10]
# One entry per value of K: accuracy in percent, and per-class lists for the other metrics
accuracies = []
precisions = []
recalls = []
f1scores = []
# predict and compute classification data for each value of k
for k in K:
  Y_pred = predictLabels(Y_train, dists, k)
  accuracy = (float(np.sum(Y_pred == Y_test)) / Y_test.shape[0])*100
  # The label set is not needed here. It is bound to `_labels` rather than
  # `unique` so that the unique() function used to build the vocabulary is not
  # overwritten (which would break re-running that cell).
  _labels, matrix = confusion_matrix(Y_test, Y_pred)
  prec, rec, f1Sco = precision_recall_f1score(matrix, Y_test)
  accuracies.append(accuracy)
  precisions.append(prec)
  recalls.append(rec)
  f1scores.append(f1Sco)
  # Print (precision / recall / F1 are averaged over the classes, unweighted)
  print("Word2Vec Implementation")
  print("K = ", k)
  print('Accuracy: %f %%' % accuracy)
  print("Precision: ", np.average(prec))
  print("Recall:    ", np.average(rec))
  print("F1Score:   ", np.average(f1Sco))
  print("Confusion Matrix", matrix)

  print("_____________________________________________________________________________________________\n")

#### Plots

In [None]:
# Bar chart of the accuracy (in percent) per K, using the values filled in by
# the evaluation loop above.
plt.bar(K, accuracies, align='center', alpha=0.5)
plt.xticks(K)
plt.ylabel('Accuracy')
plt.xlabel('Value of K')
plt.title('Accuracy Comparison')
plt.show()

In [None]:
# Bar chart of precision per K; axis=1 averages the per-class values (unweighted).
plt.bar(K, np.average(precisions, axis=1), align='center', alpha=0.5)
plt.xticks(K)
plt.ylabel('Precision')
plt.xlabel('Value of K')
plt.title('Precision Comparison')

plt.show()

In [None]:
# Bar chart of recall per K; axis=1 averages the per-class values (unweighted).
plt.bar(K, np.average(recalls, axis=1), align='center', alpha=0.5)
plt.xticks(K)
plt.ylabel('Recall')
plt.xlabel('Value of K')
plt.title('Recall Comparison')

plt.show()

In [None]:
# Bar chart of F1 score per K; axis=1 averages the per-class values (unweighted).
plt.bar(K, np.average(f1scores, axis=1), align='center', alpha=0.5)
plt.xticks(K)
plt.ylabel('F1 Score')
plt.xlabel('Value of K')
plt.title('F1 Score Comparison')

plt.show()

### Part 2 with Word2Vec

In [None]:
K = [1,3,5,7,10]
# store computed data (one slot per value of K; accuracies are fractions in [0, 1])
train_accuracies =np.empty(len(K))
test_accuracies = np.empty(len(K))
sk_precisions = np.empty(len(K))
sk_recalls = np.empty(len(K))
sk_f1scores = np.empty(len(K))

# compute scikit-learn's knn on the word2vec features for each k
for i ,k in enumerate(K):
    #Compute
    train_accuracies[i], test_accuracies[i], sk_matrix, sk_precisions[i], sk_recalls[i], sk_f1scores[i] = KNN(k ,X_train, X_test, Y_train, Y_test)

    print("K = ", k)
    # KNN returns fractions; scale by 100 so the number matches the '%' sign
    print('Train Accuracy: %f %%' % (train_accuracies[i] * 100))
    print('Test Accuracy: %f %%' % (test_accuracies[i] * 100))
    print("Precision: ", sk_precisions[i])
    print("Recall:    ", sk_recalls[i])
    print("F1Score:   ", sk_f1scores[i])
    print("Confusion Matrix", sk_matrix)

    print("_____________________________________________________________________________________________\n")

#### Plots

In [None]:
# Bar chart of training-set accuracy per K. The values are the fractions
# (0 to 1) returned by KNN, not percentages.
plt.bar(K, train_accuracies, align='center', alpha=0.5)
plt.xticks(K)
plt.ylabel('Train Accuracy')
plt.xlabel('Value of K')
plt.title('Train Accuracy Comparison')
plt.show()

In [None]:
# Bar chart of test-set accuracy per K (fractions from 0 to 1, as returned by KNN).
plt.bar(K, test_accuracies, align='center', alpha=0.5)
plt.xticks(K)
plt.ylabel('Test Accuracy')
plt.xlabel('Value of K')
plt.title('Test Accuracy Comparison')
plt.show()

In [None]:
# Bar chart of macro-averaged precision per K (already averaged inside KNN).
plt.bar(K, sk_precisions, align='center', alpha=0.5)
plt.xticks(K)
plt.ylabel('Precision')
plt.xlabel('Value of K')
plt.title('Precision Comparison')
plt.show()

In [None]:
# Bar chart of macro-averaged recall per K (already averaged inside KNN).
plt.bar(K, sk_recalls, align='center', alpha=0.5)
plt.xticks(K)
plt.ylabel('Recall')
plt.xlabel('Value of K')
plt.title('Recall Comparison')
plt.show()

In [None]:
# Bar chart of macro-averaged F1 score per K (already averaged inside KNN).
plt.bar(K, sk_f1scores, align='center', alpha=0.5)
plt.xticks(K)
plt.ylabel('F1 Score')
plt.xlabel('Value of K')
plt.title('F1 Score Comparison')
plt.show()