# Packages

### install packages

In [1]:
# %pip install numpy
# %pip install scikit-learn
# %pip install nltk
# %pip install matplotlib

### import packages

In [2]:
import os
import json
import collections

import numpy as np
import sklearn
from sklearn.preprocessing import StandardScaler
from  sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score

import nltk
from nltk import NaiveBayesClassifier
from nltk import f_measure, precision

# Download NLTK's 'popular' data collection into the local nltk_data directory.
# NOTE(review): the cells visible in this notebook only use NaiveBayesClassifier,
# precision and f_measure - confirm whether this (large) download is actually needed.
nltk.download('popular')

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /home/hisham/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /home/hisham/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /home/hisham/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /home/hisham/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /home/hisham/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /home/hisham/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package name

True

# Loading Data

### loading tfidf-encoded datasets, stemmed tweets (target datasets) and vocabs

In [4]:
def _load_json(path):
  """Read the JSON file at `path` (decoded as UTF-8) and return the parsed object."""
  # The encoding is given explicitly so all three files are decoded the same way
  # on every platform instead of depending on the locale's default encoding.
  with open(path, 'r', encoding='utf-8') as f:
    return json.load(f)


# Locations of the pre-computed artefacts, relative to the current working directory.
data_folder = os.path.join(os.curdir, 'sample_data')
tfidf_encoded_datasets_path = os.path.join(data_folder,'encoding_matrix.json')
stemmed_tweets_path = os.path.join(data_folder,'stemmed_tweets.json')
vocabs_path = os.path.join(data_folder,'vocabs.json')

tfidf_encoded_matrix = _load_json(tfidf_encoded_datasets_path)
stemmed_tweets = _load_json(stemmed_tweets_path)
vocabs = _load_json(vocabs_path)

# input dataset for k-neighbors
train_encoded_input = tfidf_encoded_matrix["train_data_encoding_matrix"]
test_encoded_input = tfidf_encoded_matrix["test_data_encoding_matrix"]

# input dataset for naive bayes
all_vocab_words = vocabs['vocab_to_index'].keys()
train_tweet_input = stemmed_tweets["train_input"]
test_tweet_input = stemmed_tweets["test_input"]

# outputs for both
train_output = stemmed_tweets["train_output"]
test_output = stemmed_tweets["test_output"]

# Both models pair their inputs with these labels position by position, so fail
# early (with a clear message) if the files are out of sync with each other.
assert len(train_encoded_input) == len(train_output), "train encoding matrix / labels length mismatch"
assert len(test_encoded_input) == len(test_output), "test encoding matrix / labels length mismatch"
assert len(train_tweet_input) == len(train_output), "train tweets / labels length mismatch"
assert len(test_tweet_input) == len(test_output), "test tweets / labels length mismatch"

# K-nearest neighbors

### Modeling

In [5]:
# Number of neighbours that vote on each prediction.
n_neigbors = 5

# Standardise the encoded features; the scaler is fitted on the training split
# only and then re-used unchanged for the test split.
scaler = StandardScaler()
x_train = scaler.fit_transform(train_encoded_input)
x_test = scaler.transform(test_encoded_input)

# Fit the classifier on the scaled training vectors and their labels.
k_neighbors_classifier = KNeighborsClassifier(n_neighbors=n_neigbors)
k_neighbors_classifier.fit(x_train, train_output)

### Evaluation

##### prediction

In [6]:
# Predict a label for every scaled test vector with the fitted KNN model.
# NOTE: `test_pred` is reassigned by the Naive Bayes evaluation further down, so
# the KNN metric cells below must be run before that cell.
test_pred = k_neighbors_classifier.predict(x_test)

##### classification report

In [7]:
# Per-class precision / recall / f1-score / support of `test_pred` against the true test labels.
print(classification_report(test_output,test_pred))

              precision    recall  f1-score   support

         neg       0.69      0.71      0.70      5735
         pos       0.70      0.67      0.69      5691

    accuracy                           0.69     11426
   macro avg       0.69      0.69      0.69     11426
weighted avg       0.69      0.69      0.69     11426



##### confusion matrix

In [8]:
# Confusion matrix of true labels (first argument, rows) vs. predictions (second argument, columns).
print(confusion_matrix(test_output,test_pred))

[[4089 1646]
 [1862 3829]]


##### f1 score

In [9]:
# Report the F1 score twice, treating each class in turn as the positive label.
for caption, target_class in (("f1 score for neg is", 'neg'), ("f1 score for pos is", 'pos')):
  print(caption, f1_score(test_output, test_pred, pos_label=target_class))

f1 score for neg is 0.699811740544241
f1 score for pos is 0.6858319899695503


# Naive Bayes

### Modeling

In [10]:
def find_features(tweet:str, index:int=None, vocabulary=None):
  """Build the bag-of-words presence features of one tweet.

  Args:
    tweet: the tweet text; it is tokenised by splitting on single spaces.
    index: position of the tweet in its dataset. It is not used by the
      computation and is accepted only so existing calls keep working.
    vocabulary: iterable of vocabulary words to test for. Defaults to the
      notebook-level `all_vocab_words` when omitted.

  Returns:
    A dict mapping every vocabulary word to True if it occurs as a whole
    token of the tweet and to False otherwise.
  """
  if vocabulary is None:
    vocabulary = all_vocab_words
  # A set makes each membership test O(1) while scanning the whole vocabulary.
  words = set(tweet.split(" "))
  return {vocab: (vocab in words) for vocab in vocabulary}
# Pair each training tweet's feature dict with the label at the same position,
# then train the NLTK Naive Bayes classifier on those (features, label) tuples.
train_feature_set_list = [
  (find_features(tweet, position), train_output[position])
  for position, tweet in enumerate(train_tweet_input)
]
naive_bayes_classifier = NaiveBayesClassifier.train(train_feature_set_list)

### Evaluation

In [11]:
# Featurise every test tweet the same way as the training tweets, then classify them all.
test_feature_list = [
  find_features(tweet, position)
  for position, tweet in enumerate(test_tweet_input)
]
test_pred = naive_bayes_classifier.classify_many(test_feature_list)

In [19]:
# For each label, collect the indices of the test items that truly carry it
# (ref_sets) and of the items the classifier assigned it to (test_sets).
ref_sets = collections.defaultdict(set)
test_sets = collections.defaultdict(set)

for position, gold_label in enumerate(test_output):
    ref_sets[gold_label].add(position)
    predicted_label = test_pred[position]
    test_sets[predicted_label].add(position)

##### precision

In [20]:
# Precision of each class, computed from the reference and predicted index sets.
for caption, target_class in (('pos precision: ', 'pos'), ('neg precision: ', 'neg')):
    print(caption, precision(ref_sets[target_class], test_sets[target_class]))

pos precision:  0.9047171584288668
neg precision:  0.8563472563472564


##### f1 score

In [21]:
# F-measure of each class, computed from the reference and predicted index sets.
for caption, target_class in (('positive f-score:', 'pos'), ('negative f-score:', 'neg')):
    print(caption, f_measure(ref_sets[target_class], test_sets[target_class]))

positive f-score: 0.874318924809299
negative f-score: 0.8831081081081081
