# K Nearest Neighbors with Gzip

An implementation of K Nearest Neighbors using Gzip Normalized Compression Distances as embedding to train it 

## Imports

In [1]:
import gzip
import time
import pickle
import multiprocessing
import warnings
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score



## Load Dataset

In [2]:
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)

In [3]:
# There are a total of 24639 samples
n_samples = 5000

df = pd.read_csv('/kaggle/input/sentiment-and-emotions-of-tweets/sentiment-emotion-labelled_Dell_tweets.csv')
df = df.truncate(0, n_samples)

## Variables definition

In [4]:
X = df['Text']
X.head()

0    @Logitech @apple @Google @Microsoft @Dell @Len...
1    @MK_habit_addict @official_stier @MortalKombat...
2    As @CRN celebrates its 40th anniversary, Bob F...
3    @dell your customer service is horrible especi...
4    @zacokalo @Dell @DellCares @Dell give the man ...
Name: Text, dtype: object

In [5]:
y = df['sentiment']
y.head()

0     neutral
1     neutral
2    positive
3    negative
4     neutral
Name: sentiment, dtype: object

## Data Cleaning and Preparation

In [6]:
X = X.str.replace(r'@[^ ]+', '', regex=True) # Remove tagged users
X = X.str.replace(r'#[^ ]+', '', regex=True) # Remove hashtags
X = X.str.replace(r'http[^ ]+', '', regex=True) # Remove hashtags
X.head()

0           QWERTY were modified for programmers so...
1        She's getting a new   when the one she has...
2    As  its 40th anniversary, Bob Faletra and  wit...
3     your customer service is horrible especially ...
4                       give the man what he paid for!
Name: Text, dtype: object

In [7]:
y_labels = {
    'positive': 1,
    'neutral': 0,
    'negative': -1
}

y = y.map(y_labels) # Map string labels to integers
y.head()

0    0
1    0
2    1
3   -1
4    0
Name: sentiment, dtype: int64

## Train/Test splitting

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=1, 
                                                    stratify=y)

## How to Normalize Compression Distances

In [9]:
X1_compressed = len(gzip.compress(X_train[0].encode()))
X1_compressed

177

In [10]:
X2_compressed = len(gzip.compress(X_train[1].encode()))
X2_compressed

121

In [11]:
XX = len(gzip.compress((" ".join([X_train[0], X_train[1]])).encode()))
XX

255

In [12]:
NCD = (XX - min(X1_compressed, X2_compressed)) / max(X1_compressed, X2_compressed)
NCD

0.7570621468926554

In [13]:
def calculate_ncd(x1, x2):
    X1_compressed = len(gzip.compress(x1.encode()))
    X2_compressed = len(gzip.compress(x2.encode()))  
    XX = len(gzip.compress((" ".join([x1, x2])).encode()))
  
    NCD = (XX - min(X1_compressed, X2_compressed)) / max(X1_compressed, X2_compressed)
    return NCD

In [17]:
def calculate_train_ncd(X_train):
   NCD = [[calculate_ncd(X_train.iloc[i], X_train.iloc[j]) for j in range(len(X_train))] for i in range(len(X_train))]
   return NCD

def calculate_test_ncd(X_test, X_train):
   NCD = [[calculate_ncd(X_test.iloc[i], X_train.iloc[j]) for j in range(len(X_train))] for i in range(len(X_test))]
   return NCD

In [29]:
CPU_CORES = multiprocessing.cpu_count()

with multiprocessing.Pool(CPU_CORES) as pool:
    train_NCD = pool.apply(calculate_train_ncd, [X_train])

with multiprocessing.Pool(CPU_CORES) as pool:
    test_NCD = pool.apply_async(calculate_test_ncd, args=(X_test, X_train))
    test_NCD = test_NCD.get()

## Training

In [32]:
# KNN classification
knn = KNeighborsClassifier(n_neighbors=7) 
knn.fit(train_NCD, y_train)

In [43]:
y_pred = knn.predict(test_NCD)
y_pred

score = accuracy_score(y_test, y_pred, normalize=True)
print('Accuracy: ', score)

Accuracy:  0.6523476523476524
