# Gzip is all you need

In [24]:
import gzip
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import NuSVC
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

### Reading the data

In [2]:
data = pd.read_csv(r"C:\Users\123na\Downloads\Gzip_is_all_you_need\combined_reviews.csv")
data = data.sample(n = len(data))
data.drop(labels = ["source"], axis = 1, inplace = True)

data.head()

Unnamed: 0,text,labels
763,This particular model would not work with my M...,0
397,Great product and price.,1
2109,Sooooo good!!,1
386,Poor sound quality.,0
1657,Macbeth (Jason Connery) moved me to tears with...,1


In [3]:
train_x, train_y, test_x, test_y = data["text"].values[:int(0.8*len(data))], data["labels"].values[:int(0.8*len(data))], data["text"].values[int(0.8*len(data)):], data["labels"].values[int(0.8*len(data)):]

In [4]:
for _ in range(5):
    idx = np.random.randint(0, 500)
    print(f"The text is : {train_x[idx]} \nThe corresponding label is : {train_y[idx]} \n\n")

The text is : This movie suffered because of the writing, it needed more suspense.   
The corresponding label is : 0 


The text is : They also have the best cheese crisp in town. 
The corresponding label is : 1 


The text is : Full of unconvincing cardboard characters it is blandly written by Edward Chodorov, who also produced, and is surprisingly directed by Jean Negulesco from whom one would expect a great deal more.   
The corresponding label is : 0 


The text is : All three broke within two months of use. 
The corresponding label is : 0 


The text is : I really like this product over the Motorola because it is allot clearer on the ear piece and the mic. 
The corresponding label is : 1 




### Compressing the data using Gzip and creating NCD distances

In [5]:
# NCD function
def NCD(string1, string2):
    x1 = len(gzip.compress(string1.encode()))
    x2 = len(gzip.compress(string2.encode()))
    x1_x2 = len(gzip.compress((string1 + string2).encode()))
    return (x1_x2 - min(x1, x2))/(max(x1, x2))

In [6]:
train_ncd = [[NCD(i, j) for j in train_x] for i in train_x]

In [7]:
test_ncd = [[NCD(i, j) for j in train_x] for i in test_x]

### Testing various models

In [10]:
model_list_knn = []
accuracies_knn = []
for i in range(10):
    model = KNeighborsClassifier(n_neighbors = i + 1)
    model.fit(train_ncd, train_y)
    score = model.score(test_ncd, test_y)
    accuracies_knn.append(score)
    model_list_knn.append(model)
    print(f"For num_neighbours = {i + 1} \nThe accuracy is {score:.3f}\n\n")

For num_neighbours = 1 
The accuracy is 0.560


For num_neighbours = 2 
The accuracy is 0.598


For num_neighbours = 3 
The accuracy is 0.603


For num_neighbours = 4 
The accuracy is 0.608


For num_neighbours = 5 
The accuracy is 0.602


For num_neighbours = 6 
The accuracy is 0.602


For num_neighbours = 7 
The accuracy is 0.603


For num_neighbours = 8 
The accuracy is 0.582


For num_neighbours = 9 
The accuracy is 0.568


For num_neighbours = 10 
The accuracy is 0.575




In [12]:
model_list_nuSVC_poly = []
accuracies_svm_poly = []
for i in range(5):
    model = NuSVC(kernel = 'poly', degree = i + 1)
    model.fit(train_ncd, train_y)
    score = model.score(test_ncd, test_y)
    accuracies_svm_poly.append(score)
    model_list_nuSVC_poly.append(model)
    print(f"For degree = {i + 1} \nThe accuracy is {score:.3f}\n\n")

For degree = 1 
The accuracy is 0.733


For degree = 2 
The accuracy is 0.742


For degree = 3 
The accuracy is 0.740


For degree = 4 
The accuracy is 0.735


For degree = 5 
The accuracy is 0.733




In [26]:
model_list_tree = []
accuracies_tree = []
for i in range(10):
    model = ExtraTreeClassifier(max_depth = i + 5)
    model.fit(train_ncd, train_y)
    score = model.score(test_ncd, test_y)
    accuracies_tree.append(score)
    model_list_tree.append(model)
    print(f"For depth = {i + 1} \nThe accuracy is {score:.3f}\n\n")

For depth = 1 
The accuracy is 0.532


For depth = 2 
The accuracy is 0.512


For depth = 3 
The accuracy is 0.557


For depth = 4 
The accuracy is 0.565


For depth = 5 
The accuracy is 0.555


For depth = 6 
The accuracy is 0.532


For depth = 7 
The accuracy is 0.535


For depth = 8 
The accuracy is 0.523


For depth = 9 
The accuracy is 0.503


For depth = 10 
The accuracy is 0.540




In [25]:
model_list_forest = []
accuracies_forest = []
for i in range(10):
    model = RandomForestClassifier(max_depth = i + 1, n_estimators = 100)
    model.fit(train_ncd, train_y)
    score = model.score(test_ncd, test_y)
    accuracies_forest.append(score)
    model_list_forest.append(model)
    print(f"For depth = {i + 1} \nThe accuracy is {score:.3f}\n\n")

For depth = 1 
The accuracy is 0.538


For depth = 2 
The accuracy is 0.597


For depth = 3 
The accuracy is 0.608


For depth = 4 
The accuracy is 0.627


For depth = 5 
The accuracy is 0.640


For depth = 6 
The accuracy is 0.622


For depth = 7 
The accuracy is 0.623


For depth = 8 
The accuracy is 0.632


For depth = 9 
The accuracy is 0.635


For depth = 10 
The accuracy is 0.643


