In [1]:
import spacy
import csv
import tqdm
import math
import random
from collections import Counter, defaultdict
import pandas as pd

In [2]:
data = []
with open('yelp_reviews_train.csv', 'r') as csv_file:
    file = csv.reader(csv_file)
    for row in file:
        data.append(row)
random.shuffle(data)

In [3]:
nlp = spacy.load("en_core_web_sm")

In [8]:
results_df = pd.DataFrame(columns=['variation', 'iteration', 'accuracy', 'precision', 'recall', 'F1'])

## No text pre-processing

In [4]:
no_pre = []
for row in tqdm.tqdm(data, desc = 'Getting words'):
    text = row[0]
    label = row[1]
    words = text.lower().split(' ')
    no_pre.append((words, label))

Getting words: 100%|██████████| 8137/8137 [00:00<00:00, 48931.10it/s]


In [22]:
def train_test(data, j, variation = None):
    IDX = list(range(len(data)))
    test_set = []
    train_set = []
    
    for i in IDX:
        if i % 5 == j:
            test_set.append(data[i])
        else:
            train_set.append(data[i])
    
    print(f'Train set N°{j+1}')
    
    priors_count = defaultdict(int)
    likelihood_count = defaultdict(Counter)
    vocabulary = set()
    
    for words, label in tqdm.tqdm(train_set, desc = 'Training'):
        priors_count[label] += 1
        likelihood_count[label].update(words)
        vocabulary.update(words)
    
    vocabulary = list(vocabulary)
    priors_probs = {cls: priors_count[cls] / sum(priors_count.values()) for cls in priors_count}

    likelihood = defaultdict(lambda: defaultdict(float))
    for cls in likelihood_count:
        total_words_in_class = sum(likelihood_count[cls].values())
        denom = total_words_in_class + len(vocabulary)
        for word in vocabulary:
            likelihood[cls][word] = (likelihood_count[cls][word] + 1) / denom

    accuracy = 0
    total_sentences = 0
    tp = 0
    positives = 0
    priors_count_test = defaultdict(int)

    for words, label in tqdm.tqdm(test_set, desc = 'Testing'):
        priors_count_test[label] += 1
        calc_prob = {cls: math.log(priors_probs[cls]) for cls in priors_count}
        for word in words:
            if word in vocabulary:
                for cls in priors_count:
                    calc_prob[cls] += math.log(likelihood[cls][word])

        prediction = 'high' if calc_prob['high'] > calc_prob['low'] else 'low'
        positives += prediction == 'high'
        accuracy += prediction == label
        tp += prediction == 'high' and label == 'high'
        total_sentences += 1

    acc_total = accuracy / total_sentences
    precision = tp / positives if positives > 0 else 0
    recall = tp / priors_count_test['high'] if priors_count_test['high'] > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    print(f'El accuracy total es de {acc_total}')
    print(f'La precision es de {precision}')
    print(f'El recall es de {recall}')
    print(f'El F1 score es de {f1}')
    
    if variation is None:
        print('')
    else:
        results_df.loc[len(results_df)] = [variation, j+1, acc_total, precision, recall, f1]
    return priors_probs, likelihood, vocabulary

In [10]:
for j in range(0, 5):
    train_test(no_pre, j, 'no pre-processing')

Train set N°1


Training: 100%|██████████| 6509/6509 [00:00<00:00, 32942.77it/s]
Testing: 100%|██████████| 1628/1628 [04:12<00:00,  6.45it/s]


El accuracy total es de 0.8372235872235873
La precision es de 0.8378712871287128
El recall es de 0.997789240972734
El F1 score es de 0.9108644466868484
Train set N°2


Training: 100%|██████████| 6509/6509 [00:00<00:00, 33965.88it/s]
Testing: 100%|██████████| 1628/1628 [04:01<00:00,  6.73it/s]


El accuracy total es de 0.8151105651105651
La precision es de 0.8152709359605911
El recall es de 0.999245283018868
El F1 score es de 0.897931502204137
Train set N°3


Training: 100%|██████████| 6510/6510 [00:00<00:00, 29478.87it/s]
Testing: 100%|██████████| 1627/1627 [03:42<00:00,  7.32it/s]


El accuracy total es de 0.828518746158574
La precision es de 0.8300370828182941
El recall es de 0.9970304380103935
El F1 score es de 0.905902192242833
Train set N°4


Training: 100%|██████████| 6510/6510 [00:00<00:00, 36884.18it/s]
Testing: 100%|██████████| 1627/1627 [03:53<00:00,  6.96it/s]


El accuracy total es de 0.8303626306084819
La precision es de 0.8306650246305419
El recall es de 0.9992592592592593
El F1 score es de 0.9071956960322799
Train set N°5


Training: 100%|██████████| 6510/6510 [00:00<00:00, 38072.99it/s]
Testing: 100%|██████████| 1627/1627 [03:58<00:00,  6.81it/s]


El accuracy total es de 0.8149969268592502
La precision es de 0.8147004323656578
El recall es de 0.9992424242424243
El F1 score es de 0.8975842123171146


## Lemmatization

In [11]:
def preprocess_text(text, lemmatize = True, remove_stopwords = False):
    doc = nlp(text.lower())
    if lemmatize:
        if remove_stopwords:
            return [token.lemma_ for token in doc if not token.is_stop]
        return [token.lemma_ for token in doc]

In [12]:
preprocessed_data = []
for row in tqdm.tqdm(data, desc="Preprocessing Data"):
    preprocessed_data.append((preprocess_text(row[0], lemmatize = True, remove_stopwords = False), row[1]))

Preprocessing Data: 100%|██████████| 8137/8137 [03:59<00:00, 33.97it/s]


In [13]:
for j in range(0, 5):
    train_test(preprocessed_data, j, 'lemmatization')

Train set N°1


Training: 100%|██████████| 6509/6509 [00:00<00:00, 29611.68it/s]
Testing: 100%|██████████| 1628/1628 [00:53<00:00, 30.63it/s]


El accuracy total es de 0.8409090909090909
La precision es de 0.8635761589403973
El recall es de 0.9609432571849669
El F1 score es de 0.9096616672479944
Train set N°2


Training: 100%|██████████| 6509/6509 [00:00<00:00, 38978.32it/s]
Testing: 100%|██████████| 1628/1628 [00:51<00:00, 31.48it/s]


El accuracy total es de 0.8267813267813268
La precision es de 0.8428665351742275
El recall es de 0.9675471698113207
El F1 score es de 0.9009135628952916
Train set N°3


Training: 100%|██████████| 6510/6510 [00:00<00:00, 36591.32it/s]
Testing: 100%|██████████| 1627/1627 [00:51<00:00, 31.76it/s]


El accuracy total es de 0.8401966810079902
La precision es de 0.8536109303838647
El recall es de 0.9740163325909429
El F1 score es de 0.9098474341192787
Train set N°4


Training: 100%|██████████| 6510/6510 [00:00<00:00, 39851.19it/s]
Testing: 100%|██████████| 1627/1627 [00:52<00:00, 30.70it/s]


El accuracy total es de 0.8309772587584512
La precision es de 0.8515369522563767
El recall es de 0.9644444444444444
El F1 score es de 0.9044807224730809
Train set N°5


Training: 100%|██████████| 6510/6510 [00:00<00:00, 37170.84it/s]
Testing: 100%|██████████| 1627/1627 [00:53<00:00, 30.61it/s]

El accuracy total es de 0.8272894898586355
La precision es de 0.8406557377049181
El recall es de 0.9712121212121212
El F1 score es de 0.9012302284710019





## Lemmatization and stopword removal

In [14]:
preprocessed_data_stopword = []
for row in tqdm.tqdm(data, desc="Preprocessing Data"):
    preprocessed_data.append((preprocess_text(row[0], lemmatize=True, remove_stopwords=True), row[1]))

Preprocessing Data:   0%|          | 0/8137 [00:00<?, ?it/s]

Preprocessing Data: 100%|██████████| 8137/8137 [04:04<00:00, 33.31it/s]


In [15]:
for j in range(0, 5):
    train_test(preprocessed_data_stopword, j, 'lemmatization and stopword removal')

Train set N°1


Training: 100%|██████████| 6509/6509 [00:00<00:00, 45952.93it/s]
Testing: 100%|██████████| 1628/1628 [00:24<00:00, 66.33it/s]


El accuracy total es de 0.8421375921375921
La precision es de 0.8632760898282695
El recall es de 0.9631540162122328
El F1 score es de 0.9104841518634623
Train set N°2


Training: 100%|██████████| 6509/6509 [00:00<00:00, 59466.27it/s]
Testing: 100%|██████████| 1628/1628 [00:24<00:00, 67.20it/s]


El accuracy total es de 0.8200245700245701
La precision es de 0.837696335078534
El recall es de 0.9660377358490566
El F1 score es de 0.8973010865755345
Train set N°3


Training: 100%|██████████| 6510/6510 [00:00<00:00, 51990.75it/s]
Testing: 100%|██████████| 1627/1627 [00:23<00:00, 68.74it/s]


El accuracy total es de 0.8377381684081131
La precision es de 0.8518518518518519
El recall es de 0.9732739420935412
El F1 score es de 0.9085239085239085
Train set N°4


Training: 100%|██████████| 6510/6510 [00:00<00:00, 46839.13it/s]
Testing: 100%|██████████| 1627/1627 [00:27<00:00, 59.76it/s]


El accuracy total es de 0.8291333743085433
La precision es de 0.8489583333333334
El recall es de 0.965925925925926
El F1 score es de 0.9036729036729036
Train set N°5


Training: 100%|██████████| 6510/6510 [00:00<00:00, 55461.05it/s]
Testing: 100%|██████████| 1627/1627 [00:37<00:00, 43.04it/s]

El accuracy total es de 0.826060233558697
La precision es de 0.84
El recall es de 0.9704545454545455
El F1 score es de 0.9005272407732864





In [16]:
results_df.to_csv('results_variations.csv', index=False)

## Predict all 'high'

If we were to always predict 'high', what would out metrics be?

In [17]:
priors_count = defaultdict(int)
for row in tqdm.tqdm(data):
    label = row[1]
    priors_count[label] += 1

priors_probs = {cls: priors_count[cls] / sum(priors_count.values()) for cls in priors_count}
print(priors_probs)

100%|██████████| 8137/8137 [00:00<00:00, 903697.81it/s]

{'high': 0.8232763917905862, 'low': 0.1767236082094138}





There's an 82.3% probability that the review is 'high', so if we were to predict 'high' every time, we would be right 82% of the time

In [18]:
recall = priors_probs['high']*len(data)/len(data)
print(recall)

0.8232763917905862


Recall would be the same as accuracy, because we'd get right about 82% of our predictions, which is 0.82*(number of predictions), and when we normalize by our (true positives + false positives), that is our number of predictions

In [19]:
precision = priors_probs['high']*len(data)/(priors_probs['high']*len(data) + priors_probs['low']*len(data))
print(precision)

0.8232763917905862


Again, precision would be the same as accuracy and recall, because our false positives + our true positives are equal to our entire dataset

In [20]:
F1 = 2 * (precision * recall) / (precision + recall)
print(F1)

0.8232763917905862


Finally, our F1 score would be the same as the accuracy, precision and recall. This gives us our benchmark so that we can say if our models are better than simply predicting 'high' all the time

Our models are slightly more accurate than the most basic "predict 'high' all the time". Precision is also a bit better, not too much difference. However, recall shows a great improvement. The tokenized, lemmatized and stopword removed models aren't that much better than the model without text pre-processing. However, the (slightly) best model is one with lemmatization and stopword removal.

In [23]:
priors_probs, likelihood, vocabulary = train_test(preprocessed_data_stopword, 0)

Train set N°1


Training: 100%|██████████| 6509/6509 [00:00<00:00, 31167.91it/s]
Testing: 100%|██████████| 1628/1628 [00:26<00:00, 61.63it/s]

El accuracy total es de 0.8421375921375921
La precision es de 0.8632760898282695
El recall es de 0.9631540162122328
El F1 score es de 0.9104841518634623






In [24]:
data_test =[]
with open('yelp_reviews_test.csv', 'r') as csv_file:
    file = csv.reader(csv_file)
    for row in file:
        data_test.append(row)

In [25]:
preprocessed_test = []
for row in tqdm.tqdm(data_test, desc="Preprocessing Data"):
    preprocessed_test.append(preprocess_text(row[0], lemmatize=True, remove_stopwords=True))

Preprocessing Data: 100%|██████████| 1863/1863 [00:59<00:00, 31.40it/s]


In [26]:
predictions = []
for words in tqdm.tqdm(preprocessed_test, desc = 'Testing:'):
    calc_prob = {cls: math.log(priors_probs[cls]) for cls in priors_probs}
    for word in words:
        if word in vocabulary:
            for cls in priors_probs:
                calc_prob[cls] += math.log(likelihood[cls][word])

    prediction = 'high' if calc_prob['high'] > calc_prob['low'] else 'low'
    predictions.append(prediction)
    

Testing:: 100%|██████████| 1863/1863 [00:30<00:00, 61.82it/s]


In [27]:
final = []
i = 0
for row in data_test:
    final.append((row, predictions[i]))
    i += 1

In [28]:
final_df = pd.DataFrame(final, columns = ['text', 'prediction'])

In [29]:
final_df.to_csv('final_predictions.csv', index = False)