In [2]:
import spacy
import csv
import string
import tqdm
import math
import random

In [3]:
# Read every row of the training CSV into memory as a list of string fields.
# Later cells read row[0] as the review text and row[1] as its label.
# newline='' is what the csv module asks for so that line breaks embedded in
# quoted fields are handed to the parser intact instead of being translated.
with open('yelp_reviews_train.csv', 'r', newline='') as csv_file:
    data = list(csv.reader(csv_file))

In [4]:
# Shuffle in place with a fixed seed so that fold membership (index % 5 in the
# cross-validation cells) is the same on every Restart & Run All.
# A dedicated Random instance is used so the global random state is untouched.
random.Random(42).shuffle(data)

## Without text preprocessing

In [5]:
# Baseline tokenisation: lower-case each review and split it on single spaces.
# Punctuation stays attached to the words. Each entry is (tokens, label).
no_pre = []
for row in tqdm.tqdm(data, desc = 'Getting words:'):
    text, label = row[0], row[1]
    words = text.lower().split(' ')
    no_pre.append((words, label))

Getting words:: 100%|██████████| 8137/8137 [00:00<00:00, 35171.06it/s]


In [None]:
# 5-fold cross-validation of a multinomial Naive Bayes classifier (with Laplace
# smoothing) on the whitespace-tokenised reviews stored in `no_pre`.
IDX = list(range(len(no_pre)))
for j in range(0,5):
    # Fold j: examples whose index satisfies i % 5 == j are held out for testing.
    test_set = []
    train_set = []
    for i in IDX:
        if i%5==j:
            test_set.append(no_pre[i])
        else:
            train_set.append(no_pre[i])
    print(f'Train set N°{j+1}')

    priors_count = {}      # label -> number of training reviews with that label
    likelihood_count = {}  # label -> {word -> occurrences of the word in that label}
    vocabulary = set()     # a set keeps the membership tests in the scoring loop O(1)

    for words, label in tqdm.tqdm(train_set, desc = 'Training'):
        priors_count[label] = priors_count.get(label, 0) + 1
        class_counts = likelihood_count.setdefault(label, {})
        for word in words:
            vocabulary.add(word)
            class_counts[word] = class_counts.get(word, 0) + 1

    # Prior of a class = (#reviews of that class) / (#training reviews).
    n_train = sum(priors_count.values())
    priors_probs = {cls: priors_count[cls]/n_train for cls in priors_count}

    # Laplace-smoothed P(word | class) = (count + 1) / (tokens in class + |V|).
    # The denominator is constant per class, so it is computed once instead of
    # once per vocabulary word.
    likelihood = {}
    for cls in likelihood_count:
        denominator = sum(likelihood_count[cls].values()) + len(vocabulary)
        likelihood[cls] = {
            word: (likelihood_count[cls].get(word, 0) + 1)/denominator
            for word in vocabulary
        }

    # Evaluate on the held-out fold.
    accuracy = 0  # number of correct predictions
    total_sentences = 0
    tp = 0  # true positives ('high' predicted and 'high' observed)
    positives = 0  # number of 'high' predictions
    priors_count_test = {}  # label -> number of test reviews with that label

    for words, label in tqdm.tqdm(test_set, desc = 'Testing'):
        priors_count_test[label] = priors_count_test.get(label, 0) + 1

        # Log-space score per class: log prior + sum of log likelihoods.
        # Working with logs avoids underflow from multiplying many small numbers.
        calc_prob = {}
        for cls in priors_count:
            calc_prob[cls] = math.log(priors_probs[cls])
            for word in words:
                if word in vocabulary:  # words never seen in training are skipped
                    calc_prob[cls] = calc_prob[cls] + math.log(likelihood[cls][word])

        # Ties go to 'low', as the comparison is strict.
        if calc_prob['high'] > calc_prob['low']:
            prediction = 'high'
            positives += 1
        else:
            prediction = 'low'
        if prediction == label:
            accuracy += 1
        if prediction == 'high' and label == 'high':
            tp += 1
        total_sentences += 1

    # Guard every ratio so a degenerate fold (no 'high' predictions, no 'high'
    # labels, or an empty fold) reports 0.0 instead of raising.
    acc_total = accuracy/total_sentences if total_sentences else 0.0
    precision = tp/positives if positives else 0.0
    actual_high = priors_count_test.get('high', 0)
    recall = tp/actual_high if actual_high else 0.0
    f1 = 2*(precision*recall)/(precision + recall) if (precision + recall) else 0.0
    print(f'La accuracy total es de {acc_total}')
    print(f'La precision es de {precision}')
    print(f'El recall es de {recall}')
    print(f'El F1 score es de {f1}')

Up to this point the model is pretty basic: we tokenize by splitting each review on spaces and we don't remove punctuation. Does it still work well? Keep in mind that if we were to predict 'high' every time, we would be right 82% of the time, so the accuracy above should be read against that baseline.

## Using lemmatization

In [9]:
# Load the spaCy "en_core_web_sm" pipeline once; preprocess_text() uses it
# through this module-level name.
nlp = spacy.load("en_core_web_sm")

In [10]:
def preprocess_text(text, lemmatize = True, remove_stopwords = False):
    """Tokenise ``text`` with the module-level spaCy pipeline ``nlp``.

    The text is lower-cased before it is passed to ``nlp``.

    Parameters
    ----------
    text : str
        Raw text to tokenise.
    lemmatize : bool, default True
        If True, return each token's ``lemma_``; otherwise return its ``text``.
    remove_stopwords : bool, default False
        If True, drop tokens for which ``token.is_stop`` is true.

    Returns
    -------
    list of str
        One string per kept token, in document order.
    """
    doc = nlp(text.lower())
    kept = [token for token in doc if not (remove_stopwords and token.is_stop)]
    if lemmatize:
        return [token.lemma_ for token in kept]
    # Without this branch the function fell through and returned None whenever
    # lemmatize was False, and remove_stopwords was silently ignored.
    return [token.text for token in kept]

In [11]:
# Lemmatise every review (stop words kept) and pair the tokens with the label.
preprocessed_data = [
    (preprocess_text(review[0], lemmatize = True, remove_stopwords = False), review[1])
    for review in tqdm.tqdm(data, desc="Preprocessing Data")
]

Preprocessing Data: 100%|██████████| 8137/8137 [03:57<00:00, 34.20it/s]


In [None]:
# 5-fold cross-validation of a multinomial Naive Bayes classifier (with Laplace
# smoothing) on the token lists stored in `preprocessed_data`.
IDX = list(range(len(preprocessed_data)))
for j in range(0,5):
    # Fold j: examples whose index satisfies i % 5 == j are held out for testing.
    test_set = []
    train_set = []
    for i in IDX:
        if i%5==j:
            test_set.append(preprocessed_data[i])
        else:
            train_set.append(preprocessed_data[i])
    print(f'Train set N°{j+1}')

    priors_count = {}      # label -> number of training reviews with that label
    likelihood_count = {}  # label -> {word -> occurrences of the word in that label}
    vocabulary = set()     # a set keeps the membership tests in the scoring loop O(1)

    for words, label in tqdm.tqdm(train_set, desc = 'Training'):
        priors_count[label] = priors_count.get(label, 0) + 1
        class_counts = likelihood_count.setdefault(label, {})
        for word in words:
            vocabulary.add(word)
            class_counts[word] = class_counts.get(word, 0) + 1

    # Prior of a class = (#reviews of that class) / (#training reviews).
    n_train = sum(priors_count.values())
    priors_probs = {cls: priors_count[cls]/n_train for cls in priors_count}

    # Laplace-smoothed P(word | class) = (count + 1) / (tokens in class + |V|).
    # The denominator is constant per class, so it is computed once instead of
    # once per vocabulary word.
    likelihood = {}
    for cls in likelihood_count:
        denominator = sum(likelihood_count[cls].values()) + len(vocabulary)
        likelihood[cls] = {
            word: (likelihood_count[cls].get(word, 0) + 1)/denominator
            for word in vocabulary
        }

    # Evaluate on the held-out fold.
    accuracy = 0  # number of correct predictions
    total_sentences = 0
    tp = 0  # true positives ('high' predicted and 'high' observed)
    positives = 0  # number of 'high' predictions
    priors_count_test = {}  # label -> number of test reviews with that label

    for words, label in tqdm.tqdm(test_set, desc = 'Testing'):
        priors_count_test[label] = priors_count_test.get(label, 0) + 1

        # Log-space score per class: log prior + sum of log likelihoods.
        calc_prob = {}
        for cls in priors_count:
            calc_prob[cls] = math.log(priors_probs[cls])
            for word in words:
                if word in vocabulary:  # words never seen in training are skipped
                    calc_prob[cls] = calc_prob[cls] + math.log(likelihood[cls][word])

        # Ties go to 'low', as the comparison is strict.
        if calc_prob['high'] > calc_prob['low']:
            prediction = 'high'
            positives += 1
        else:
            prediction = 'low'
        if prediction == label:
            accuracy += 1
        if prediction == 'high' and label == 'high':
            tp += 1
        total_sentences += 1

    # Guard every ratio so a degenerate fold (no 'high' predictions, no 'high'
    # labels, or an empty fold) reports 0.0 instead of raising.
    acc_total = accuracy/total_sentences if total_sentences else 0.0
    precision = tp/positives if positives else 0.0
    actual_high = priors_count_test.get('high', 0)
    recall = tp/actual_high if actual_high else 0.0
    f1 = 2*(precision*recall)/(precision + recall) if (precision + recall) else 0.0
    print(f'La accuracy total es de {acc_total}')
    print(f'La precision es de {precision}')
    print(f'El recall es de {recall}')
    print(f'El F1 score es de {f1}')


Now we have improved the tokenization and we are using lemmatization, but we are not removing stopwords yet. This model is slightly better than the previous one.

## Lemmatization with stopword removal

In [13]:
# Lemmatise every review, dropping stop words, and pair the tokens with the label.
preprocessed_data = [
    (preprocess_text(review[0], lemmatize = True, remove_stopwords = True), review[1])
    for review in tqdm.tqdm(data, desc="Preprocessing Data")
]

Preprocessing Data: 100%|██████████| 8137/8137 [04:02<00:00, 33.57it/s]


In [None]:
# 5-fold cross-validation of a multinomial Naive Bayes classifier (with Laplace
# smoothing) on the token lists stored in `preprocessed_data`.
IDX = list(range(len(preprocessed_data)))
for j in range(0,5):
    # Fold j: examples whose index satisfies i % 5 == j are held out for testing.
    test_set = []
    train_set = []
    for i in IDX:
        if i%5==j:
            test_set.append(preprocessed_data[i])
        else:
            train_set.append(preprocessed_data[i])
    print(f'Train set N°{j+1}')

    priors_count = {}      # label -> number of training reviews with that label
    likelihood_count = {}  # label -> {word -> occurrences of the word in that label}
    vocabulary = set()     # a set keeps the membership tests in the scoring loop O(1)

    # The reviews in preprocessed_data are already token lists, so training only
    # counts tokens; no text clean-up or spaCy call is needed per review.
    for words, label in tqdm.tqdm(train_set, desc = 'Training'):
        priors_count[label] = priors_count.get(label, 0) + 1
        class_counts = likelihood_count.setdefault(label, {})
        for word in words:
            vocabulary.add(word)
            class_counts[word] = class_counts.get(word, 0) + 1

    # Prior of a class = (#reviews of that class) / (#training reviews).
    n_train = sum(priors_count.values())
    priors_probs = {cls: priors_count[cls]/n_train for cls in priors_count}

    # Laplace-smoothed P(word | class) = (count + 1) / (tokens in class + |V|).
    # The denominator is constant per class, so it is computed once instead of
    # once per vocabulary word.
    likelihood = {}
    for cls in likelihood_count:
        denominator = sum(likelihood_count[cls].values()) + len(vocabulary)
        likelihood[cls] = {
            word: (likelihood_count[cls].get(word, 0) + 1)/denominator
            for word in vocabulary
        }

    # Evaluate on the held-out fold.
    accuracy = 0  # number of correct predictions
    total_sentences = 0
    tp = 0  # true positives ('high' predicted and 'high' observed)
    positives = 0  # number of 'high' predictions
    priors_count_test = {}  # label -> number of test reviews with that label

    for words, label in tqdm.tqdm(test_set, desc = 'Testing'):
        priors_count_test[label] = priors_count_test.get(label, 0) + 1

        # Log-space score per class: log prior + sum of log likelihoods.
        calc_prob = {}
        for cls in priors_count:
            calc_prob[cls] = math.log(priors_probs[cls])
            for word in words:
                if word in vocabulary:  # words never seen in training are skipped
                    calc_prob[cls] = calc_prob[cls] + math.log(likelihood[cls][word])

        # Ties go to 'low', as the comparison is strict.
        if calc_prob['high'] > calc_prob['low']:
            prediction = 'high'
            positives += 1
        else:
            prediction = 'low'
        if prediction == label:
            accuracy += 1
        if prediction == 'high' and label == 'high':
            tp += 1
        total_sentences += 1

    # Guard every ratio so a degenerate fold (no 'high' predictions, no 'high'
    # labels, or an empty fold) reports 0.0 instead of raising.
    acc_total = accuracy/total_sentences if total_sentences else 0.0
    precision = tp/positives if positives else 0.0
    actual_high = priors_count_test.get('high', 0)
    recall = tp/actual_high if actual_high else 0.0
    f1 = 2*(precision*recall)/(precision + recall) if (precision + recall) else 0.0
    print(f'La accuracy total es de {acc_total}')
    print(f'La precision es de {precision}')
    print(f'El recall es de {recall}')
    print(f'El F1 score es de {f1}')
