In [1]:
import random
import spacy
import csv
import ast
import math
import tqdm
from scipy.spatial.distance import cosine
import numpy as np
import string
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
data = []
with open('../hw3/yelp_reviews_train.csv', 'r') as csv_file:
    file = csv.reader(csv_file)
    for row in file:
        data.append(row)
random.shuffle(data)

In [4]:
def preprocess_text(text, lemmatize=True, remove_stopwords=False):
    doc = nlp(text.lower())
    # Tabla de traducción para eliminar la puntuación
    table = str.maketrans('', '', string.punctuation + '1234567890\n©|° ')
    
    if lemmatize:
        if remove_stopwords:
            return [token.lemma_.translate(table) for token in doc if not token.is_stop and not token.is_punct and not token.is_digit]
        return [token.lemma_.translate(table) for token in doc if not token.is_punct and not token.is_digit]

In [29]:
#preprocessed_data = []
#for row in tqdm.tqdm(data, desc="Preprocessing Data"):
#    preprocessed_data.append((preprocess_text(row[0], lemmatize = True, remove_stopwords = False), row[1]))

Preprocessing Data: 100%|██████████| 8137/8137 [03:59<00:00, 34.04it/s]


In [36]:
# Guarde el csv con las reviews limpias para hacer más rápido
#with open('clean_yelp_train.csv', 'w', newline='') as csv_file:
#    writer = csv.writer(csv_file)
#    writer.writerows(preprocessed_data)

In [5]:
clean_data = []
with open('clean_yelp_train.csv', 'r', newline='') as csv_file:
    file = csv.reader(csv_file)
    for row in file:
        row[0] = ast.literal_eval(row[0])
        clean_data.append(row)

In [6]:
def train_test(data, j):
    IDX = list(range(len(data)))
    test_set = []
    train_set = []

    for i in IDX:
        if i % 5 == j:
            test_set.append(data[i])
        else:
            train_set.append(data[i])
    return test_set, train_set


In [7]:
def tfidf(data, vocabulary = None):
    if vocabulary is None:
        idf_counts = {}
        for words, label in data: # Itero sobre cada review
            vocab = list(set(words)) # Agarro las palabras (no repetidas) de cada review
            for word in vocab: # Itero sobre las palabras de este review
                if word not in idf_counts: # Si no está en el diccionario
                    idf_counts[word] = 0 # Agrego esa palabra como un key
                idf_counts[word] += 1 # Cuento cuantas veces aparece en todos los reviews
                
        idf_values = {}
        for word in idf_counts: # Itero sobre todos los keys del diccionario counts
            idf_values[word] = math.log(len(data)/idf_counts[word]) # Calcula (el log) del ratio 'cantidad de reviews'/'cantidad de veces que aparece una palabra'
        # Este es el weight que le vamos a dar a las palabras, para que las que son comunes a todos los reviews tengan menos peso
    else:
        idf_counts = {}  # Inicializa IDF para el vocabulario proporcionado
        for words, label in data:
            for word in words:  # Solo contar palabras únicas
                if word in vocabulary and word not in idf_counts: # Si está en el vocabulario y no en el diccionario
                    idf_counts[word] = 1  # Cuenta palabras en el vocabulario
                elif word in vocabulary and word in idf_counts: # Si estaba en el vocabulario y el diccionario, le suma 1
                    idf_counts[word] += 1
    
        idf_values = {}
        for word in idf_counts:  # Calcula el IDF final
            idf_values[word] = math.log(len(data) / (idf_counts[word]))  # Evitar división por cero

    tf_counts = {}
    i = 0
    for words, label in data: # Itero sobre cada review
        if i not in tf_counts: # Esto lo hago para asignarle un numero a cada review
            tf_counts[i] = {} # Si el numero no está como key en el diccionario, lo agrego.
            for word in words: # Itero sobre las palabras de este review
                if word not in tf_counts[i]: # Si la palabra todavía no se encuentra como valor para esta review
                    tf_counts[i][word] = 0 # La agrego
                tf_counts[i][word] += 1
        i += 1
    tf_value= {}
    i = 0
    for words, label in data: # Itero sobre cada review
        tf_value[i] = {} # Agrego el mismo key
        vocab = list(set(words)) # El vocabulario de este review
        for word in tf_counts[i]: # Itero sobre cada palabra en el diccionario counts para el mismo key
            tf_value[i][word] = tf_counts[i][word]/len(words) # Calcula que porcentaje ocupa esa palabra en el review
        i += 1
    tfidf_vectors = {}
    word_idx = {}
    vocab = list(idf_values.keys()) # Nuestro vocabulario son todas las palabras que aparecieron en todas las reviews
    vocab.sort()
    for j in range(len(vocab)):
        word = vocab[j]
        word_idx[word] = j # A cada palabra se le asigna un numero

    for i in tf_value: # Itero sobre cada review
        tfidf_vectors[i] = []
        for word in vocab: # Itera sobre todas las palabras que vimos
            if word in tf_value[i]: # Si la palabra está dentro de esa review
                tfidf_score = tf_value[i][word]*idf_values[word] # Le asigna el tfidf_score
            else:
                tfidf_score = 0 # Si esa palabra no aparece en este review, le asigna un 0
            tfidf_vectors[i].append(tfidf_score) # El valor dentro del diccionario, asignado al numero del review es una lista con el score para cada palabra del vocabulario
    df = pd.DataFrame(tfidf_vectors, word_idx.keys()).T
    return df, vocab

# Clasification

In [8]:
test_set, train_set = train_test(clean_data, 0)

In [9]:
train_tfidf, vocab = tfidf(train_set)
test_tfidf, vocab = tfidf(test_set, vocabulary = vocab)

In [10]:
missing_columns = list(set(train_tfidf.columns) - set(test_tfidf.columns))
missing_columns_df = pd.DataFrame(0, index=test_tfidf.index, columns=missing_columns)

test_tfidf = pd.concat([test_tfidf, missing_columns_df], axis=1)
test_tfidf = test_tfidf[train_tfidf.columns]

In [11]:
train_labels = [label for _, label in train_set]
test_labels = [label for _, label in test_set]

In [12]:
def metrics(test_labels, predictions):
    eval = {'test_labels': test_labels,
        'predictions': predictions}

    true_positives = 0
    false_positives = 0
    for label, prediction in zip(eval['test_labels'], eval['predictions']):
        if prediction == 'high':
            if label == 'high':
                true_positives += 1
            else:
                false_positives += 1 


    positives = [positive for positive in test_labels if positive == 'high']

    precision = true_positives/(true_positives + false_positives)
    recall = true_positives/len(positives)
    accuracy = accuracy_score(test_labels, predictions)
    return(accuracy, precision, recall)


## Logistic Regression

In [13]:
log_reg = LogisticRegression()
log_reg.fit(train_tfidf, train_labels)
predictions = log_reg.predict(test_tfidf)

In [14]:
accuracy, precision, recall = metrics(test_labels, predictions)

print(f"Accuracy: {accuracy}")
print(f"Recall: {recall}")
print(f"Presicion: {precision}")

Accuracy: 0.8157248157248157
Recall: 1.0
Presicion: 0.8156115550092194


## K-Nearest Neighbors

In [15]:
knn = KNeighborsClassifier(n_neighbors = 5)
knn.fit(train_tfidf, train_labels)
knn_predictions = knn.predict(test_tfidf)

In [16]:
accuracy, precision, recall = metrics(test_labels, knn_predictions)
print(f"Accuracy: {accuracy}")
print(f"Recall: {recall}")
print(f"Presicion: {precision}")

Accuracy: 0.8126535626535627
Recall: 0.9962321024868124
Presicion: 0.815043156596794


## Random Forests

In [17]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(train_tfidf, train_labels)
rf_predictions = rf.predict(test_tfidf)

In [18]:
accuracy, precision, recall = metrics(test_labels, rf_predictions)
print(f"Accuracy: {accuracy}")
print(f"Recall: {recall}")
print(f"Presicion: {precision}")

Accuracy: 0.816953316953317
Recall: 1.0
Presicion: 0.8166153846153846


# K-fold Cross Validation

### Logistic Regression

In [20]:
for j in range(0,5):
    test_set, train_set = train_test(clean_data, j)
    train_tfidf, vocab = tfidf(train_set)
    test_tfidf, vocab = tfidf(test_set, vocabulary = vocab)
    
    missing_columns = list(set(train_tfidf.columns) - set(test_tfidf.columns))
    missing_columns_df = pd.DataFrame(0, index=test_tfidf.index, columns=missing_columns)
    test_tfidf = pd.concat([test_tfidf, missing_columns_df], axis=1)
    test_tfidf = test_tfidf[train_tfidf.columns]
    
    train_labels = [label for _, label in train_set]
    test_labels = [label for _, label in test_set]
    
    log_reg = LogisticRegression()
    log_reg.fit(train_tfidf, train_labels)
    predictions = log_reg.predict(test_tfidf)
    
    accuracy, precision, recall = metrics(test_labels, predictions)
    
    print(f'Modelo N° {j+1}')
    print(f"Accuracy: {accuracy}")
    print(f"Recall: {recall}")
    print(f"Precision: {precision}")

Modelo N° 1
Accuracy: 0.8157248157248157
Recall: 1.0
Precision: 0.8156115550092194
Modelo N° 2
Accuracy: 0.8273955773955773
Recall: 1.0
Precision: 0.8273955773955773
Modelo N° 3
Accuracy: 0.8408113091579594
Recall: 1.0
Precision: 0.8408113091579594
Modelo N° 4
Accuracy: 0.8180700676090965
Recall: 0.9992492492492493
Precision: 0.8185731857318573
Modelo N° 5
Accuracy: 0.8149969268592502
Recall: 1.0
Precision: 0.8148831488314883


### K-Nearest Neighbors

In [21]:
for j in range(0,5):
    test_set, train_set = train_test(clean_data, j)
    train_tfidf, vocab = tfidf(train_set)
    test_tfidf, vocab = tfidf(test_set, vocabulary = vocab)
    
    missing_columns = list(set(train_tfidf.columns) - set(test_tfidf.columns))
    missing_columns_df = pd.DataFrame(0, index=test_tfidf.index, columns=missing_columns)

    test_tfidf = pd.concat([test_tfidf, missing_columns_df], axis=1)
    test_tfidf = test_tfidf[train_tfidf.columns]
    
    train_labels = [label for _, label in train_set]
    test_labels = [label for _, label in test_set]
    
    knn = KNeighborsClassifier(n_neighbors = 5)
    knn.fit(train_tfidf, train_labels)
    knn_predictions = knn.predict(test_tfidf)
    
    accuracy, precision, recall = metrics(test_labels, knn_predictions)
    
    print(f'Modelo N° {j+1}')
    print(f"Accuracy: {accuracy}")
    print(f"Recall: {recall}")
    print(f"Precision: {precision}")

Modelo N° 1
Accuracy: 0.8126535626535627
Recall: 0.9962321024868124
Precision: 0.815043156596794
Modelo N° 2
Accuracy: 0.8224815724815725
Recall: 0.9896065330363771
Precision: 0.8289800995024875
Modelo N° 3
Accuracy: 0.8371235402581438
Recall: 0.993421052631579
Precision: 0.8414860681114551
Modelo N° 4
Accuracy: 0.8119237861094039
Recall: 0.990990990990991
Precision: 0.8178438661710037
Modelo N° 5
Accuracy: 0.8070067609096496
Recall: 0.9871698113207548
Precision: 0.8149532710280374


### Random Forests

In [22]:
for j in range(0,5):
    test_set, train_set = train_test(clean_data, j)
    train_tfidf, vocab = tfidf(train_set)
    test_tfidf, vocab = tfidf(test_set, vocabulary = vocab)
    
    missing_columns = list(set(train_tfidf.columns) - set(test_tfidf.columns))
    missing_columns_df = pd.DataFrame(0, index=test_tfidf.index, columns=missing_columns)

    test_tfidf = pd.concat([test_tfidf, missing_columns_df], axis=1)
    test_tfidf = test_tfidf[train_tfidf.columns]
    
    train_labels = [label for _, label in train_set]
    test_labels = [label for _, label in test_set]
    
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(train_tfidf, train_labels)
    rf_predictions = rf.predict(test_tfidf)
    
    accuracy, precision, recall = metrics(test_labels, rf_predictions)
    
    print(f'Modelo N° {j+1}')
    print(f"Accuracy: {accuracy}")
    print(f"Recall: {recall}")
    print(f"Precision: {precision}")

Modelo N° 1
Accuracy: 0.816953316953317
Recall: 1.0
Precision: 0.8166153846153846
Modelo N° 2
Accuracy: 0.8298525798525799
Recall: 1.0
Precision: 0.8294334975369458
Modelo N° 3
Accuracy: 0.8426551936078672
Recall: 1.0
Precision: 0.8423645320197044
Modelo N° 4
Accuracy: 0.8199139520590043
Recall: 1.0
Precision: 0.8196923076923077
Modelo N° 5
Accuracy: 0.8149969268592502
Recall: 1.0
Precision: 0.8148831488314883


# Prediction

In [8]:
test_set, train_set = train_test(clean_data, 2)
train_tfidf, vocab = tfidf(train_set)

train_labels = [label for _, label in train_set]

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(train_tfidf, train_labels)

In [9]:
test = []
with open('../hw3/yelp_reviews_test.csv', 'r') as csv_file:
    file = csv.reader(csv_file)
    for row in file:
        test.append(row)

In [10]:
processed_test = []
for row in tqdm.tqdm(test, desc="Preprocessing Data"):
    processed_test.append((preprocess_text(row[0], lemmatize = True, remove_stopwords = False),0))

Preprocessing Data: 100%|██████████| 1863/1863 [00:58<00:00, 31.78it/s]


In [12]:
test_tfidf, vocab = tfidf(processed_test, vocabulary = vocab)

In [13]:
missing_columns = list(set(train_tfidf.columns) - set(test_tfidf.columns))
missing_columns_df = pd.DataFrame(0, index=test_tfidf.index, columns=missing_columns)

test_tfidf = pd.concat([test_tfidf, missing_columns_df], axis=1)
test_tfidf = test_tfidf[train_tfidf.columns]

In [15]:
rf_predictions = rf.predict(test_tfidf)

In [17]:
final = []
i = 0
for row in test:
    final.append((row, rf_predictions[i]))
    i += 1

final_df = pd.DataFrame(final, columns = ['text', 'prediction'])
final_df.to_csv('final_predictions.csv', index = False)