In [1]:
# %pip install -U spacy
# %pip install deep-translator
# %pip install langdetect
# %pip install torchvision
# %pip install transformers
# !python -m spacy download en_core_web_sm
# !python -m spacy download fr_core_news_sm
# !python -m spacy download de_core_news_sm

In [2]:
from pyspark.sql import SparkSession

# Build (or reuse) a SparkSession that runs in local mode on this machine.
# The driver is bound to the loopback address explicitly.
spark = (
    SparkSession.builder
    .master("local")
    .config("spark.driver.bindAddress", "127.0.0.1")
    .config("spark.executor.memory", "2g")
    .getOrCreate()
)

22/03/25 16:36:30 WARN Utils: Your hostname, DimitriH02.local resolves to a loopback address: 127.0.0.1; using 10.0.0.2 instead (on interface en7)
22/03/25 16:36:30 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/25 16:36:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
from ReviewClass import Review

# Register the project's helper modules with the SparkContext so that tasks
# run by this context can import them.
for helper_module in ('ReviewClass.py', 'data_cleaning.py'):
    spark.sparkContext.addPyFile(helper_module)

In [4]:
import pandas as pd

# Read the eight training shards and stack them into one dataframe
# df_train = pd.read_csv('reviews/validation_hidden.csv')
train_shards = (
    'reviews/train-1.csv',
    'reviews/train-2.csv',
    'reviews/train-3.csv',
    'reviews/train-4.csv',
    'reviews/train-5.csv',
    'reviews/train-6.csv',
    'reviews/train-7.csv',
    'reviews/train-8.csv',
)
df1, df2, df3, df4, df5, df6, df7, df8 = map(pd.read_csv, train_shards)

# Row indices are kept as read (no ignore_index), so they repeat across shards.
df_train = pd.concat([df1, df2, df3, df4, df5, df6, df7, df8])

# The first column carries the review identifier; give it an explicit name
df_train = df_train.rename(columns={df_train.columns[0]: "review_id"})

In [5]:
# Wrap the first 2000 rows in Review objects (one keyword argument per column)
# and distribute them over 100 Spark partitions.
review_records = df_train.head(2000).to_dict(orient='records')
reviews = [Review(**record) for record in review_records]

review_rdd = spark.sparkContext.parallelize(reviews, 100)

In [6]:
# PoC of total character count per category
# total_character_count_per_product_category_id = review_rdd\
#     .flatMap(lambda review: [(review.product_category_id, review.ReviewBodyCharCount())])\
#     .reduceByKey(lambda count1, count2: count1 + count2)\
#     .sortByKey()\
#     .collect()

# print(total_character_count_per_product_category_id)

In [7]:
# PoC of total word count per category
# total_word_count_per_product_category_id = review_rdd\
#     .flatMap(lambda review: [(review.product_category_id, review.ReviewBodyWordCount())])\
#     .reduceByKey(lambda count1, count2: count1 + count2)\
#     .sortByKey()\
#     .collect()

# print(total_word_count_per_product_category_id)

In [8]:
# PoC of tagged review bodies
# list_of_tagged_reviews = review_rdd\
#     .map(lambda review: (review.review_id, review.TaggedReviewBody()))\
#     .collect()

In [9]:
import re

def RemovePunctuation(review):
    """Strip punctuation from ``review.review_body`` in place.

    Full stops and commas are turned into spaces (so the words around them
    stay separated); every other character that is neither a word character
    nor whitespace is deleted. The body is coerced with ``str()`` first.

    Returns the same ``review`` object.
    """
    text = str(review.review_body)

    # '.' and ',' become blanks rather than disappearing
    text = text.translate(str.maketrans('.,', '  '))

    # everything else that is not \w or \s is dropped
    review.review_body = re.sub(r'[^\w\s]', '', text)

    return review

In [10]:
import html

def RemoveASCII(review):
    """Decode HTML character references in the headline and body, in place.

    Entities such as ``&amp;`` or ``&#39;`` are replaced by the characters
    they stand for. Both fields are coerced with ``str()`` first.

    The body previously went through ``html.escape``, which *adds* entities
    instead of removing them; both fields now use ``html.unescape``.

    Returns the same ``review`` object.
    """
    review.review_headline = html.unescape(str(review.review_headline))
    review.review_body = html.unescape(str(review.review_body))

    return review

In [11]:
def RemoveBreaklines(review):
    """Delete every literal ``<br />`` tag from the headline and the body.

    Each field is coerced with ``str()`` and updated in place; the tag is
    removed without inserting a replacement character.

    Returns the same ``review`` object.
    """
    for field in ('review_headline', 'review_body'):
        text = str(getattr(review, field))
        setattr(review, field, text.replace('<br />', ''))

    return review

In [12]:
import re
import numpy as np

# Accented character -> plain replacement, shared by the body and headline
# cleaners so the two cannot drift apart.
_ACUTE_MAP = (
    ('á', 'a'), ('Á', 'A'), ('é', 'e'),
    ('É', 'E'), ('ớ', 'o'), ('ó', 'o'),
    ('Ó', 'O'), ('ú', 'u'), ('Ú', 'U'),
)


def replace_acute_accents(text, accent_map):
    """Apply every ``(pattern, replacement)`` pair of ``accent_map`` to ``text``.

    Each pattern is handed to ``re.sub`` and is therefore interpreted as a
    regular expression. ``accent_map`` may be any iterable of two-item rows.

    Returns the rewritten string.
    """
    for [accent, char] in accent_map:
        text = re.sub(accent, char, text)
    return text


def RemoveAccentsFromBody(review):
    """Replace the acute-accented vowels of ``_ACUTE_MAP`` in the review body.

    The body is coerced with ``str()`` first, like the other cleaners in this
    notebook, so a missing (NaN/None) body no longer raises inside ``re.sub``.

    Returns the same ``review`` object.
    """
    review.review_body = replace_acute_accents(str(review.review_body), _ACUTE_MAP)

    return review


def RemoveAccentsFromHeadline(review):
    """Replace the acute-accented vowels of ``_ACUTE_MAP`` in the review headline.

    The headline is coerced with ``str()`` first, like the other cleaners in
    this notebook, so a missing (NaN/None) headline no longer raises.

    Returns the same ``review`` object.
    """
    review.review_headline = replace_acute_accents(str(review.review_headline), _ACUTE_MAP)

    return review

In [13]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from langdetect import detect

def RemoveStopwords(review):
    """Detect the body's language, record it in ``review.lang`` and drop stopwords.

    Language codes written to ``review.lang``: 0 = English (also used for
    anything that is neither French nor German, and for bodies shorter than
    five words, which are left untouched), 1 = French, 2 = German.

    The body is tokenised with ``word_tokenize``, tokens found in the NLTK
    stopword list for the detected language are removed, and the remaining
    tokens are re-joined with single spaces.

    Returns the same ``review`` object.
    """
    if review.ReviewBodyWordCount() < 5:
        review.lang = 0
        return review

    # Detect once: the previous `if fr` / `if de` / `else` chain sent French
    # text into the `else` branch (so it was relabelled English), and calling
    # detect() twice could give two different answers for the same text.
    detected = detect(review.review_body)

    if detected == 'fr':
        review.lang = 1
        stop_words = stopwords.words('french')
    elif detected == 'de':
        review.lang = 2
        stop_words = stopwords.words('german')
    else:
        review.lang = 0
        stop_words = stopwords.words('english')

    # set membership instead of scanning the stopword list for every token
    stop_words = set(stop_words)
    kept_tokens = [word for word in word_tokenize(str(review.review_body)) if word not in stop_words]

    review.review_body = " ".join(kept_tokens)

    return review

In [14]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

def Stemming(review):
    """Lemmatise every token of the review body with NLTK's ``WordNetLemmatizer``.

    Despite the name, this uses a lemmatizer rather than a stemmer. The body
    is coerced with ``str()``, tokenised with ``word_tokenize``, each token is
    lemmatised, and the result is re-joined with single spaces.

    Returns the same ``review`` object.
    """
    # One lemmatizer for the whole review instead of a new instance per token.
    lemmatizer = WordNetLemmatizer()

    tokens = word_tokenize(str(review.review_body))

    review.review_body = " ".join(lemmatizer.lemmatize(token) for token in tokens)

    return review


In [15]:
def Lower(review):
    """Lower-case the review body in place (after coercing it with ``str()``).

    Returns the same ``review`` object.
    """
    body_text = str(review.review_body)
    review.review_body = body_text.lower()
    return review

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np

def BagOfNTopWords(review, n):
    """Store the ``n`` most frequent unigrams of the body on the review.

    Reviews with fewer than five words are returned unchanged (the attribute
    is not set). Otherwise the body is vectorised with ``CountVectorizer``,
    terms are ranked by count (highest first, ties keep the vectorizer's
    feature order) and the top ``n`` are stored space-joined in
    ``review.bag_of_words_top_10``. The attribute name is fixed and does not
    follow ``n``.

    Returns the same ``review`` object.
    """
    if review.ReviewBodyWordCount() < 5:
        return review

    countVec = CountVectorizer(ngram_range=(1, 1))

    counts = countVec.fit_transform([review.review_body]).toarray()[0]

    # vocabulary_ maps term -> column index; ordering the terms by index gives
    # the feature list without relying on get_feature_names(), which newer
    # scikit-learn releases no longer provide.
    vocabulary = countVec.vocabulary_
    features = sorted(vocabulary, key=vocabulary.get)

    # Keep counts numeric while ranking. Stacking counts and terms into one
    # numpy array turned the counts into strings, so "9" ranked above "10".
    ranked = sorted(zip(counts, features), key=lambda pair: pair[0], reverse=True)

    review.bag_of_words_top_10 = " ".join(term for _, term in ranked[:n])

    return review


In [17]:
def ScaledPos(review):
    """Run ``review.TaggedReviewBody()`` for its effect on the review.

    The call's return value is discarded.
    NOTE(review): presumably the method stores part-of-speech features on the
    object -- confirm in ReviewClass.

    Returns the same ``review`` object.
    """
    _ = review.TaggedReviewBody()
    return review

In [18]:
def NumOfChar(review):
    """Store ``review.ReviewBodyCharCount()`` in ``review.num_of_char``.

    Returns the same ``review`` object.
    """
    char_total = review.ReviewBodyCharCount()
    review.num_of_char = char_total
    return review

In [19]:
def NumOfWords(review):
    """Store ``review.ReviewBodyWordCount()`` in ``review.num_of_words``.

    Returns the same ``review`` object.
    """
    word_total = review.ReviewBodyWordCount()
    review.num_of_words = word_total
    return review

In [20]:
def NumOfCapitals(review):
    """Count the upper-case characters of the body into ``review.num_of_capital``.

    Returns the same ``review`` object.
    """
    review.num_of_capital = sum(1 for character in review.review_body if character.isupper())
    return review

In [21]:
def NumOfCaptialWords(review):
    """Count whitespace-separated tokens that are fully upper-case.

    A token counts when ``str.isupper`` is true for it. The total is stored in
    ``review.num_of_capital_words``.

    Returns the same ``review`` object.
    """
    tokens = review.review_body.split()
    review.num_of_capital_words = sum(1 for token in tokens if token.isupper())
    return review

In [22]:
def NumOfPunctuation(review):
    """Count ASCII punctuation characters of the body into ``review.num_of_punctuation``.

    The character set is ``string.punctuation``, which is exactly the set the
    hand-written literal spelled out (backslash included). The literal used
    the invalid escape sequence ``\\]``, which newer Python versions warn about.

    Returns the same ``review`` object.
    """
    import string  # local import keeps this cell runnable on its own

    punctuations = string.punctuation

    review.num_of_punctuation = sum(1 for character in review.review_body if character in punctuations)

    return review

In [23]:
def NumOfWordsInQuote(review):
    """Count quoted spans in the body into ``review.num_of_words_in_quote``.

    A span is the shortest run enclosed in a pair of double quotes or a pair
    of single quotes. Despite the attribute name, the value is the number of
    such spans, not the number of words inside them. Apostrophes count as
    single quotes, so two of them can form a span.

    Returns the same ``review`` object.
    """
    quoted_spans = re.findall('".*?"|\'.*?\'', review.review_body)

    review.num_of_words_in_quote = len(quoted_spans)

    return review

In [24]:
def NumOfUniqueWord(review):
    """Count distinct whitespace-separated tokens of the body.

    The comparison is case-sensitive. The total is stored in
    ``review.num_of_unique_word``.

    Returns the same ``review`` object.
    """
    distinct_tokens = {token for token in review.review_body.split()}
    review.num_of_unique_word = len(distinct_tokens)
    return review

In [25]:
def AvgWordLength(review):
    """Store characters-per-word in ``review.avg_word_length``.

    Reads ``review.num_of_char`` and ``review.num_of_words``, which must have
    been set beforehand. A review without words gets 0.0 instead of raising
    ZeroDivisionError.

    Returns the same ``review`` object.
    """
    if review.num_of_words:
        review.avg_word_length = review.num_of_char / review.num_of_words
    else:
        # no words -> no meaningful average
        review.avg_word_length = 0.0

    return review

In [26]:
def RatioUniqueWords(review):
    """Store the share of distinct words in ``review.ratio_unique_words``.

    Reads ``review.num_of_unique_word`` and ``review.num_of_words``, which
    must have been set beforehand. A review without words gets 0.0 instead of
    raising ZeroDivisionError.

    Returns the same ``review`` object.
    """
    if review.num_of_words:
        review.ratio_unique_words = review.num_of_unique_word / review.num_of_words
    else:
        # no words -> ratio is undefined; use 0.0
        review.ratio_unique_words = 0.0

    return review

In [27]:
def RatioStopWords(review):
    """Store the share of stopwords in ``review.ratio_stopwords``.

    Reads ``review.num_of_stopwords`` and ``review.num_of_words``, which must
    have been set beforehand. A review without words gets 0.0 instead of
    raising ZeroDivisionError.

    Returns the same ``review`` object.
    """
    if review.num_of_words:
        review.ratio_stopwords = review.num_of_stopwords / review.num_of_words
    else:
        # no words -> ratio is undefined; use 0.0
        review.ratio_stopwords = 0.0

    return review

In [28]:
def NumOfStopWords(review):
    """Count the stopwords of the body into ``review.num_of_stopwords``.

    The language is detected with ``langdetect``: French and German use their
    own NLTK stopword lists, every other result falls back to English. The
    body is tokenised with ``word_tokenize`` and tokens are matched
    case-sensitively against the list.

    Returns the same ``review`` object.
    """
    # Detect once: the previous `if fr` / `if de` / `else` chain sent French
    # text into the `else` branch (English list), and calling detect() twice
    # could give two different answers for the same text.
    detected = detect(review.review_body)

    if detected == 'fr':
        stop_words = stopwords.words('french')
    elif detected == 'de':
        stop_words = stopwords.words('german')
    else:
        stop_words = stopwords.words('english')

    stop_words = set(stop_words)
    word_tokens = word_tokenize(review.review_body)

    review.num_of_stopwords = sum(1 for token in word_tokens if token in stop_words)

    return review


In [29]:
from transformers import pipeline

def Sentiment(review, classifier):
    """Attach a sentiment score and label to the review.

    The first 500 characters of the body are translated to English with
    ``GoogleTranslator`` (source language auto-detected) and passed to
    ``classifier``. The first prediction's ``'score'`` is stored in
    ``review.sen_score``; its ``'label'`` is mapped to ``review.sen_label``:
    "NEGATIVE" -> -1, "POSITIVE" -> 1, anything else -> 0.

    Parameters:
        review: object with a sliceable ``review_body``.
        classifier: callable returning a list whose first item is a mapping
            with ``'score'`` and ``'label'`` keys.

    Returns the same ``review`` object.
    """
    # Imported here so the function does not depend on a later notebook cell
    # having been executed first.
    from deep_translator import GoogleTranslator

    english_text = GoogleTranslator(source='auto', target='en').translate(review.review_body[0:500])

    # Run the model once and read both fields from the same prediction
    # (it was previously invoked twice per review).
    prediction = classifier(english_text)[0]
    label = prediction['label']

    review.sen_score = prediction['score']
    if label == "NEGATIVE":
        review.sen_label = -1
    elif label == "POSITIVE":
        review.sen_label = 1
    else:
        review.sen_label = 0

    return review

In [30]:
from data_cleaning import sent_tokenize

def sent_count(text):
    """Return how many sentences ``sent_tokenize`` finds in ``text``."""
    sentences = sent_tokenize(text)
    return len(sentences)

def AvgSentLength(review):
    """Store the average number of word tokens per sentence.

    Sentences come from ``sent_tokenize`` and tokens from ``word_tokenize``,
    both applied to the review body. The result goes to
    ``review.avg_sent_length``. A body in which no sentence is found gets 0.0
    instead of raising ZeroDivisionError.

    Returns the same ``review`` object.
    """
    sentence_total = len(sent_tokenize(review.review_body))
    token_total = len(word_tokenize(review.review_body))

    if sentence_total == 0:
        review.avg_sent_length = 0.0
    else:
        review.avg_sent_length = token_total / sentence_total

    return review

In [31]:
from deep_translator import GoogleTranslator
from langdetect import detect
import copy

def DoubleTranslate(review):
    """Create a paraphrased copy of the review by round-trip translation.

    English bodies go en -> fr -> en. Any other detected language goes to
    English (source auto-detected by the translator) and back to the detected
    language. The original review is not modified; the round-tripped text is
    placed on a deep copy.

    Returns a two-item list ``[review, translated_review]``.
    """
    body = review.review_body

    # Detect once and reuse the answer. Detecting again for the return leg
    # could yield a different language than the one that selected this branch.
    detected = detect(body)

    if detected == 'en':
        translated = GoogleTranslator(source='en', target='fr').translate(body)
        translated_back = GoogleTranslator(source='fr', target='en').translate(translated)
    else:
        translated = GoogleTranslator(source='auto', target='en').translate(body)
        translated_back = GoogleTranslator(source='en', target=detected).translate(translated)

    translated_review = copy.deepcopy(review)
    translated_review.review_body = translated_back

    return [review, translated_review]

In [34]:
# Sentiment model shared by all records; the Sentiment lambda below captures it.
classifier = pipeline('sentiment-analysis')

# Stages run in the order listed: feature extraction on the raw text first,
# then text cleaning, then bag-of-words / sentiment / augmentation.
# NOTE(review): RemovePunctuation runs before RemoveASCII and RemoveBreaklines,
# so '<', '>', '/', '&' and ';' are already gone from the body when those two
# stages look for '<br />' and HTML entities -- confirm the intended order.
# NOTE(review): RatioStopWords is defined above but not part of this chain.
cleaned_reviews = (
    review_rdd
    # x == x is False only for NaN-like values; presumably this drops rows
    # whose review_date is missing.
    .filter(lambda review: review.review_date == review.review_date)
    .map(ScaledPos)
    .map(NumOfChar)
    .map(NumOfWords)
    .map(NumOfCapitals)
    .map(NumOfCaptialWords)
    .map(NumOfPunctuation)
    .map(NumOfWordsInQuote)
    .map(NumOfUniqueWord)
    .map(NumOfStopWords)
    .map(AvgWordLength)
    .map(AvgSentLength)
    .map(RatioUniqueWords)
    .map(RemovePunctuation)
    .map(RemoveASCII)
    .map(RemoveBreaklines)
    .map(RemoveAccentsFromHeadline)
    .map(RemoveAccentsFromBody)
    .map(RemoveStopwords)
    .map(Stemming)
    .map(Lower)
    .map(lambda review: BagOfNTopWords(review, 10))
    .map(lambda review: Sentiment(review, classifier))
    # each review yields itself plus a round-trip-translated copy
    .flatMap(DoubleTranslate)
    .collect()
)
    

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)
[Stage 0:===>                                                     (7 + 1) / 100]

In [None]:
import pandas as pd

# One row per review, one column per instance attribute
review_rows = [vars(cleaned_review) for cleaned_review in cleaned_reviews]
df = pd.DataFrame(review_rows)

df.to_csv("cleaned_reviews_with_double_translations.csv", index=False)

In [None]:
from cProfile import label
import os
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import matplotlib.pyplot as plt

# Run on the GPU when PyTorch reports one, otherwise stay on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
# print(f"Using {device} device")


# neural network
class NeuralNetwork(nn.Module):
    """Fully connected classifier: ``no_features`` inputs, two output logits.

    The network is five linear layers (four hidden layers of width 512) with
    a ReLU after each layer except the last. No activation is applied to the
    output.
    """

    def __init__(self, no_features):
        super().__init__()

        hidden_width = 512

        # Layers are created in network order, so the Sequential indices
        # (0..8) match the layout described in the class docstring.
        layers = [nn.Linear(no_features, hidden_width), nn.ReLU()]
        for _ in range(3):
            layers.append(nn.Linear(hidden_width, hidden_width))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(hidden_width, 2))
        # nn.Sigmoid() is intentionally not appended

        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Cast ``x`` to float32 and return the raw logits."""
        return self.linear_relu_stack(x.float())


# train the NN
def train(dataloader, model, loss_fn, optimizer):
    """Run one training epoch and return the epoch's mean loss.

    For every batch: move it to ``device``, compute the loss, back-propagate
    and step the optimizer. The mean of the per-batch accuracies is printed.

    The returned loss is the sample-weighted mean over all batches, computed
    the same way as in ``eval`` so training and validation losses are
    comparable. It used to be the loss of the last batch only.

    Parameters:
        dataloader: iterable of ``(X, y)`` batches.
        model: the network to optimise; switched to training mode.
        loss_fn: callable ``(pred, y) -> loss tensor``.
        optimizer: optimizer bound to ``model``'s parameters.

    Returns:
        float: mean training loss per sample for this epoch.
    """
    model.train()
    acc = 0
    total_loss = 0.0
    total_size = 0

    for X, y in dataloader:
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # flat_accuracy converts to numpy, which needs CPU tensors
        acc += flat_accuracy(pred.detach().cpu(), y.detach().cpu())

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * len(y)
        total_size += len(y)

    print('train accuracy = ', acc/len(dataloader))
    return total_loss / total_size


# evaluate on different dataset
def eval(dataloader, model, loss_fn):
    """Evaluate ``model`` on ``dataloader`` and return the mean loss per sample.

    The model is switched to evaluation mode and the forward passes run under
    ``torch.no_grad()``, so no autograd graph is built. The mean of the
    per-batch accuracies is printed.

    Note: this function shadows the built-in ``eval`` in this notebook.

    Parameters:
        dataloader: iterable of ``(X, y)`` batches.
        model: the network to evaluate.
        loss_fn: callable ``(pred, y) -> loss tensor``.

    Returns:
        float: sample-weighted mean loss over all batches.
    """
    model.eval()
    total_loss = 0
    total_size = 0
    acc = 0

    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)

            # Compute prediction error
            pred = model(X)
            loss = loss_fn(pred, y).item()
            total_loss += loss * len(y)
            total_size += len(y)

            # flat_accuracy converts to numpy, which needs CPU tensors
            acc += flat_accuracy(pred.cpu(), y.cpu())

    total_loss /= total_size
    print('val accuracy = ', acc/len(dataloader))
    return total_loss

def flat_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max prediction equals the label.

    Parameters:
        preds: tensor of per-class scores; the arg-max is taken over axis 1.
        labels: tensor of integer class labels, one per row of ``preds``.

    Both tensors are detached and moved to the CPU before the numpy
    conversion, so the function also works when they live on a GPU or still
    require grad.

    Returns:
        numpy float in [0, 1].
    """
    pred_flat = np.argmax(preds.detach().cpu().numpy(), axis=1).flatten()
    labels_flat = labels.detach().cpu().numpy().flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


import torch.utils.data as data
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

def Net(csv_path='cleaned_reviews_with_double_translations.csv',
        model_path='best model J',
        n_epochs=15,
        batch_size=16,
        learning_rate=1e-3):
    """Train the feed-forward classifier on the engineered review features.

    Reads the feature CSV, builds the feature matrix, holds out 20 % of the
    rows for validation (fixed ``random_state=42``), trains ``NeuralNetwork``
    with Adam and cross-entropy loss, and saves the whole model to
    ``model_path`` whenever the validation loss improves.

    Calling ``Net()`` without arguments behaves as before; the defaults are
    the previously hard-coded values.

    Parameters:
        csv_path: CSV with the feature columns listed below plus ``label``.
        model_path: file the best model is written to with ``torch.save``.
        n_epochs: number of passes over the training split.
        batch_size: batch size for both data loaders.
        learning_rate: Adam learning rate.

    Returns:
        None. Progress is printed per epoch.

    NOTE(review): only ``lang`` has its NaNs replaced; a NaN in any other
    feature column goes into the network unchanged.
    NOTE(review): the training loader does not reshuffle between epochs.
    """
    # get dataframes and subtract values
    df_train = pd.read_csv(csv_path)

    # 'Y' -> 1, everything else -> 0
    df_train['verified_purchase'] = df_train['verified_purchase'].map(lambda x: 1 if x == 'Y' else 0)
    verified_purchase = np.array(df_train['verified_purchase'])

    # Columns used as plain float features, in the order they enter the matrix
    float_columns = [
        'avg_sent_length',
        'pos_verb_ratio',
        'pos_propn_ratio',
        'pos_aux_ratio',
        'pos_adp_ratio',
        'pos_noun_ratio',
        'pos_num_ratio',
        'num_of_char',
        'num_of_words',
        'num_of_capital',
        'num_of_capital_words',
        'num_of_punctuation',
        'num_of_words_in_quote',
        'num_of_unique_word',
        'num_of_stopwords',
        'avg_word_length',
        'ratio_unique_words',
    ]

    feature_arrays = [verified_purchase]
    feature_arrays += [np.array(df_train[column].astype(float)) for column in float_columns]

    # Missing language codes become 0.0 (nan_to_num's default fill value). The
    # positional 0 passed here before was the `copy` flag, not the fill value.
    feature_arrays.append(np.nan_to_num(np.array(df_train['lang'].astype(float))))
    feature_arrays.append(np.array(df_train['sen_score'].astype(float)))
    feature_arrays.append(np.array(df_train['sen_label'].astype(float)))

    X_train = np.column_stack(feature_arrays)
    y_train = df_train['label'].astype(int)

    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

    train_pairs = list(zip(X_train, y_train))
    valid_pairs = list(zip(X_val, y_val))

    train_data = data.DataLoader(train_pairs, batch_size=batch_size, num_workers=2)
    valid_data = data.DataLoader(valid_pairs, batch_size=batch_size, num_workers=2)

    # initialize model, loss function and optimizer
    # The input width follows the feature matrix instead of a separate constant
    # that has to be kept in sync with the column list by hand.
    model = NeuralNetwork(no_features=X_train.shape[1]).to(device)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    best_total_loss = float('inf')

    train_losses = []
    val_losses = []
    epochs = []

    for epoch in range(n_epochs):
        epochs.append(epoch)
        print('epoch: ', epoch)

        # train on training set
        train_loss = train(train_data, model, loss_fn, optimizer)
        train_losses.append(train_loss)
        print('train loss: ', train_loss)

        # evaluate the trained model on validation set
        total_loss = eval(valid_data, model, loss_fn)
        val_losses.append(total_loss)
        print('val loss: ', total_loss)
        print('\n')

        # if loss is lower, save new best model
        if total_loss < best_total_loss:
            best_total_loss = total_loss
            torch.save(model, model_path)


        # # test best model on test set
        # best_model = torch.load('best model J')
        # total_loss = eval(test_data, model, loss_fn)
        # print('test_loss: ', total_loss)

        # plt.plot(epochs, train_losses, label='train')
        # plt.plot(epochs, val_losses, label='validation')
        # plt.xlabel('epoch')
        # plt.ylabel('loss')
        # plt.legend()
        # plt.show()

# Net()