In [4]:
import pandas as pd
import fasttext
from typing import List, Callable, Tuple
import seaborn as sns
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from tqdm import tqdm
import os
import contractions
import concurrent.futures
import re
import keras
from keras import models,layers,optimizers
from keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from bs4 import BeautifulSoup
from keras.layers import TextVectorization, Input, Conv1D

In [1]:
BASE_FOLDER = 'resources'
PREPROCESSED_DATA = 'preprocessed'
DATASET = f'{BASE_FOLDER}/dataset.csv'
ONE_LANGUAGE_DATASET = f'{BASE_FOLDER}/{PREPROCESSED_DATA}/only_english_dataset.csv'
PREPROCESSED_DATASET = f'{BASE_FOLDER}/{PREPROCESSED_DATA}/preprocessed_dataset.csv'
COLUMN_TO_CONSIDER = 'sentiment'
MAX_STR_LENGTH = 300

In [11]:
def apply_concurrent(function: Callable[[str], str], workers:int = os.cpu_count()):
  def _preprocess_series(series: pd.Series):
    with concurrent.futures.ThreadPoolExecutor(workers) as executor:
        return pd.Series(list(executor.map(function, series.values)))
  return _preprocess_series

def compute_languages():
  model = fasttext.load_model('resources/lid.176.bin')
  def _compute_languages(texts: pd.Series):
    return texts.apply(lambda x:model.predict(x)[0][0])
  return _compute_languages

def remove_not_modal_languages(extracted_languages: pd.Series):
  def _remove_languages(dataframe: pd.DataFrame):
    modal_language = extracted_languages.value_counts().index[0]
    return dataframe[extracted_languages == modal_language].reset_index(drop=True)
  return _remove_languages

def remove_lxml():
    def _remove_lxml(text):
        return BeautifulSoup(text, 'lxml').get_text().strip()
    return _remove_lxml

def expand():
    def _expand(text):
        return contractions.fix(text)
    return _expand
    
def map_text(mapping_function):
    def _map_text(text):
        return mapping_function(text)
    return _map_text

def remove_links():
    def _remove_links(text):
        return re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', "", text)
    return _remove_links
  
def remove_mails():
    def _remove_mails(text):
        return re.sub(r'([a-z0-9+._-]+@[a-z0-9+._-]+\.[a-z0-9+_-]+)', "", text)
    return _remove_mails      


def generic_regex(pattern, repl):
    def _generic_regex(text):
        return re.sub(pattern, repl, text)
    return _generic_regex

nltk.download('stopwords')

def stopwords_list(language='english'):
    return stopwords.words(language)


def preprocess():
    stopword_regex = r'\b(' + r'|'.join(stopwords_list()) + r')\b\s*'
    remove_lxml_fn = remove_lxml()
    def _preprocess(text):
        functions = [
            map_text(lambda x: x.lower()),
            remove_links(), # remove links
            remove_mails(), # remove mails
            remove_lxml_fn,
            #expand(),
            generic_regex('[^a-z]', ' '), # remove all non chars
            generic_regex(r'\s+[a-z]\s+', ' '), # remove all isolated characters
            generic_regex(stopword_regex, ''), # remove stopwords
            generic_regex(r'\s+', ' ') # remove extra white spaces
        ]
        for function in functions:
            text = function(text)   
        return text
    return _preprocess

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gardo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
dataset = pd.read_csv('resources/train.csv')
dataset.columns = ['sentiment', 'title', 'text']
dataset = dataset.head(500_000)
dataset['text'] = dataset['title'] + " " + dataset['text']
del dataset['title']
dataset = dataset.dropna()
languages = compute_languages()(dataset['text'])
dataset = remove_not_modal_languages(languages)(dataset)
dataset = dataset.head(500_000)



In [32]:
def normalise(texts):
    return [re.compile(r"[^a-z0-1\s]").sub(r" ",re.compile(r"['\W']").sub(r" ",text.lower())) for text in texts]

In [34]:
dataset['text'] = normalise(dataset['text'].values)

In [4]:
if not os.path.exists(ONE_LANGUAGE_DATASET):
    print('Creating one language dataset')
    dataset = pd.read_csv(DATASET)
    dataset.columns = ['stars', 'title', 'text']
    
    dataset['text'] = dataset['title'] + " " + dataset['text']
    del dataset['title']
    
    dataset = dataset.dropna()
    
    languages = compute_languages()(dataset['text'])
    dataset = remove_not_modal_languages(languages)(dataset)
    
    dataset['sentiment'] = dataset['stars'].replace({1: 1, 2: 1, 3: 2, 4: 3, 5: 3})
    
    dataset.to_csv(ONE_LANGUAGE_DATASET, index=False)
    print('One language dataset saved')
else:
    dataset = pd.read_csv(ONE_LANGUAGE_DATASET)
    print('One language dataset loaded')

One language dataset loaded


In [5]:
dataset

Unnamed: 0,stars,text,sentiment
0,5,Inspiring I hope a lot of people hear this cd....,3
1,5,The best soundtrack ever to anything. I'm read...,3
2,4,Chrono Cross OST The music of Yasunori Misuda ...,3
3,5,Too good to be true Probably the greatest soun...,3
4,5,There's a reason for the price There's a reaso...,3
...,...,...,...
2991668,1,Don't do it!! The high chair looks great when ...,1
2991669,2,"Looks nice, low functionality I have used this...",1
2991670,2,"compact, but hard to clean We have a small hou...",1
2991671,3,Hard to clean! I agree with everyone else who ...,2


In [58]:
if not os.path.exists(PREPROCESSED_DATASET):
    print('Preprocessing dataset')
    preprocessed_values = [preprocess()(text) for text in tqdm(dataset['text'], 
                                                               position=0, 
                                                               leave=True)]
    dataset['text'] = preprocessed_values
    dataset.to_csv(PREPROCESSED_DATASET, index=False)
    print('Preprocessed dataset saved')
else:
    dataset = pd.read_csv(PREPROCESSED_DATASET)
    print('Preprocessed dataset loaded')

Preprocessed dataset loaded


In [59]:
dataset['text'][:3]

0    inspiring hope lot of people hear this cd  we ...
1    the best soundtrack ever to anything m reading...
2    chrono cross ost the music of yasunori misuda ...
Name: text, dtype: object

In [60]:
# # Create and print a Reviews length distribution graph.
# review_length_distribution_plt = pd.DataFrame(dataset["text"].str.len())
# review_length_distribution_plt = review_length_distribution_plt[review_length_distribution_plt['text'] < 5000]
# review_length_distribution_plt.groupby(["text"])
# review_length_distribution_plt = review_length_distribution_plt.plot(kind='hist', 
#                                                                      legend=None, 
#                                                                      bins=50, 
#                                                                      figsize=(12, 6))
# review_length_distribution_plt.set_xlabel("Review Length")
# review_length_distribution_plt.set_ylabel("Count")

In [35]:
n_classes = len(set(dataset[COLUMN_TO_CONSIDER]))
X = dataset['text'].astype(str).values
Y = (dataset[COLUMN_TO_CONSIDER] - 1).values
if n_classes == 2:
    n_classes = 1
    activation_function = 'sigmoid'
    loss = 'binary_crossentropy'
    y = Y
else:
    activation_function = 'softmax'
    loss = 'categorical_crossentropy'
    y = keras.utils.to_categorical(dataset[COLUMN_TO_CONSIDER] - 1, num_classes=n_classes)

# train_x, test_x, train_y, test_y = train_test_split(X, y, 
#                                                     test_size=0.2, 
#                                                     stratify=y, 
#                                                     random_state=1234)

train_x, validation_x, train_y, validation_y = train_test_split(train_x, train_y, 
                                                    test_size=0.2, 
                                                    stratify=train_y, 
                                                    random_state=1234)

In [41]:
# train_size = len(train_y)
# validation_size = len(validation_y)
# test_size = len(test_y)
# sizes = [train_size, validation_size, test_size]
# print(f'Train size: {train_size}, validation size: {validation_size}, test size: {test_size}')
# plt.figure(figsize=(8, 4))
# plt.barh([''], [100], color='white')  # Create a white bar for the total percentage
# plt.barh([''], [train_size], label='Train', color='blue')
# plt.barh([''], [validation_size], left=train_size, label='Validation', color='green')
# plt.barh([''], [test_size], left=train_size+validation_size, label='Test', color='red')
# 
# plt.title('Dataset divisions')
# plt.legend(loc='upper right')
# plt.xticks([])
# # Display the chart
# plt.show()

In [36]:
mlen = max(len(train_ex) for train_ex in train_x)
vectorize_layer = TextVectorization(max_tokens=10_000, output_sequence_length=mlen)
vectorize_layer.adapt(train_x)

In [41]:
# tokenizer=Tokenizer(num_words=1_000)
# tokenizer.fit_on_texts(train_x)
# train_texts = tokenizer.texts_to_sequences(train_x)
# val_texts = tokenizer.texts_to_sequences(validation_x)
# test_texts = tokenizer.texts_to_sequences(test_x)

In [42]:
# mlen = max(len(train_ex) for train_ex in train_texts)
# train_texts = pad_sequences(train_texts, maxlen=mlen)
# val_texts = pad_sequences(val_texts, maxlen=mlen)
# test_texts = pad_sequences(test_texts, maxlen=mlen)

In [39]:
from keras.src.layers import Embedding, BatchNormalization, MaxPooling1D, GlobalMaxPooling1D, Activation, Dense, Dropout
from keras import Sequential


def build_model(n_classes):
    model = Sequential([
        Input(shape=(1,), dtype='string'),
        vectorize_layer,
        Embedding(10_000, 64),
        
        Conv1D(256, 3),
        BatchNormalization(),
        Activation('relu'),
        MaxPooling1D(3),
        
        Conv1D(128, 5),
        Activation('relu'),
        BatchNormalization(),
        MaxPooling1D(5),
        
        Conv1D(64, 5),
        Activation('relu'),
        BatchNormalization(),
        GlobalMaxPooling1D(),
        
        Dense(128, activation='relu'),
        Dropout(0.2),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(n_classes, activation=activation_function)
    ])
    model.compile(optimizer='adam',loss=loss,metrics=['binary_accuracy'])
    return model

def build_model2():
    sequences = layers.Input(shape=(1,), dtype='string')
    x = vectorize_layer(sequences)
    embedded = layers.Embedding(12000, 64)(x)
    x = layers.Conv1D(64, 3, activation='relu')(embedded)
    x = layers.BatchNormalization()(x)
    x = layers.MaxPool1D(3)(x)
    x = layers.Conv1D(64, 5, activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.MaxPool1D(5)(x)
    x = layers.Conv1D(64, 5, activation='relu')(x)
    x = layers.GlobalMaxPool1D()(x)
    x = layers.Flatten()(x)
    x = layers.Dense(100, activation='relu')(x)
    predictions = layers.Dense(1, activation='sigmoid')(x)
    model = models.Model(inputs=sequences, outputs=predictions)
    model.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['accuracy'])
    return model

In [40]:
from keras.src.callbacks import EarlyStopping

with tf.device('/GPU:0'):
    model = build_model2(n_classes)
    model.fit(train_x, train_y,
              batch_size=128,
              epochs=100,
              validation_data=(validation_x, validation_y),
              callbacks=[EarlyStopping(monitor='val_loss', patience=10, verbose=1)]
)

Epoch 1/100
  29/2493 [..............................] - ETA: 10:39 - loss: 0.9432 - binary_accuracy: 0.5202

KeyboardInterrupt: 