In [52]:
import os

# import gensim
import matplotlib.pyplot as plt
import nltk
import pandas as pd
import unicodedata
import nltk

from nltk import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer

from contractions import CONTRACTION_MAP
import re
#from nltk.book import texts
from autocorrect import Speller
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from nltk.stem import WordNetLemmatizer


def preprocess_text(text):
    def character_normalization(text):
        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        return text

    def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
        contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
                                          flags=re.IGNORECASE | re.DOTALL)

        def expand_match(contraction):
            match = contraction.group(0)
            first_char = match[0]
            expanded_contraction = contraction_mapping.get(match) \
                if contraction_mapping.get(match) \
                else contraction_mapping.get(match.lower())
            expanded_contraction = first_char + expanded_contraction[1:]
            return expanded_contraction

        expanded_text = contractions_pattern.sub(expand_match, text)
        expanded_text = re.sub("'", "", expanded_text)
        return expanded_text

    def remove_extra_new_lines(text):
        return re.sub(r'[\r|\n|\r\n]+', ' ', text)

    def case_conversion(text):
        return text.lower()

    def autocorrect(text):
        spell = Speller(fast=True)
        return spell(text)

    tokenizer = ToktokTokenizer()
    stopword_list = nltk.corpus.stopwords.words('english')

    def remove_stopwords(text, is_lower_case=False, stopwords=stopword_list):
        tokens = tokenizer.tokenize(text)
        tokens = [token.strip() for token in tokens]
        if is_lower_case:
            filtered_tokens = [token for token in tokens if token not in stopwords]
        else:
            filtered_tokens = [token for token in tokens if token.lower() not in stopwords]
        filtered_text = ' '.join(filtered_tokens)
        return filtered_text

    def simple_stemmer(text):
        ps = nltk.porter.PorterStemmer()
        text = ' '.join([ps.stem(word) for word in text.split()])
        return text
    

    def lemmatize_text(text):
        lemmatizer = WordNetLemmatizer()
        return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])


    def special_char_removal(text, remove_digits=False):
        def remove_special_characters(text, remove_digits):
            pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
            text = re.sub(pattern, '', text)
            return text

        special_char_pattern = re.compile(r'([{.(-)!}])')
        text = special_char_pattern.sub(" \\1 ", text)
        return remove_special_characters(text, remove_digits)

    def extra_white_space_removal(text):
        return re.sub(' +', ' ', text)

    # Apply the preprocessing steps
    text = character_normalization(text)
    text = expand_contractions(text)
    text = case_conversion(text)
    text = remove_extra_new_lines(text)
    # text = simple_stemmer(text)
    text = lemmatize_text(text)
    text = special_char_removal(text, remove_digits=True)
    text = extra_white_space_removal(text)
    text = remove_stopwords(text, is_lower_case=True, stopwords=stopword_list)
    text = autocorrect(text)

    return text


def text_to_df(file_path):
    # reading the data
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Split each line on the tab character
            parts = line.strip().split('\t')
            if len(parts) == 2:  # Ensure the line has exactly two parts
                data.append(parts)
            # Convert the processed data into a DataFrame
    return pd.DataFrame(data)
            
# Importing data
df_amazon = text_to_df('../Data/sentiment labelled sentences/amazon_cells_labelled.txt')

df_imdb = text_to_df('../Data/sentiment labelled sentences/imdb_labelled.txt')



df_yelp = text_to_df('../Data/sentiment labelled sentences/yelp_labelled.txt')
df = pd.concat([df_yelp, df_imdb, df_amazon], ignore_index=True)
df.columns = ['Review', 'Score']
print(df)
# Preprocessing
df['Review'] = df['Review'].apply(preprocess_text) 

                                                                                       Review  \
0                                                                    Wow... Loved this place.   
1                                                                          Crust is not good.   
2                                                   Not tasty and the texture was just nasty.   
3     Stopped by during the late May bank holiday off Rick Steve recommendation and loved it.   
4                                 The selection on the menu was great and so were the prices.   
...                                                                                       ...   
2995                 The screen does get smudged easily because it touches your ear and face.   
2996                                  What a piece of junk.. I lose more calls on this phone.   
2997                                                             Item Does Not Match Picture.   
2998                          

In [53]:

# Tokenization
from nltk.tokenize import word_tokenize

# Ensure NLTK tokenizers are downloaded
nltk.download('punkt')

# Tokenize each review into a list of words
df['Tokens'] = df['Review'].apply(word_tokenize)
 

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/janjelinek/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [54]:
df  

Unnamed: 0,Review,Score,Tokens
0,wow loved place,1,"[wow, loved, place]"
1,crust good,0,"[crust, good]"
2,tasty texture wa nasty,0,"[tasty, texture, wa, nasty]"
3,stopped late may bank holiday rick steve recommendation loved,1,"[stopped, late, may, bank, holiday, rick, steve, recommendation, loved]"
4,selection menu wa great prices,1,"[selection, menu, wa, great, prices]"
...,...,...,...
2995,screen doe get smudged easily touch ear face,0,"[screen, doe, get, smudged, easily, touch, ear, face]"
2996,piece junk lose call phone,0,"[piece, junk, lose, call, phone]"
2997,item doe match picture,0,"[item, doe, match, picture]"
2998,thing disappoint infra red port rida,0,"[thing, disappoint, infra, red, port, rida]"


In [55]:
# glove_pretrained = gensim.downloader.load('glove-wiki-gigaword-300')


In [56]:
import numpy as np

# Load the GloVe embeddings
def load_glove_embeddings(file_path):
    embeddings_index = {}
    with open(file_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

glove_embeddings = load_glove_embeddings('glove.6B.300d.txt')

# Convert text to GloVe embeddings
def get_glove_embedding(text, embeddings, embedding_dim=300):
    words = text.split()
    embedding = np.zeros(embedding_dim)
    count = 0
    for word in words:
        if word in embeddings:
            embedding += embeddings[word]
            count += 1
    if count > 0:
        embedding /= count
    return embedding

# Convert tokens to GloVe embeddings
df['Embedding'] = df['Tokens'].apply(lambda tokens: get_glove_embedding(' '.join(tokens), glove_embeddings))
X = np.vstack(df['Embedding'].values)
y = df['Score'].astype(int).values  # Convert to a NumPy array



In [57]:
from sklearn.model_selection import train_test_split

# Separating training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [58]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Define the model
rf = RandomForestClassifier(random_state=42)

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

# GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, scoring='accuracy', verbose=2, n_jobs=-1)

# Train-test split
from sklearn.model_selection import train_test_split
y = df['Score'].astype(int).values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the GridSearchCV
grid_search.fit(X_train, y_train)

# Best parameters and accuracy
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Accuracy: {grid_search.best_score_}")

# Evaluate on test set
best_rf = grid_search.best_estimator_
accuracy = best_rf.score(X_test, y_test)
print(f"Test Set Accuracy: {accuracy}")

Fitting 3 folds for each of 162 candidates, totalling 486 fits
Best Parameters: {'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 200}
Best Cross-Validation Accuracy: 0.7904166666666667
Test Set Accuracy: 0.7933333333333333
