In [93]:
import os

# import gensim
import matplotlib.pyplot as plt
# import nltk
import pandas as pd
import unicodedata
import nltk

from nltk import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer

from contractions import CONTRACTION_MAP
import re
#from nltk.book import texts
from autocorrect import Speller
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from nltk.stem import WordNetLemmatizer


def preprocess_text(text):
    """Clean one raw review string and return it as a space-separated string.

    The steps run in this fixed order:

    1. fold to ASCII (NFKD decomposition, non-ASCII dropped)
    2. expand contractions using ``CONTRACTION_MAP``, then drop apostrophes
    3. lower-case
    4. replace runs of CR/LF with a single space
    5. lemmatize each whitespace-separated word
    6. remove every character that is not an ASCII letter or whitespace
       (digits are removed because ``remove_digits=True`` is passed)
    7. collapse runs of spaces
    8. remove English stopwords
    9. run the autocorrect speller over the result

    Parameters
    ----------
    text : str
        Raw text. A non-``str`` value makes ``unicodedata.normalize`` raise
        ``TypeError``.

    Returns
    -------
    str
        The cleaned text.
    """
    # The speller, tokenizer, stopword list and lemmatizer do not depend on
    # `text`, so they are built on the first call only and kept on the function
    # object. The dict is assigned in one step so that a failure while building
    # it does not leave a half-filled cache behind.
    resources = getattr(preprocess_text, '_resources', None)
    if resources is None:
        resources = {
            'tokenizer': ToktokTokenizer(),
            'stopwords': frozenset(nltk.corpus.stopwords.words('english')),
            'lemmatizer': WordNetLemmatizer(),
            'speller': Speller(fast=True),
        }
        preprocess_text._resources = resources

    tokenizer = resources['tokenizer']
    stopword_list = resources['stopwords']

    def character_normalization(text):
        # NFKD splits accented letters into base letter + combining mark; the
        # ASCII encode with 'ignore' then discards everything non-ASCII.
        return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
        if contraction_mapping:
            # Longest keys first so a short key cannot pre-empt a longer key
            # that starts with it; keys are escaped so they match literally.
            ordered_keys = sorted(contraction_mapping, key=len, reverse=True)
            contractions_pattern = re.compile(
                '({})'.format('|'.join(re.escape(key) for key in ordered_keys)),
                flags=re.IGNORECASE | re.DOTALL)

            def expand_match(contraction):
                match = contraction.group(0)
                expanded_contraction = (contraction_mapping.get(match)
                                        or contraction_mapping.get(match.lower()))
                if not expanded_contraction:
                    # Matched case-insensitively but neither spelling is a key:
                    # leave the text as it was instead of raising.
                    return match
                # Keep the first character exactly as it was typed.
                return match[0] + expanded_contraction[1:]

            text = contractions_pattern.sub(expand_match, text)
        # Any apostrophe still present is dropped.
        return re.sub("'", "", text)

    def remove_extra_new_lines(text):
        # Inside [...] a '|' is a literal character, not alternation, so the
        # class lists only the two line-break characters.
        return re.sub(r'[\r\n]+', ' ', text)

    def case_conversion(text):
        return text.lower()

    def autocorrect(text):
        return resources['speller'](text)

    def remove_stopwords(text, is_lower_case=False, stopwords=stopword_list):
        tokens = [token.strip() for token in tokenizer.tokenize(text)]
        if is_lower_case:
            filtered_tokens = [token for token in tokens if token not in stopwords]
        else:
            filtered_tokens = [token for token in tokens if token.lower() not in stopwords]
        return ' '.join(filtered_tokens)

    def simple_stemmer(text):
        # Not part of the active pipeline (its call below is commented out).
        ps = nltk.porter.PorterStemmer()
        return ' '.join([ps.stem(word) for word in text.split()])

    def lemmatize_text(text):
        lemmatizer = resources['lemmatizer']
        return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

    def special_char_removal(text, remove_digits=False):
        def remove_special_characters(text, remove_digits):
            # 'A-Z' rather than 'A-z': the latter is a code-point range that
            # also admits [ \ ] ^ _ and the backtick.
            pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
            return re.sub(pattern, '', text)

        # Pad these punctuation marks with spaces before they are stripped so
        # the words on either side stay separate. Inside the class, '(-)' is a
        # range from '(' to ')', i.e. just the two parentheses; '-' itself is
        # not matched.
        special_char_pattern = re.compile(r'([{.(-)!}])')
        text = special_char_pattern.sub(" \\1 ", text)
        return remove_special_characters(text, remove_digits)

    def extra_white_space_removal(text):
        return re.sub(' +', ' ', text)

    # Apply the preprocessing steps
    text = character_normalization(text)
    text = expand_contractions(text)
    text = case_conversion(text)
    text = remove_extra_new_lines(text)
    # text = simple_stemmer(text)
    # NOTE(review): lemmatization runs before punctuation and stopword removal,
    # so a word with punctuation attached is lemmatized as-is, and a lemma that
    # differs from its stopword form is no longer filtered later - this looks
    # like the source of the "wa" tokens in the saved output. Confirm whether
    # the order is intentional before changing it.
    text = lemmatize_text(text)
    text = special_char_removal(text, remove_digits=True)
    text = extra_white_space_removal(text)
    # NOTE(review): the saved output shows negations being dropped here
    # ("Crust is not good." -> "crust good") - confirm this is acceptable for
    # sentiment classification.
    text = remove_stopwords(text, is_lower_case=True, stopwords=stopword_list)
    text = autocorrect(text)

    return text


def text_to_df(file_path):
    """Read a tab-separated text file into a two-column DataFrame.

    Each line is stripped of surrounding whitespace and split on tab
    characters. Only lines that yield exactly two fields are kept; every other
    line is skipped without error.

    Parameters
    ----------
    file_path : str
        Path of the UTF-8 encoded file to read.

    Returns
    -------
    pandas.DataFrame
        One row per kept line, both fields as strings, with pandas' default
        column labels.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        split_lines = (raw_line.strip().split('\t') for raw_line in handle)
        # Lines with a missing or extra tab are dropped here
        rows = [fields for fields in split_lines if len(fields) == 2]
    return pd.DataFrame(rows)
            
# Load the three labelled-sentence files (each line: sentence<TAB>label)
df_amazon = text_to_df('../Data/sentiment labelled sentences/amazon_cells_labelled.txt')
df_imdb = text_to_df('../Data/sentiment labelled sentences/imdb_labelled.txt')
df_yelp = text_to_df('../Data/sentiment labelled sentences/yelp_labelled.txt')

# Stack them as yelp, imdb, amazon under a fresh 0..n-1 index and name the columns
df = pd.concat([df_yelp, df_imdb, df_amazon], ignore_index=True)
df.columns = ['Review', 'Score']
print(df)

# Reuse a previously saved preprocessed frame when one exists on disk
pickle_file = 'preprocessed_dataframe.pkl'

if not os.path.exists(pickle_file):
    # No saved copy yet: clean every review, then persist the result
    df['Review'] = df['Review'].apply(preprocess_text)
    df.to_pickle(pickle_file)
else:
    # The saved frame replaces the raw one built above
    df = pd.read_pickle(pickle_file)

print(df)

                                                 Review Score
0                              Wow... Loved this place.     1
1                                    Crust is not good.     0
2             Not tasty and the texture was just nasty.     0
3     Stopped by during the late May bank holiday of...     1
4     The selection on the menu was great and so wer...     1
...                                                 ...   ...
2995  The screen does get smudged easily because it ...     0
2996  What a piece of junk.. I lose more calls on th...     0
2997                       Item Does Not Match Picture.     0
2998  The only thing that disappoint me is the infra...     0
2999  You can not answer calls with the unit, never ...     0

[3000 rows x 2 columns]
                                                 Review Score
0                                       wow loved place     1
1                                            crust good     0
2                                tasty textur

In [94]:
# df.to_pickle("preprocessed_dataframe.pkl")

In [95]:
# Check if the pickle file exists
pickle_file = 'preprocessed_dataframe.pkl'

# Raw input files. Defined here because the rebuild branch below needs them and
# no other visible cell assigns `file_paths`. Same three files, stacked in the
# same order (yelp, imdb, amazon), as the first load cell of this notebook.
file_paths = [
    '../Data/sentiment labelled sentences/yelp_labelled.txt',
    '../Data/sentiment labelled sentences/imdb_labelled.txt',
    '../Data/sentiment labelled sentences/amazon_cells_labelled.txt',
]

if os.path.exists(pickle_file):
    # Load the DataFrame from the pickle file.
    # NOTE: unpickling executes code from the file - only load a pickle that
    # this notebook wrote itself.
    df = pd.read_pickle(pickle_file)
else:
    # Importing data
    df_list = [text_to_df(file_path) for file_path in file_paths]
    df = pd.concat(df_list, ignore_index=True)
    df.columns = ['Review', 'Score']

    # Preprocessing
    df['Review'] = df['Review'].apply(preprocess_text)

    # Save the preprocessed DataFrame as a pickle file
    df.to_pickle(pickle_file)

print(df)

                                                 Review Score
0                                       wow loved place     1
1                                            crust good     0
2                                tasty texture wa nasty     0
3     stopped late may bank holiday rick steve recom...     1
4                        selection menu wa great prices     1
...                                                 ...   ...
2995       screen doe get smudged easily touch ear face     0
2996                         piece junk lose call phone     0
2997                             item doe match picture     0
2998               thing disappoint infra red port rida     0
2999                      answer call unit never worked     0

[3000 rows x 2 columns]


In [96]:
# import nltk
# nltk.download('all')

In [97]:

# # Tokenization
# from nltk.tokenize import word_tokenize
# 
# # Ensure NLTK tokenizers are downloaded
# nltk.download('punkt')
# 
# # Tokenize each review into a list of words
# df['Tokens'] = df['Review'].apply(word_tokenize)
 

In [98]:
# Bare last expression: the notebook renders the current `df` as the cell output.
df  

Unnamed: 0,Review,Score
0,wow loved place,1
1,crust good,0
2,tasty texture wa nasty,0
3,stopped late may bank holiday rick steve recom...,1
4,selection menu wa great prices,1
...,...,...
2995,screen doe get smudged easily touch ear face,0
2996,piece junk lose call phone,0
2997,item doe match picture,0
2998,thing disappoint infra red port rida,0


In [99]:
# glove_pretrained = gensim.downloader.load('glove-wiki-gigaword-300')


In [100]:
# import numpy as np
# 
# # Load the GloVe embeddings
# def load_glove_embeddings(file_path):
#     embeddings_index = {}
#     with open(file_path, encoding='utf-8') as f:
#         for line in f:
#             values = line.split()
#             word = values[0]
#             coefs = np.asarray(values[1:], dtype='float32')
#             embeddings_index[word] = coefs
#     return embeddings_index
# 
# glove_embeddings = load_glove_embeddings('glove.6B.300d.txt')
# 
# # Convert text to GloVe embeddings
# def get_glove_embedding(text, embeddings, embedding_dim=300):
#     words = text.split()
#     embedding = np.zeros(embedding_dim)
#     count = 0
#     for word in words:
#         if word in embeddings:
#             embedding += embeddings[word]
#             count += 1
#     if count > 0:
#         embedding /= count
#     return embedding
# 
# # Convert tokens to GloVe embeddings
# df['Embedding'] = df['Tokens'].apply(lambda tokens: get_glove_embedding(' '.join(tokens), glove_embeddings))
# X = np.vstack(df['Embedding'].values)
# y = df['Score'].astype(int).values  # Convert to a NumPy array



In [101]:
# Inputs for the model: review text as the feature, the score column as the target
X, y = df['Review'], df['Score']

# Learn the vocabulary from every review and encode each one as a row of
# token counts (sparse document-term matrix)
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(X)

In [102]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split

# Cast the labels to int BEFORE splitting. Previously this cast ran after
# train_test_split, so it never reached y_train / y_test and the model was
# fitted on whatever dtype df['Score'] happened to hold.
y = df['Score'].astype(int).values

# Separating training and testing data (80/20, fixed seed for repeatability).
# NOTE(review): X_bow was produced by fitting the vectorizer on all rows, so
# the vocabulary also reflects the held-out reviews - confirm this is
# acceptable or fit the vectorizer on the training portion only.
X_train, X_test, y_train, y_test = train_test_split(X_bow, y, test_size=0.2, random_state=42)

# Define the model
rf = RandomForestClassifier(random_state=42)

# Define the parameter grid (3 * 3 * 3 * 3 * 2 = 162 combinations)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

# GridSearchCV: 3-fold cross-validation on the training split, scored by accuracy
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)

# Fit the GridSearchCV
grid_search.fit(X_train, y_train)

# Best parameters and accuracy
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Accuracy: {grid_search.best_score_}")

# Evaluate on test set
best_rf = grid_search.best_estimator_
accuracy = best_rf.score(X_test, y_test)
print(f"Test Set Accuracy: {accuracy}")

Fitting 3 folds for each of 162 candidates, totalling 486 fits
Best Parameters: {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Best Cross-Validation Accuracy: 0.7858333333333333
Test Set Accuracy: 0.805
