# Preprocessing

In [147]:
# # You may need to run those in your enviroment terminal.

# pip install numpy==1.24.3 pandas==1.5.3
# pip install spacy
# pip install "thinc<8.3.6"

In [148]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import spacy
import nltk
from nltk.corpus import wordnet
import random
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

nltk.download('wordnet')
nltk.download('omw-1.4')

# Load spaCy model
nlp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Rahma\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Rahma\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Creating data frame of the data and assigning them labels

In [149]:
texts = []
labels = []

base_path = Path('review_polarity/txt_sentoken')
pos_path = base_path / 'pos'
neg_path = base_path / 'neg'

# Assign label 1
if pos_path.exists():
    for file in pos_path.glob('*.txt'):
        with open(file, 'r', encoding = 'utf-8') as f:
            texts.append(f.read())
            labels.append(1)

# Assign label 0
if neg_path.exists():
    for file in neg_path.glob('*.txt'):
        with open(file, 'r', encoding = 'utf-8') as f:
            texts.append(f.read())
            labels.append(0)

df = pd.DataFrame({
    'label': labels,
    'text': texts
})

df.tail()

Unnamed: 0,label,text
1995,0,"if anything , "" stigmata "" should be taken as ..."
1996,0,"john boorman's "" zardoz "" is a goofy cinematic..."
1997,0,the kids in the hall are an acquired taste . \...
1998,0,there was a time when john carpenter was a gre...
1999,0,two party guys bob their heads to haddaway's d...


In [150]:
df[df.duplicated()]

Unnamed: 0,label,text


No duplicates were found

In [151]:
def spacy_pos_to_wordnet_pos(spacy_pos):
    if spacy_pos.startswith('N'):
        return wordnet.NOUN
    elif spacy_pos.startswith('V'):
        return wordnet.VERB
    elif spacy_pos.startswith('J'):
        return wordnet.ADJ
    elif spacy_pos.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # Default fallback

In [152]:
def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())
    synonyms.discard(word)      # Remove the original word to avoid replacement with itself
    return list(synonyms)

In [153]:
def augment_text_with_synonyms(tokens, pos_tags, synonym_probability = 0.2):
    augmented_tokens = []
    
    for token, pos_tag in zip(tokens, pos_tags):
        if random.random() < synonym_probability:
            if pos_tag in ['n', 'v', 'a']:   #nouns adjectives and verbs 
                synonyms = get_synonyms(token)
                if synonyms:
                    new_word = random.choice(synonyms)
                    augmented_tokens.append(new_word)
                    continue  
        augmented_tokens.append(token)         # Add the original word if no augmentation is done
    
    return augmented_tokens

In [154]:
def apply_augmentation_to_dataset(word_tokens, pos_tags, texts, labels, sentence_tokens, synonym_probability = 0.2):
    all_word_tokens = word_tokens.copy()
    all_texts = texts.copy()  # Original text is preserved
    all_labels = labels.copy()
    all_sentence_tokens = sentence_tokens.copy()
    all_pos_tags = pos_tags.copy()

    for tokens, pos, label, sentence, text in zip(word_tokens, pos_tags, labels, sentence_tokens, texts):
        augmented_tokens = augment_text_with_synonyms(tokens, pos, synonym_probability)

        all_word_tokens.append(augmented_tokens)
        all_texts.append(text)  # Keep the original text
        all_labels.append(label)
        all_pos_tags.append(pos)
        all_sentence_tokens.append(sentence)  # Sentence tokens are not augmented

    return all_word_tokens, all_pos_tags, all_texts, all_labels, all_sentence_tokens

In [155]:
def apply_lemmatization(tokens):
    return [token.lemma_ for token in nlp(' '.join(tokens))]

In [156]:
def apply_stemming(text):
    
    words = text.split()
    return ' '.join([PorterStemmer().stem(word) for word in words])

In [157]:
def visualize_tf_idf_heatmap(tfidf_matrix, feature_names, n_top_features = 20, n_top_docs = 10):
    # Get the top features by summing TF-IDF scores across documents
    tfidf_array = tfidf_matrix.toarray()
    feature_importance = np.sum(tfidf_array, axis = 0)
    top_feature_indices = np.argsort(feature_importance)[-n_top_features:][::-1]
    top_features = [feature_names[i] for i in top_feature_indices]
    
    # Get the top documents by summing TF-IDF scores across features
    doc_importance = np.sum(tfidf_array, axis=1)
    top_doc_indices = np.argsort(doc_importance)[-n_top_docs:][::-1]
    
    # Extract the submatrix for visualization
    sub_matrix = tfidf_array[np.ix_(top_doc_indices, top_feature_indices)]
    
    plt.figure(figsize = (12, 8))
    sns.heatmap(
        sub_matrix,
        annot = True,          # Show values in cells
        fmt = '.3f',           # Format with 3 decimal places
        cmap = 'YlGnBu',       # Better colormap
        xticklabels = top_features,
        yticklabels = range(n_top_docs)
    )
    plt.title(f'TF-IDF Heatmap (Top {n_top_features} Features, First {n_top_docs} Documents)')
    plt.xlabel('Words')
    plt.ylabel('Documents')
    plt.xticks(rotation = 45, ha = 'right')
    plt.tight_layout()
    plt.savefig('tfidf_heatmap.png')
    plt.close()
    
    # Bar chart of top features across the corpus
    plt.figure(figsize=(12, 6))
    top_features_scores = [feature_importance[i] for i in top_feature_indices]
    plt.bar(top_features, top_features_scores)
    plt.title('Top TF-IDF Features Across All Documents')
    plt.xlabel('Features')
    plt.ylabel('Sum of TF-IDF Scores')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig('tfidf_top_features.png')
    plt.close()

In [158]:
texts = df['text'].tolist()
labels = df['label'].tolist()
word_tokens = []
sentence_tokens = []
pos_tags = []

for text in texts:
    doc = nlp(text)
    sentence_tokens.append([sent.text.strip() for sent in doc.sents])
    tokens = [token for token in doc if not token.is_punct and not token.is_stop and not token.is_space]
    word_tokens.append([token.text.lower() for token in tokens])
    pos_tags.append([spacy_pos_to_wordnet_pos(token.tag_) for token in tokens])

lemmatized_tokens = [apply_lemmatization(t) for t in word_tokens]

word_tokens, pos_tags, texts, labels, sentence_tokens = apply_augmentation_to_dataset(
    lemmatized_tokens, pos_tags, texts, labels, sentence_tokens
)

joined_texts = [' '.join(tokens) for tokens in word_tokens]

augmented_df = pd.DataFrame({
    'label': labels,
    'text': joined_texts,
    'word_tokens': word_tokens
})

augmented_df = augmented_df.sample(frac = 1, random_state = 42).reset_index(drop = True)

train_texts, test_texts, train_labels, test_labels = train_test_split(
    augmented_df['text'], augmented_df['label'], test_size = 0.2, random_state = 42
)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer()
vectorizer.fit(train_texts)
xtrain_tfidf = vectorizer.transform(train_texts)
xtest_tfidf = vectorizer.transform(test_texts)

tfidf_matrix = vectorizer.fit_transform(joined_texts)
feature_names = vectorizer.get_feature_names_out()

print(f"\nTF-IDF Matrix Shape: {tfidf_matrix.shape}")
print(f"Number of features: {len(feature_names)}")
visualize_tf_idf_heatmap(tfidf_matrix, feature_names)


TF-IDF Matrix Shape: (4000, 42977)
Number of features: 42977


In [159]:
print("augmented_df.head():")
display(augmented_df.head())

print("\ndf.head():")
display(df.head())

augmented_df.head():


Unnamed: 0,label,text,word_tokens
0,1,harmless silly fun comedy dim witted wrestling...,"[harmless, silly, fun, comedy, dim, witted, wr..."
1,0,point movie shit opera go completely wrong mem...,"[point, movie, shit, opera, go, completely, wr..."
2,1,sick life death bob flanagan supermasochist fe...,"[sick, life, death, bob, flanagan, supermasoch..."
3,0,everybody film believe alicia documentary see ...,"[everybody, film, believe, alicia, documentary..."
4,1,lisa cholodenko high art intelligent quiet pla...,"[lisa, cholodenko, high, art, intelligent, qui..."



df.head():


Unnamed: 0,label,text
0,1,films adapted from comic books have had plenty...
1,1,every now and then a movie comes along from a ...
2,1,you've got mail works alot better than it dese...
3,1,""" jaws "" is a rare film that grabs your atten..."
4,1,moviemaking is a lot like being the general ma...


In [160]:
print("augmented_df.tail():")
display(augmented_df.tail())

print("\ndf.tail():")
display(df.tail())

augmented_df.tail():


Unnamed: 0,label,text,word_tokens
3995,0,understand clich hell earth truly mean recentl...,"[understand, clich, hell, earth, truly, mean, ..."
3996,0,1954 japanese monster film godzilla transform ...,"[1954, japanese, monster, film, godzilla, tran..."
3997,1,verdict spine chill drama horror maestro steph...,"[verdict, spine, chill, drama, horror, maestro..."
3998,0,midway anaconda documentary filmmaker terri fl...,"[midway, anaconda, documentary, filmmaker, ter..."
3999,0,starship trooper bad movie mean bad movie cros...,"[starship, trooper, bad, movie, mean, bad, mov..."



df.tail():


Unnamed: 0,label,text
1995,0,"if anything , "" stigmata "" should be taken as ..."
1996,0,"john boorman's "" zardoz "" is a goofy cinematic..."
1997,0,the kids in the hall are an acquired taste . \...
1998,0,there was a time when john carpenter was a gre...
1999,0,two party guys bob their heads to haddaway's d...


In [161]:
print("\ndf dimentions:")
display(df.shape)

print("\naugmented_df dimentions:")
display(augmented_df.shape)


df dimentions:


(2000, 2)


augmented_df dimentions:


(4000, 3)

# Modelling - Classifiers

####        •    ML -> Logistic Regression, Naive Bayes, SVM, Decision tree, and Random Forest.

In [177]:
def train_model(classifier, feature_vector_train, train_labels, feature_vector_test, test_labels, is_neural_net=False):
    classifier.fit(feature_vector_train, train_labels)
    
    train_predictions = classifier.predict(feature_vector_train)
    test_predictions = classifier.predict(feature_vector_test)
    train_accuracy = metrics.accuracy_score(train_labels, train_predictions)
    test_accuracy = metrics.accuracy_score(test_labels, test_predictions)
    
    print(f"Train Accuracy: {train_accuracy * 100:.2f}%")
    print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

#### 1. Logistic Regression

In [181]:
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_labels, xtest_tfidf, test_labels)

Train Accuracy: 98.19%
Test Accuracy: 92.62%


#### 2. Naive Bayes

In [182]:
accuracy = train_model(naive_bayes.MultinomialNB(alpha = 0.0001), xtrain_tfidf, train_labels, xtest_tfidf,test_labels)

Train Accuracy: 99.88%
Test Accuracy: 93.38%


#### 3. SVM

In [183]:
accuracy = train_model(svm.SVC(kernel = 'linear', C = 1.0), xtrain_tfidf, train_labels, xtest_tfidf, test_labels)

Train Accuracy: 99.59%
Test Accuracy: 95.62%


In [184]:
accuracy = train_model(svm.SVC(kernel = 'rbf', C = 1.0), xtrain_tfidf, train_labels, xtest_tfidf, test_labels)

Train Accuracy: 99.94%
Test Accuracy: 96.62%


In [185]:
accuracy = train_model(svm.SVC(kernel = 'poly', C = 1.0, degree = 3), xtrain_tfidf, train_labels, xtest_tfidf, test_labels)

Train Accuracy: 100.00%
Test Accuracy: 93.00%


#### 4. Decision Tree

In [186]:
accuracy = train_model(DecisionTreeClassifier(), xtrain_tfidf, train_labels, xtest_tfidf, test_labels)

Train Accuracy: 100.00%
Test Accuracy: 79.38%


In [187]:
accuracy = train_model(RandomForestClassifier(), xtrain_tfidf, train_labels, xtest_tfidf, test_labels)

Train Accuracy: 100.00%
Test Accuracy: 94.12%


#### • DL -> BERT