<a href="https://colab.research.google.com/github/HussainBadreddeen/AutoML_Thesis/blob/main/hard_dataset_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing the required libraries and loading the dataset from Google Drive

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set()
# Drive mounting is left disabled here. The path below lives under
# /content/drive, so presumably Drive is already mounted in the session;
# uncomment the two lines below to mount it from code.
# from google.colab import drive
# drive.mount('/content/drive')


# Location of the raw review file.
file_path = "/content/drive/MyDrive/thesisdata/balanced-reviews.txt"

# Read the file as tab-separated, UTF-16 encoded text into the working DataFrame `df`
# (change the delimiter if the file layout differs).
df = pd.read_csv(file_path, sep="\t", encoding="utf-16")




## I follow the CRISP-DM process here.
The business-understanding phase is already covered: the goal of this project is to compare AutoML against manual hyperparameter tuning for Arabic sentiment analysis.

## **Data Understanding**

In [None]:
# Structural overview of the raw DataFrame. Every section is closed by a
# dashed rule followed by one blank line.
section_rule = "----------------------------"

# First few rows
print(df.head())
print(section_rule, end="\n\n")

# Number of rows and columns
print('Number of rows and columns in the data set:', df.shape)
print(section_rule, end="\n\n")

# Column dtypes and non-null counts (info() writes directly to stdout)
df.info()
print(section_rule, end="\n\n")

# Summary statistics for the numerical features
print(df.describe())
print(section_rule, end="\n\n")

# Number of missing values per column
print("Missing Values:")
print(df.isnull().sum())
print(section_rule, end="\n\n")

## Since we are trying to measure sentiment, we focus on the data related to reviews and their ratings

In [None]:
# Overview of the 'rating' column: non-null count, distinct values, then the
# count and share of each rating value, followed by a bar chart.
rule = "----------------------------"
total_ratings = df["rating"].count()  # non-null ratings; denominator of every percentage

print("Amount of Values in the column:", total_ratings)
print(rule)
unique_ratings = df['rating'].nunique()
print(f"Number of unique ratings: {unique_ratings}")
print("The unique values are: " ,df['rating'].unique())
# 1, 2, 4 and 5 are the unique values: (1, 2) are negative and (4, 5) are positive.
# 3 was removed in the balanced version of the dataset.
print(rule)

# One loop instead of four copy-pasted stanzas; the printed text is unchanged.
rating_names = {1: "one", 2: "two", 4: "four", 5: "five"}
rating_counts = {}
for rating_value, rating_name in rating_names.items():
    rating_counts[rating_value] = (df['rating'] == rating_value).sum()
    print(f"Count of 'rating {rating_name}':", rating_counts[rating_value], "                   Percentage of Total=", rating_counts[rating_value]/total_ratings*100,"%")
    print(rule)

# Bar chart of the number of reviews per rating
# (1-2) = negative
# (4-5) = positive
sns.countplot(x = 'rating', data = df)

## We group ratings (1, 2) and (4, 5) to show the total count of negative and positive sentiment, respectively

In [None]:
def _label_sentiment(rating):
    """Map a rating to 'negative' (rating below 4) or 'positive' (anything else)."""
    if rating < 4:
        return 'negative'
    return 'positive'


# Add a binary 'Sentiment' column to df (a new column, not a new DataFrame),
# so the two classes can be compared while the 'rating' column stays untouched.
df['Sentiment'] = df['rating'].map(_label_sentiment)

# Plot the total count of each sentiment
sns.countplot(x = 'Sentiment', data = df)

labelled_total = df["Sentiment"].count()  # non-null labels; denominator for both percentages

negative_sentiment = (df['Sentiment'] == 'negative').sum()
print("Count of 'negative sentiment':", negative_sentiment, "                   Percentage of Total=", negative_sentiment/labelled_total*100,"%")

positive_sentiment = (df['Sentiment'] == 'positive').sum()
print("Count of 'positive sentiment':", positive_sentiment, "                   Percentage of Total=", positive_sentiment/labelled_total*100,"%")

# The two printed shares show how balanced the classes are.

## Since we've covered the ratings, we move on to the reviews

In [None]:
!pip install arabic-reshaper
!pip install python-bidi

from collections import Counter
from arabic_reshaper import reshape
from bidi.algorithm import get_display

# First look at the free-text column: describe() statistics, then five random reviews.
review_column = df['review']
print(review_column.describe())  # count, unique, top, freq
print("\nSample Reviews:\n", review_column.sample(5))  # random sample, differs on every run



!pip install emoji
import re
import emoji

# Function to check if a review contains English letters
def contains_english(text):
    """Return True if `text` contains at least one ASCII Latin letter (a-z or A-Z).

    Non-string input (for example the float NaN pandas uses for a missing
    review) is reported as False instead of raising TypeError from re.search.
    """
    if not isinstance(text, str):
        return False
    return re.search(r'[a-zA-Z]', text) is not None

# Function to check if a review contains emojis
def contains_emoji(text):
    """Return True if `text` contains at least one emoji known to the `emoji` package.

    Uses emoji.emoji_count, which matches whole emoji sequences. A
    per-character lookup in emoji.EMOJI_DATA can miss emoji made of several
    code points (e.g. flags or keycaps) whose individual code points are not
    emoji on their own.

    Non-string input (e.g. NaN for a missing review) is reported as False.
    """
    if not isinstance(text, str):
        return False
    return emoji.emoji_count(text) > 0

# Flag every review for English letters, emojis, and the combination of both
df["has_english"] = df["review"].apply(contains_english)
df["has_emoji"] = df["review"].apply(contains_emoji)

# A review "has both" when both flags are set
df["has_both"] = df["has_english"] & df["has_emoji"]

num_english = df["has_english"].sum()
num_emoji = df["has_emoji"].sum()
num_both = df["has_both"].sum()

print(f"Reviews with English: {num_english}")
print(f"Reviews with Emojis: {num_emoji}")
print(f"Reviews with Both English & Emojis: {num_both}")


def _print_review_samples(title, mask, n=5):
    """Print `title`, then up to `n` randomly chosen reviews where `mask` is True.

    The sample size is capped at the number of matching rows: Series.sample
    raises ValueError when asked for more rows than exist, which would happen
    for any flag matched by fewer than `n` reviews.
    """
    matching_reviews = df.loc[mask, "review"]
    print(f"\n{title}")
    print(matching_reviews.sample(min(n, len(matching_reviews))).tolist())


_print_review_samples("Sample Reviews with English:", df["has_english"])
_print_review_samples("Sample Reviews with Emojis:", df["has_emoji"])
_print_review_samples("Sample Reviews with Both English & Emojis:", df["has_both"])





# Build one long space-separated string from every review (cast to str first)
all_reviews = ' '.join(df['review'].astype(str))

# Whitespace tokenisation, then frequency count of every token
words = all_reviews.split()
word_counts = Counter(words)

# The 30 most frequent tokens as (word, count) pairs
top_30_words = word_counts.most_common(30)

# Split the pairs into parallel tuples. Note that `words` now holds only the top words.
words, counts = zip(*top_30_words)

# Arabic needs letter reshaping and right-to-left reordering to render correctly in matplotlib
reshaped_words = list(map(reshape, words))
display_words = list(map(get_display, reshaped_words))

# Bar chart of the word frequencies
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(display_words, counts)
ax.set_xlabel("Words")
ax.set_ylabel("Frequency")
ax.set_title("Top 30 Most Frequent Words in Reviews")
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')  # tilt labels so they stay readable
fig.tight_layout()
plt.show()


# import seaborn as sns

# # Compute review lengths
# df['review_length'] = df['review'].astype(str).apply(len)

# # Plot distribution
# plt.figure(figsize=(10, 5))
# sns.histplot(df['review_length'], bins=30, kde=True)
# plt.xlabel("Review Length (Characters)")
# plt.ylabel("Frequency")
# plt.title("Distribution of Review Lengths")
# plt.show()



# # Compute review length in words and characters
# df['word_count'] = df['review'].astype(str).apply(lambda x: len(x.split()))
# df['char_count'] = df['review'].astype(str).apply(len)

# # Plot word count distribution
# plt.figure(figsize=(10, 5))
# sns.histplot(df['word_count'], bins=30, kde=True)
# plt.xlabel("Number of Words in Review")
# plt.ylabel("Frequency")
# plt.title("Distribution of Word Count in Reviews")
# plt.show()



Since we've inspected our two main attributes, rating and review, we move on to data preparation, where we drop the other columns and start preprocessing the reviews.

## **Data preparation**

In [None]:
# Independent two-column copy holding only the rating and the review text
df_copy = df.loc[:, ['rating', 'review']].copy()

# Display options so long reviews are easier to read
pd.set_option('display.max_colwidth', None)  # never truncate cell text
pd.set_option('display.width', 1000)  # wider console rendering

# Preview (last expression, so the notebook renders it as a table)
df_copy.head()

## We start by normalizing the Arabic text

Since English text represents only 2.56% of the data, we keep it as is, because TF-IDF and the AutoML tools can still recognise it without bias.

Emojis are converted to their Arabic descriptions because they carry sentiment.

In [None]:
#To normalize arabic text we need to removing diacritics (tashkeel), removing elongation of words (tatweel) converting variations of the same certain letter to a standard unified form
#couldnt use farasa so opted for camel tools

# !pip install farasa # cant normalise with it :( docs arent available and cant sign up
!pip install camel-tools
import re
from camel_tools.utils.normalize import normalize_alef_maksura_ar
from camel_tools.utils.normalize import normalize_alef_ar
from camel_tools.utils.normalize import normalize_teh_marbuta_ar
from camel_tools.utils.normalize import normalize_unicode
from camel_tools.utils.dediac import dediac_ar


# Characters removed as punctuation: Arabic and English marks plus newline and tab.
# The ASCII question mark (?) and the Arabic semicolon (؛) are listed next to
# their counterparts (؟ and ;), which were already present.
punctuation_pattern = re.compile(r"[-،؛؟?.!\"':;(){}“”‘’,&+\^\*\%@#/~=_\[\]<>|\\\n\t]")
quotes_pattern = re.compile(r'[\"\'“”‘’]')  # Matches only quotation marks (Arabic & English)


def remove_elongation(text):
    """Collapse stretched character runs such as "جميييل" -> "جميل".

    Rule 1: any non-digit character repeated 3 or more times in a row is
            reduced to a single occurrence (letters, punctuation, spaces, emoji).
    Rule 2: a doubled letter at the very end of the text is reduced to one.
            The `$` anchor matches only at the end of the whole string (or just
            before a final newline), so only the last word can be affected.
            NOTE(review): if every word ending was meant, the anchor must
            change; confirm before widening, since many regular Arabic words
            end in a doubled letter.

    Digits are never collapsed, so numbers survive intact ("1000" stays "1000").
    The digit class is Unicode-aware for str patterns, which also protects
    Arabic-Indic digits.
    """
    # Rule 1: 3+ repeats anywhere -> one occurrence (digits and newlines excluded)
    text = re.sub(r'([^\d\n])\1{2,}', r'\1', text)

    # Rule 2: doubled non-digit word character at the end of the text -> one occurrence
    text = re.sub(r'([^\W\d])\1$', r'\1', text)

    return text


# Maps each original text that contained an emoji to its converted form
emoji_conversion_log = {}

def convert_emojis_to_arabic(text):
    """Replace every emoji in `text` with its Arabic description.

    The description is emitted with a space on each side instead of the
    default ':' delimiters, so it can never be glued to a neighbouring word or
    to an adjacent emoji once the delimiters are gone ("جميل😍" used to become
    one fused token). Underscores inside the description become spaces.

    As before, every ':' in the text is removed and every '_' becomes a space,
    including ones that were not part of an emoji description. The result may
    contain extra spaces around a converted emoji; callers that need single
    spacing should collapse whitespace afterwards.

    Side effect: when at least one emoji was converted, the pair
    (original text -> converted text) is recorded in `emoji_conversion_log`.
    """
    converted_text = emoji.demojize(text, delimiters=(" ", " "), language='ar')  # 😍 → " وجه_بعيون_على_شكل_قلوب "
    converted_text_cleaned = converted_text.replace(":", "").replace("_", " ")  # " وجه بعيون على شكل قلوب "

    # Log changes only if an emoji was actually converted
    if text != converted_text:
        emoji_conversion_log[text] = converted_text_cleaned

    return converted_text_cleaned


def preprocess_text(text):
    """Normalise one review and return the cleaned string.

    Steps, in order: Unicode normalisation, elongation removal, Arabic-Indic
    digits to ASCII digits, emoji to Arabic description, quote removal,
    diacritic removal, letter normalisation (ى → ي, ة → ه, alef variants → ا),
    punctuation removal, whitespace collapsing.

    Punctuation (which includes newline and tab in `punctuation_pattern`) is
    replaced by a space rather than deleted, so words separated only by a mark
    or a line break stay separate ("جميل.لكن" → "جميل لكن"). The final
    whitespace collapse removes any doubled spaces this creates. Quotation
    marks are still deleted outright so the quoted text stays in place.
    """
    text = normalize_unicode(text)  # Step 1: Normalize Unicode
    text = remove_elongation(text)  # Step 2: Remove elongation (runs before emoji conversion, so repeated emoji collapse too)
    text = re.sub(r'[٠-٩]', lambda x: str(ord(x.group()) - ord('٠')), text)  # Step 3: Convert Arabic-Indic digits to ASCII digits
    text = convert_emojis_to_arabic(text)  # Step 4: Replace emojis with their Arabic description
    # Numbers are deliberately kept for now (disabled: text = re.sub(r'\d+', '', text))
    text = quotes_pattern.sub('', text)  # Step 5: Remove quotation marks (but keep text inside)
    text = dediac_ar(text)  # Step 6: Remove diacritics
    text = normalize_alef_maksura_ar(text)  # Step 7: Normalize ى → ي
    text = normalize_teh_marbuta_ar(text)  # Step 8: Normalize ة → ه
    text = normalize_alef_ar(text)  # Step 9: Normalize alef variants to 'ا'
    text = punctuation_pattern.sub(' ', text)  # Step 10: Replace punctuation with a space so neighbouring words do not fuse
    # Disabled experiment: separating a leading/trailing "و" from its word
    # text = re.sub(r'(?<!\w)و(?=\w)', r'و ', text)
    # text = re.sub(r'(?<=\w)و(?!\w)', r' و', text)
    text = re.sub(r'\s+', ' ', text).strip()  # Step 11: Collapse whitespace runs and trim the ends

    return text


print()
print("Before camel tools and manual normalization")
# The "before" sample is read from df, the frame that is normalised below.
# df_copy holds the same review text but is rebound later in the notebook,
# which would make this cell fail when it is re-run.
print(df['review'].head(20))
print("----------------------------")
print()

# Normalization applied to the entire DataFrame (cast to str so missing reviews do not raise)
df['normalized_review'] = df['review'].astype(str).apply(preprocess_text)

# Check whether any emoji survived normalization. This reuses contains_emoji
# from the exploration cell instead of redefining it, and writes to its own
# column so the pre-normalization 'has_emoji' / 'has_both' flags stay valid.
df["has_emoji_after_normalization"] = df["normalized_review"].apply(contains_emoji)

# Number of reviews that still contain an emoji
num_reviews_with_emojis = df["has_emoji_after_normalization"].sum()
print(f"\n Num of reviews containing emojis AFTER normalization: {num_reviews_with_emojis}")


# Sample of the logged emoji conversions (original text → converted text)
print("\n sample of Emoji Conversions (First 10)")
for original, converted in list(emoji_conversion_log.items())[:10]:
    print(f"{original} → {converted}")

print("After camel tools and manual normalization")
print(df['normalized_review'].head(20))
print("----------------------------")
print()

## After normalizing text we do Tokenization

In [None]:
from camel_tools.tokenizers.word import simple_word_tokenize

# Split each normalized review into a list of tokens with camel-tools' simple tokenizer
df['tokens'] = df['normalized_review'].map(simple_word_tokenize)

# Side-by-side preview of the text and its tokens to verify the tokenization
token_preview = df[['normalized_review', 'tokens']].head(20)
print(token_preview)


## **We then remove stop words**

In [None]:
# Mounting Google Drive again (disabled; presumably Drive is already mounted)
# from google.colab import drive
# drive.mount('/content/drive')

# Path to the stop-word list
stopwords_file_path = "/content/drive/MyDrive/thesisdata/updated_stopwords.txt"

# Load the first column of the file as a set for constant-time membership tests.
# NOTE(review): tokens were normalised (ى → ي, ة → ه, alef variants → ا) before
# this step; stop words only match if the file uses the same normalised
# spelling. Confirm against the file.
df_stopwords = set(pd.read_csv(stopwords_file_path, header=None, encoding="utf-8")[0].tolist())

# Keep only the tokens that are not stop words
df["filtered_tokens"] = df["tokens"].apply(lambda tokens: [word for word in tokens if word not in df_stopwords])


from collections import Counter

# Every occurrence of a removed token. A token was removed exactly when it is
# a stop word, so the set lookup gives the same result as comparing each
# original token against the filtered list, without the per-token list scan.
removed_words = [word for tokens in df["tokens"] for word in tokens if word in df_stopwords]

# Count removed words
removed_counts = Counter(removed_words)

# Print top 200 most removed words
print("Most removed stopwords:", removed_counts.most_common(200))


# Display results
print("Tokens before filtering stop words")
print(df["tokens"].head(10))
print("---------------------\n")

print("Tokens after filtering stop words")
print(df["filtered_tokens"].head(10))
print("----------------------\n")



## After tokenization and stop-word removal we should now have a cleaned dataset. We can now proceed with lemmatization to bring words back to their root form

In [None]:
# !pip install stanza
# import stanza
# stanza.download("ar")  # This downloads the Arabic NLP model


# #all needed cause stanza analyzes text by looking at the whole sentence to decide on the correct lemma so it group the sentence together decides then does tokenization again
# nlp = stanza.Pipeline(lang="ar", processors="tokenize,mwt,pos,lemma")

# #Lemmatize Tokenized Words
# def lemmatize_tokens(tokens):
#     text = " ".join(tokens)  # Convert list of tokens to a single text string to judge first
#     doc = nlp(text)
#     return [word.lemma for sent in doc.sentences for word in sent.words]

# #applying lemmatization
# df["lemmatized_tokens"] = df["filtered_tokens"].apply(lemmatize_tokens)

# #sample results
# print(df[["filtered_tokens", "lemmatized_tokens"]].head(10))

# #copy of lemmatized reviews
# df.to_csv("lemmatized_reviews.csv", encoding="utf-16", index=False)



## After lemmatization we can now turn the text into numerical features so that ML models can work with the words. Here we use **TF-IDF** vectorization

In [None]:
# print("before lemmatization")
# print(df['filtered_tokens'].head(30))
# print("----------------------------")
# print()


# print("after lemmatization")
# df['lemmatized_tokens'].head(30)

In [None]:
# import pandas as pd
# import random
# from collections import Counter
# from nltk.stem.isri import ISRIStemmer
# from nltk.stem.snowball import ArabicStemmer

# # ✅ Initialize Stemmers
# isri_stemmer = ISRIStemmer()
# snowball_stemmer = ArabicStemmer()

# # ✅ Apply Stemming to Each Token List
# df["isri_stemmed_tokens"] = df["filtered_tokens"].apply(lambda tokens: [isri_stemmer.stem(word) for word in tokens])
# df["snowball_stemmed_tokens"] = df["filtered_tokens"].apply(lambda tokens: [snowball_stemmer.stem(word) for word in tokens])

# # ✅ Function to Evaluate Stemming Effectiveness
# def evaluate_stemming(df, original_column, stemmed_column, stemmer_name):
#     original_tokens = df[original_column].explode().dropna()
#     stemmed_tokens = df[stemmed_column].explode().dropna()

#     # 📌 Count Unique Words Before & After
#     unique_original = set(original_tokens)
#     unique_stemmed = set(stemmed_tokens)

#     print(f"🔹 {stemmer_name} Stemmer Evaluation 🔹")
#     print(f"📌 Unique Words Before: {len(unique_original)}")
#     print(f"📌 Unique Words After Stemming: {len(unique_stemmed)}")

#     # 🔍 Count How Many Words Changed
#     changed_words = [o for o, s in zip(original_tokens, stemmed_tokens) if o != s]
#     print(f"🔍 Words That Changed: {len(set(changed_words))} / {len(unique_original)} ({(len(set(changed_words)) / len(unique_original)) * 100:.2f}%)")

#     # ⚠️ Check for Inconsistencies (same word, different stems)
#     stem_counts = Counter(stemmed_tokens)
#     inconsistent_words = {word for word, count in stem_counts.items() if count > 1}
#     print(f"⚠️ Inconsistently Stemmed Words: {len(inconsistent_words)}\n")

#     return changed_words, inconsistent_words

# # ✅ Run Evaluation for Both Stemmers
# changed_isri, inconsistent_isri = evaluate_stemming(df, "filtered_tokens", "isri_stemmed_tokens", "ISRI")
# changed_snowball, inconsistent_snowball = evaluate_stemming(df, "filtered_tokens", "snowball_stemmed_tokens", "Snowball")

# # ✅ Show Random 5 Samples Before & After Stemming
# samples = random.sample(range(len(df)), 5)
# print("🔍 Sample Stemming Comparison 🔍\n")
# for i in samples:
#     print(f"🔵 Original: {df['filtered_tokens'].iloc[i]}")
#     print(f"🟢 ISRI Stemmed: {df['isri_stemmed_tokens'].iloc[i]}")
#     print(f"🟣 Snowball Stemmed: {df['snowball_stemmed_tokens'].iloc[i]}")
#     print("-" * 100)


In [None]:
# Join each review's filtered tokens back into one space-separated string
df["filtered_text"] = df["filtered_tokens"].apply(" ".join)

# Display a sample
print(df[["filtered_tokens", "filtered_text"]].head(5))

# Apply TF-IDF vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# TF-IDF vectorizer limited to the 15000 highest-ranked terms
tfidf_vectorizer = TfidfVectorizer(max_features=15000)

# Fit on, and transform, every review in df.
# NOTE(review): the vectorizer is fitted on all rows. If a train/test split is
# made afterwards, the IDF weights also reflect the test rows; confirm whether
# that is acceptable for the comparison.
X_tfidf = tfidf_vectorizer.fit_transform(df["filtered_text"])

# Convert TF-IDF to a dense DataFrame.
# NOTE(review): toarray() materialises rows x 15000 floats in memory, and the
# CSV written below is correspondingly large.
X_tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Display shape of TF-IDF matrix
print(f"✅ TF-IDF Matrix Shape: {X_tfidf.shape}")
print(X_tfidf_df.head(5))

# Ratings under their own name: rebinding df_copy here would drop its 'review'
# column and break the normalisation cell on a re-run. The index is reset so
# the positional TF-IDF rows line up with the ratings even if df's index is
# not 0..n-1.
ratings_only = df[['rating']].reset_index(drop=True)
assert len(ratings_only) == len(X_tfidf_df), "ratings and TF-IDF rows must match one-to-one"

# Merge TF-IDF features with ratings (column-wise)
df_tfidf_final = pd.concat([ratings_only, X_tfidf_df], axis=1)

# Check the merged dataset
print(df_tfidf_final.head(5))
print(f"✅ Final dataset shape: {df_tfidf_final.shape}")

# Save TF-IDF vectorizer and features
import pickle

# Save the fitted TF-IDF vectorizer for future use
with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(tfidf_vectorizer, f)

# Save transformed TF-IDF features + ratings
df_tfidf_final.to_csv("tfidf_with_ratings.csv", index=False, encoding="utf-16")

print("✅ TF-IDF features with ratings saved successfully!")


## We now have our reviews in numerical form, ready for modelling

In [None]:
# ✅ Ensure required libraries are installed
!pip install tpot scikit-learn pandas numpy

# ✅ Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier
import pickle

# ✅ Load TF-IDF dataset (ensures it's available if running separately)
try:
    df_tfidf_final = pd.read_csv("tfidf_with_ratings.csv", encoding="utf-16")
    print("✅ TF-IDF dataset loaded successfully!")
except FileNotFoundError:
    print("❌ ERROR: TF-IDF dataset not found. Ensure Part 1 has been run.")

# ✅ Check dataset structure
print(df_tfidf_final.head())
print(f"Dataset Shape: {df_tfidf_final.shape}")

# ✅ Separate features (TF-IDF vectors) and target (rating)
X = df_tfidf_final.drop(columns=["rating"])  # Features
y = df_tfidf_final["rating"]  # Target labels

# ✅ Convert rating into a **binary sentiment** (Positive = 1, Negative = 0)
y = y.apply(lambda x: 1 if x >= 4 else 0)  # Change threshold if needed

# ✅ Split data into training & testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training Size: {X_train.shape}, Testing Size: {X_test.shape}")

# ✅ Initialize TPOT Classifier
tpot = TPOTClassifier(
    generations=10,  # Increase for better optimization (Default: 5)
    population_size=50,  # More models per generation (Default: 20)
    verbosity=2,  # Show progress details
    n_jobs=-1,  # Use all available CPU cores
    random_state=42
)

# ✅ Train TPOT to find the best model
tpot.fit(X_train, y_train)

# ✅ Evaluate TPOT on test data
accuracy = tpot.score(X_test, y_test)
print(f"✅ TPOT Best Model Accuracy: {accuracy:.4f}")

# ✅ Export the best model as a Python script
tpot.export("best_tpot_pipeline.py")

# ✅ Save the trained best model for later use
with open("tpot_best_model.pkl", "wb") as f:
    pickle.dump(tpot.fitted_pipeline_, f)

print("✅ TPOT best model saved successfully!")

# ===========================
# 🔍 **Load & Use the Model**
# ===========================

# # Load the trained TPOT model
# with open("tpot_best_model.pkl", "rb") as f:
#     loaded_model = pickle.load(f)

# # Predict on test data
# predictions = loaded_model.predict(X_test)
# print(predictions[:10])  # Show sample predictions
