# **LOADING DATASET**

In [1]:
import pandas as pd

# Load the dataset
file_path = 'Arabic_Reviews.tsv'
df = pd.read_csv(file_path, sep='\t')

# Display the first few rows of the dataset and summary information
print(df.head())
print(df.info())


      label                                               text  processed_text
0  Positive  ممتاز نوعا ما . النظافة والموقع والتجهيز والشا...             NaN
1  Positive  أحد أسباب نجاح الإمارات أن كل شخص في هذه الدول...             NaN
2  Positive  هادفة .. وقوية. تنقلك من صخب شوارع القاهرة الى...             NaN
3  Positive  خلصنا .. مبدئيا اللي مستني ابهار زي الفيل الاز...             NaN
4  Positive  ياسات جلوريا جزء لا يتجزأ من دبي . فندق متكامل...             NaN
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 519 entries, 0 to 518
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   label           519 non-null    object 
 1   text            519 non-null    object 
 2   processed_text  0 non-null      float64
dtypes: float64(1), object(2)
memory usage: 12.3+ KB
None


# **PREPROCESSING PHASE**

In [2]:
import re

# Define a basic list of Arabic stopwords manually
basic_arabic_stopwords = {
    'و', 'في', 'من', 'إلى', 'على', 'مع', 'عن', 'هذا', 'هذه', 'هو', 'هي',
    'ذلك', 'لكن', 'أن', 'إن', 'ما', 'كان', 'كنت', 'كل', 'لم', 'لن', 'لا',
    'إذ', 'إذا', 'أيضا', 'فقط', 'بعض', 'أي', 'أو', 'ولا', 'نحن', 'أنا',
    'كما', 'حين', 'حيث', 'هنا', 'هناك', 'لذلك', 'لأن', 'بعد', 'قبل', 'عند'
}

# Function to preprocess the text
def preprocess_text(text):
    # Remove punctuations, digits, and special characters
    text = re.sub(r'[\d\W]+', ' ', text)

    # Tokenize the text (split by whitespace)
    tokens = text.split()

    # Normalize text (Arabic specific)
    tokens = [re.sub(r'[إأآا]', 'ا', token) for token in tokens]
    tokens = [re.sub(r'ى', 'ي', token) for token in tokens]
    tokens = [re.sub(r'ؤ', 'ء', token) for token in tokens]
    tokens = [re.sub(r'ئ', 'ء', token) for token in tokens]
    tokens = [re.sub(r'ة', 'ه', token) for token in tokens]

    # Remove stop words
    tokens = [token for token in tokens if token not in basic_arabic_stopwords]

    return ' '.join(tokens)

# Apply preprocessing to the text column
df['processed_text'] = df['text'].apply(preprocess_text)

# Display the original and processed text for comparison
print(df[['text', 'processed_text']].head())


                                                text  \
0  ممتاز نوعا ما . النظافة والموقع والتجهيز والشا...   
1  أحد أسباب نجاح الإمارات أن كل شخص في هذه الدول...   
2  هادفة .. وقوية. تنقلك من صخب شوارع القاهرة الى...   
3  خلصنا .. مبدئيا اللي مستني ابهار زي الفيل الاز...   
4  ياسات جلوريا جزء لا يتجزأ من دبي . فندق متكامل...   

                                      processed_text  
0  ممتاز نوعا النظافه والموقع والتجهيز والشاطيء ا...  
1  احد اسباب نجاح الامارات ان شخص الدوله يعشق ترا...  
2  هادفه وقويه تنقلك صخب شوارع القاهره الي هدوء ج...  
3  خلصنا مبدءيا اللي مستني ابهار زي الفيل الازرق ...  
4  ياسات جلوريا جزء يتجزا دبي فندق متكامل الخدمات...  


# **APPLYING THE LEXICAL BASED APPROACH**

In [3]:
# Define a simple Arabic sentiment lexicon
positive_words = {'جيد', 'رائع', 'ممتاز', 'مدهش', 'جميل'}
negative_words = {'سيء', 'فظيع', 'ممل', 'سيئة', 'مزعج'}

# Function to calculate sentiment score based on the lexicon
def calculate_lexical_sentiment(text):
    # Tokenize the text
    tokens = text.split()

    # Calculate positive and negative scores
    positive_score = sum(1 for token in tokens if token in positive_words)
    negative_score = sum(1 for token in tokens if token in negative_words)

    # Determine sentiment
    if positive_score > negative_score:
        return 'Positive'
    elif negative_score > positive_score:
        return 'Negative'
    else:
        return 'Neutral'

# Apply the lexical sentiment calculation to the processed text
df['lexical_sentiment'] = df['processed_text'].apply(calculate_lexical_sentiment)

# Display the original, processed text and the lexical sentiment for comparison
print(df[['text', 'processed_text', 'lexical_sentiment']].head())


                                                text  \
0  ممتاز نوعا ما . النظافة والموقع والتجهيز والشا...   
1  أحد أسباب نجاح الإمارات أن كل شخص في هذه الدول...   
2  هادفة .. وقوية. تنقلك من صخب شوارع القاهرة الى...   
3  خلصنا .. مبدئيا اللي مستني ابهار زي الفيل الاز...   
4  ياسات جلوريا جزء لا يتجزأ من دبي . فندق متكامل...   

                                      processed_text lexical_sentiment  
0  ممتاز نوعا النظافه والموقع والتجهيز والشاطيء ا...          Positive  
1  احد اسباب نجاح الامارات ان شخص الدوله يعشق ترا...           Neutral  
2  هادفه وقويه تنقلك صخب شوارع القاهره الي هدوء ج...           Neutral  
3  خلصنا مبدءيا اللي مستني ابهار زي الفيل الازرق ...           Neutral  
4  ياسات جلوريا جزء يتجزا دبي فندق متكامل الخدمات...           Neutral  


# **LABEL CONVERSION FROM STRING TO NUMERIC**

In [4]:
# Mapping from string labels to numeric labels
label_mapping = {'Positive': 1, 'Negative': 0}
df['numeric_label'] = df['label'].map(label_mapping)

# Display the first few rows with numeric labels
print(df[['label', 'numeric_label']].head())

      label  numeric_label
0  Positive            1.0
1  Positive            1.0
2  Positive            1.0
3  Positive            1.0
4  Positive            1.0


# **APPLYING THE NAIVE BASED MODEL**

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the processed text
X_tfidf = vectorizer.fit_transform(df['processed_text'])

# Convert labels to binary (e.g., Positive: 1, Negative: 0)
df['label_binary'] = df['label'].apply(lambda x: 1 if x == 'Positive' else 0)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, df['label_binary'], test_size=0.2, random_state=42)

# Initialize and train the Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = nb_classifier.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.625
Classification Report:
               precision    recall  f1-score   support

           0       0.62      1.00      0.76        63
           1       1.00      0.05      0.09        41

    accuracy                           0.62       104
   macro avg       0.81      0.52      0.43       104
weighted avg       0.77      0.62      0.50       104



# **APPLYING THE HYBRID LEXICAL-NAIVE BASED APPROACH**

In [6]:
# Function to combine lexical and Naive Bayes predictions
def combined_sentiment(lexical, nb_prediction):
    # Lexical sentiment (Positive, Negative, Neutral)
    if lexical == 'Positive':
        lexical_score = 1
    elif lexical == 'Negative':
        lexical_score = -1
    else:
        lexical_score = 0

    # Naive Bayes sentiment (predicted as 1 or 0)
    nb_score = nb_prediction * 2 - 1  # Convert 0 to -1 and 1 to 1

    # Combine scores (you can choose a different combination method)
    combined_score = (lexical_score + nb_score) / 2

    # Determine combined sentiment
    if combined_score > 0:
        return 'Positive'
    elif combined_score < 0:
        return 'Negative'
    else:
        return 'Neutral'

# Apply the combined sentiment function
df['combined_sentiment'] = df.apply(lambda row: combined_sentiment(row['lexical_sentiment'], nb_classifier.predict(vectorizer.transform([row['processed_text']]))[0]), axis=1)

# Display the combined sentiment results
print(df[['text', 'processed_text', 'lexical_sentiment', 'combined_sentiment']].head())


                                                text  \
0  ممتاز نوعا ما . النظافة والموقع والتجهيز والشا...   
1  أحد أسباب نجاح الإمارات أن كل شخص في هذه الدول...   
2  هادفة .. وقوية. تنقلك من صخب شوارع القاهرة الى...   
3  خلصنا .. مبدئيا اللي مستني ابهار زي الفيل الاز...   
4  ياسات جلوريا جزء لا يتجزأ من دبي . فندق متكامل...   

                                      processed_text lexical_sentiment  \
0  ممتاز نوعا النظافه والموقع والتجهيز والشاطيء ا...          Positive   
1  احد اسباب نجاح الامارات ان شخص الدوله يعشق ترا...           Neutral   
2  هادفه وقويه تنقلك صخب شوارع القاهره الي هدوء ج...           Neutral   
3  خلصنا مبدءيا اللي مستني ابهار زي الفيل الازرق ...           Neutral   
4  ياسات جلوريا جزء يتجزا دبي فندق متكامل الخدمات...           Neutral   

  combined_sentiment  
0            Neutral  
1           Negative  
2           Negative  
3           Negative  
4           Negative  


# **CALCULATING ACCURACIES AND COMPARISON**

In [14]:
from sklearn.metrics import accuracy_score, classification_report

# Convert the 'label' column to binary for comparison
df['label_binary'] = df['label'].apply(lambda x: 1 if x == 'Positive' else 0)

# Map lexical sentiment to binary for accuracy comparison
def lexical_to_binary(sentiment):
    if sentiment == 'Positive':
        return 1
    elif sentiment == 'Negative':
        return 0
    else:
        return None  # Neutral or undefined sentiments are not considered here

df['lexical_binary'] = df['lexical_sentiment'].apply(lexical_to_binary)

# Filter out rows where lexical sentiment is Neutral (None)
lexical_comparison_df = df.dropna(subset=['lexical_binary'])

# Calculate accuracy for the lexical approach
lexical_accuracy = accuracy_score(lexical_comparison_df['label_binary'], lexical_comparison_df['lexical_binary'])
print(f"Lexical Approach Accuracy: {lexical_accuracy:.2f}")

# Naive Bayes Classifier accuracy is already computed during model evaluation
nb_accuracy = accuracy_score(y_test, y_pred)
print(f"Naive Bayes Classifier Accuracy: {nb_accuracy:.2f}")

# Map combined sentiment to binary for accuracy comparison
def combined_to_binary(sentiment):
    if sentiment == 'Positive':
        return 1
    elif sentiment == 'Negative':
        return 0
    else:
        return None  # Neutral or undefined sentiments are not considered here

df['combined_binary'] = df['combined_sentiment'].apply(combined_to_binary)

# Filter out rows where combined sentiment is Neutral (None)
combined_comparison_df = df.dropna(subset=['combined_binary'])

# Calculate accuracy for the combined approach
combined_accuracy = accuracy_score(combined_comparison_df['label_binary'], combined_comparison_df['combined_binary'])
print(f"Combined Approach Accuracy: {combined_accuracy:.2f}")

# Display all accuracies together for comparison
print("\nAccuracy Comparison:")
print(f"Lexical Approach Accuracy: {lexical_accuracy:.2f}")
print(f"Naive Bayes Classifier Accuracy: {nb_accuracy:.2f}")
print(f"Combined Approach Accuracy: {combined_accuracy:.2f}")


Lexical Approach Accuracy: 0.62
Naive Bayes Classifier Accuracy: 0.62
Combined Approach Accuracy: 0.80

Accuracy Comparison:
Lexical Approach Accuracy: 0.62
Naive Bayes Classifier Accuracy: 0.62
Combined Approach Accuracy: 0.80
