In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

In [2]:
df = pd.read_csv('/content/tweets.csv')
print(df.head())

                                             Column1  Column2
0  بعد استقالة رئيس #المحكمة_الدستورية ننتظر استق...      OBJ
1  أهنئ الدكتور أحمد جمال الدين، القيادي بحزب مصر...      POS
2  البرادعي يستقوى بامريكا مرةاخرى و يرسل عصام ال...      NEG
3  #الحرية_والعدالة | شاهد الآن: #ليلة_الاتحادية ...      OBJ
4  الوالدة لو اقولها بخاطري حشيشة تضحك بس من اقول...  NEUTRAL


In [3]:
def preprocess_tweet(tweet):
    tweet = re.sub(r'http\S+', '', tweet)  # Remove URLs
    tweet = re.sub(r'@\w+', '', tweet)  # Remove mentions
    tweet = re.sub(r'#\w+', '', tweet)  # Remove hashtags
    tweet = re.sub(r'\d+', '', tweet)  # Remove digits
    tweet = re.sub(r'\W+', ' ', tweet)  # Remove non-word characters
    return tweet

df['cleaned_tweet'] = df['Column1'].apply(preprocess_tweet)
print(df.head())

                                             Column1  Column2  \
0  بعد استقالة رئيس #المحكمة_الدستورية ننتظر استق...      OBJ   
1  أهنئ الدكتور أحمد جمال الدين، القيادي بحزب مصر...      POS   
2  البرادعي يستقوى بامريكا مرةاخرى و يرسل عصام ال...      NEG   
3  #الحرية_والعدالة | شاهد الآن: #ليلة_الاتحادية ...      OBJ   
4  الوالدة لو اقولها بخاطري حشيشة تضحك بس من اقول...  NEUTRAL   

                                       cleaned_tweet  
0                    بعد استقالة رئيس ننتظر استقالة   
1  أهنئ الدكتور أحمد جمال الدين القيادي بحزب مصر ...  
2  البرادعي يستقوى بامريكا مرةاخرى و يرسل عصام ال...  
3   شاهد الآن أول فيلم استقصائي يتناول أسرار و كو...  
4  الوالدة لو اقولها بخاطري حشيشة تضحك بس من اقول...  


In [5]:
X_train, X_test, y_train, y_test = train_test_split(df['cleaned_tweet'], df['Column2'], test_size=0.2, random_state=42)

In [6]:
# Count Vectorizer
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

##ML-BASED

In [8]:
# Multinomial Naive Bayes Classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_vectorized, y_train)
y_pred = nb_classifier.predict(X_test_vectorized)

In [9]:
# Results
print("Classification Report (ML-based):")
print(classification_report(y_test, y_pred))
print("Accuracy Score (ML-based):")
print(accuracy_score(y_test, y_pred))

Classification Report (ML-based):
              precision    recall  f1-score   support

         NEG       0.51      0.19      0.28       333
     NEUTRAL       0.35      0.04      0.06       170
         OBJ       0.69      0.96      0.80      1276
         POS       0.27      0.02      0.04       160

    accuracy                           0.67      1939
   macro avg       0.46      0.30      0.29      1939
weighted avg       0.59      0.67      0.58      1939

Accuracy Score (ML-based):
0.6689014956162971


In [10]:
# Count Vectorizer with N-grams
vectorizer = CountVectorizer(ngram_range=(1, 4))
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)
nb_classifier.fit(X_train_vectorized, y_train)
y_pred = nb_classifier.predict(X_test_vectorized)

print("Classification Report (ML-based with n-grams):")
print(classification_report(y_test, y_pred))
print("Accuracy Score (ML-based with n-grams):")
print(accuracy_score(y_test, y_pred))

Classification Report (ML-based with n-grams):
              precision    recall  f1-score   support

         NEG       0.63      0.11      0.19       333
     NEUTRAL       0.33      0.02      0.03       170
         OBJ       0.67      0.98      0.80      1276
         POS       0.38      0.02      0.04       160

    accuracy                           0.67      1939
   macro avg       0.50      0.28      0.26      1939
weighted avg       0.61      0.67      0.56      1939

Accuracy Score (ML-based with n-grams):
0.669417225373904


In [12]:
# TF-IDF Vectorizer with Trigrams
vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=5000)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)
nb_classifier = MultinomialNB(alpha=0.1)
nb_classifier.fit(X_train_vectorized, y_train)
y_pred = nb_classifier.predict(X_test_vectorized)

print("Classification Report (ML-based with TF-IDF and trigrams):")
print(classification_report(y_test, y_pred))
print("Accuracy Score (ML-based with TF-IDF and trigrams):")
print(accuracy_score(y_test, y_pred))

Classification Report (ML-based with TF-IDF and trigrams):
              precision    recall  f1-score   support

         NEG       0.46      0.25      0.32       333
     NEUTRAL       0.32      0.05      0.09       170
         OBJ       0.70      0.94      0.80      1276
         POS       0.43      0.06      0.11       160

    accuracy                           0.67      1939
   macro avg       0.48      0.33      0.33      1939
weighted avg       0.61      0.67      0.60      1939

Accuracy Score (ML-based with TF-IDF and trigrams):
0.6719958741619392


#Rule-Based#

In [15]:
def rule_based_sentiment_analysis(tweet):
  positive_keywords = [
    'طيب', 'سائر', 'فوز', 'طويل', 'حاضر', 'جيد', 'ممتاز', 'رائع', 'محبوب', 'مرحب',
    'مسرور', 'سعيد', 'مبهج', 'ناجح', 'مذهل', 'مثير', 'فخور', 'ملهم', 'جذاب', 'منعش'
]
  negative_keywords = [
    'كريه', 'غائب', 'قصير', 'خسارة', 'بضع', 'سيء', 'مؤلم', 'مزعج', 'خطير', 'ضار',
    'محبط', 'بغيض', 'مفجع', 'مرهق', 'مخيف', 'نكد', 'سخيف', 'فاشل', 'ثقيل', 'غاضب'
]

  tweet = preprocess_tweet(tweet)
  if any(word in tweet for word in positive_keywords):
        return 'POS'
  elif any(word in tweet for word in negative_keywords):
        return 'NEG'
  else:
        return 'NEUTRAL'

In [16]:
rule_based_predictions = [rule_based_sentiment_analysis(tweet) for tweet in X_test]
y_test_mapped = y_test.replace({'POSITIVE': 'POS', 'NEGATIVE': 'NEG'})
correct_predictions = sum(pred == true for pred, true in zip(rule_based_predictions, y_test_mapped))
accuracy = correct_predictions / len(y_test)
print(f"Rule-based Accuracy: {accuracy:.2%}")

Rule-based Accuracy: 10.01%


In [17]:
example_tweet = "#سيدنا_النبي كان طيب الرائحة، وإذا صافحه أحد ظل بيده عطره سائر اليوم صلى الله عليك يا سيدي يا رسول الله"
print(f"Sentiment (ML-based) of example tweet: {nb_classifier.predict(vectorizer.transform([preprocess_tweet(example_tweet)]))[0]}")
print(f"Sentiment (Rule-based) of example tweet: {rule_based_sentiment_analysis(example_tweet)}")

Sentiment (ML-based) of example tweet: POS
Sentiment (Rule-based) of example tweet: POS


In [19]:
import random

random_index = random.randint(0, len(df) - 1)
example_tweet = df.loc[random_index, 'cleaned_tweet']
print("Selected tweet for analysis:")
print(example_tweet)

ml_sentiment = nb_classifier.predict(vectorizer.transform([example_tweet]))[0]
rule_based_sentiment = rule_based_sentiment_analysis(example_tweet)

print(f"Sentiment (ML-based) of selected tweet: {ml_sentiment}")
print(f"Sentiment (Rule-based) of selected tweet: {rule_based_sentiment}")

Selected tweet for analysis:
 اللاعبين بعــد هـدف الاعجـوبة في مرمى فيـاريال 
Sentiment (ML-based) of selected tweet: OBJ
Sentiment (Rule-based) of selected tweet: NEUTRAL


#Knowledge-based#

In [25]:
sentiment_lexicon = {
    'جيد': 1, 'رائع': 2, 'ممتاز': 3,
    'سيء': -1, 'فظيع': -2, 'مروع': -3
}

In [26]:
def knowledge_based_sentiment_analysis(tweet):
    words = tweet.split()
    sentiment_score = 0
    found_words = 0

    for word in words:
        if word in sentiment_lexicon:
            sentiment_score += sentiment_lexicon[word]
            found_words += 1

    if found_words > 0:
        sentiment_score /= found_words

    if sentiment_score > 0:
        return 'POSITIVE'
    elif sentiment_score < 0:
        return 'NEGATIVE'
    else:
        return 'NEUTRAL'

In [27]:
# Apply the knowledge-based sentiment analysis to each cleaned tweet
df['kb_sentiment'] = df['cleaned_tweet'].apply(knowledge_based_sentiment_analysis)

# Display some of the results
print(df[['cleaned_tweet', 'Column2', 'kb_sentiment']].head())

                                       cleaned_tweet  Column2 kb_sentiment
0                    بعد استقالة رئيس ننتظر استقالة       OBJ      NEUTRAL
1  أهنئ الدكتور أحمد جمال الدين القيادي بحزب مصر ...      POS      NEUTRAL
2  البرادعي يستقوى بامريكا مرةاخرى و يرسل عصام ال...      NEG      NEUTRAL
3   شاهد الآن أول فيلم استقصائي يتناول أسرار و كو...      OBJ      NEUTRAL
4  الوالدة لو اقولها بخاطري حشيشة تضحك بس من اقول...  NEUTRAL      NEUTRAL


In [28]:
label_mapping = {
    'POS': 'POSITIVE',
    'NEG': 'NEGATIVE',
    'NEUTRAL': 'NEUTRAL',
    'OBJ': 'NEUTRAL'
}
df['mapped_labels'] = df['Column2'].map(label_mapping)

correct_predictions = (df['kb_sentiment'] == df['mapped_labels']).sum()
total_predictions = len(df)
accuracy = correct_predictions / total_predictions
print(f"Knowledge-Based Sentiment Analysis Accuracy: {accuracy:.2%}")

Knowledge-Based Sentiment Analysis Accuracy: 75.02%
