In [11]:
import os
import pickle
import re

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB


# 1. Load the data
# The CSV location can be overridden with the TWEETS_CSV environment variable so
# the notebook is not tied to one machine's directory layout. When the variable
# is unset, the original hard-coded path is used, so existing runs are unaffected.
DATA_PATH = os.environ.get(
    "TWEETS_CSV",
    r"C:\Users\ComputerWorld\Desktop\Al\data\Tweets.csv",
)

df = pd.read_csv(DATA_PATH)
df.head()


Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [12]:
# We only need the tweet text and its sentiment label.
# Rename first, then select by the final column names: `rename` silently skips
# labels that are absent, so this cell can be re-executed after `df` has already
# been reduced/renamed without raising KeyError on 'airline_sentiment'.
df = (
    df.rename(columns={'airline_sentiment': 'sentiment'})
    [['sentiment', 'text']]
    .dropna()  # drop rows where either the label or the text is missing
)

# ---------------------------
# 2. Text cleaning
# ---------------------------
def clean_text(s):
    """Normalize a raw tweet into a lowercase string of ASCII letters and digits.

    The input is converted with ``str()`` and then, in order:
    links starting with ``http`` are blanked out, ``@mentions`` are blanked out,
    every character that is not an ASCII letter, a digit, or whitespace is
    replaced by a space, runs of whitespace are collapsed to one space, the
    ends are stripped, and the result is lowercased.

    Parameters
    ----------
    s : any
        The raw text; non-string values are stringified first.

    Returns
    -------
    str
        The cleaned, lowercased text (possibly empty).
    """
    text = str(s)
    # Order matters: links and mentions must go before punctuation is stripped,
    # otherwise the "http://" and "@" markers would no longer be recognizable.
    for pattern in (
        r"http\S+",          # links
        r"@\w+",             # @mentions
        r"[^A-Za-z0-9\s]",   # special characters
    ):
        text = re.sub(pattern, " ", text)
    collapsed = re.sub(r"\s+", " ", text)
    # Lowercasing is deliberately last so it sees only the already-filtered text.
    return collapsed.strip().lower()

# Store the cleaned version of every tweet alongside the original text.
df['text_clean'] = df['text'].map(clean_text)

In [13]:
# 3. Split the data
# ---------------------------
# Hold out 20% of the tweets for testing. Stratifying on the label keeps the
# class proportions the same in both partitions; the fixed random_state makes
# the split repeatable.
X = df['text_clean'].values
y = df['sentiment'].values

split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# ---------------------------
# 4. Convert the text into features (TF-IDF)
# ---------------------------
# Unigrams and bigrams, limited to 5000 features. The vectorizer is fitted on
# the training split only; the test split is merely transformed with it.
tfidf_options = {"max_features": 5000, "ngram_range": (1, 2)}
vectorizer = TfidfVectorizer(**tfidf_options)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [16]:
# 5. Train Naive Bayes
# ---------------------------
# NOTE(review): the recorded run of this cell shows recall of 0.98 for
# "negative" but only 0.30 / 0.46 for "neutral" / "positive", i.e. the model
# leans heavily toward the majority class. Worth trying fit_prior=False or
# ComplementNB — TODO confirm on a re-run before changing.
clf = MultinomialNB(alpha=1.0).fit(X_train_vec, y_train)

# 6. Evaluation
# ---------------------------
y_pred = clf.predict(X_test_vec)

# Compute every metric first, then report them in one place.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)

print("✅ الدقة:", accuracy)
print("\n📋 تقرير التصنيف:\n", report)
print("\n📊 مصفوفة الالتباس:\n", matrix)


✅ الدقة: 0.7513661202185792

📋 تقرير التصنيف:
               precision    recall  f1-score   support

    negative       0.74      0.98      0.84      1836
     neutral       0.74      0.30      0.43       620
    positive       0.87      0.46      0.60       472

    accuracy                           0.75      2928
   macro avg       0.78      0.58      0.62      2928
weighted avg       0.76      0.75      0.72      2928


📊 مصفوفة الالتباس:
 [[1797   25   14]
 [ 416  185   19]
 [ 214   40  218]]


In [17]:
# Sanity-check the fitted model on a few hand-written sentences. They go
# through the same cleaning function and the already-fitted vectorizer as the
# training data before being classified.
examples = [
    "I love this airline, they were so helpful!",
    "My flight was delayed and the staff was rude.",
    "The flight was okay, nothing special.",
]
examples_clean = list(map(clean_text, examples))
examples_vec = vectorizer.transform(examples_clean)
preds = clf.predict(examples_vec)

print("\n🔮 تنبؤات أمثلة جديدة:")
for sentence, label in zip(examples, preds):
    print(f" - '{sentence}' => {label}")



🔮 تنبؤات أمثلة جديدة:
 - 'I love this airline, they were so helpful!' => negative
 - 'My flight was delayed and the staff was rude.' => negative
 - 'The flight was okay, nothing special.' => negative
