    # Import Important Libraries

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix,accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import joblib

# Upload Datasets

In [5]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
Train_df = pd.read_csv(r"C:\Users\alabdeen\Desktop\Ai-Book\Advanced NLP\Project_nlp\preprocessed_train_with_stopwords.csv")
Train_df.drop(columns='Unnamed: 0', inplace=True, errors='ignore')
Train_df

Unnamed: 0,text,label
0,زهقنا بقى من جو الخطب ده,EG
1,في روتين بتخاف يضل وفي روتين بتخاف يفل,LB
2,هه والله فكرني بيغ غي ادومة ديالنا مستيلي عليه,MA
3,بص زمان ايام البيجو السبع راكب محافظات واحد را...,EG
4,1 ممكن تبعتو رابيد عالحدود مع مصور وتنقلو الي ...,LB
...,...,...
118175,ميسي عليه غسل مواعين الغدا ورونالدو مواعين الع...,LY
118176,اليوم كل اصحابي لي احبهم غايبين عني,MA
118177,الحطب يبي ذهب ويبي 60 وقية حتي البطاطين يشكرو ...,LY
118178,ياريت يكون في دعم تبرع لدفاع المدني كل لازم يس...,LB


In [3]:
Test_df = pd.read_csv(r"C:\Users\alabdeen\Desktop\Ai-Book\Advanced NLP\Project_nlp\preprocessed_test_with_stopwords.csv")
Test_df.drop(columns='Unnamed: 0', inplace=True, errors='ignore')
Test_df

Unnamed: 0,text,label
0,مين دي يامرمر ماشاء الله,EG
1,ياجماعة في اللي يتلذذ بلجلد 09,LY
2,الحلقة اكيد حتكون ممتعة بس ياريت يا استاذة اسع...,MA
3,ابرا ودير عقل ياروحي,LY
4,اخرسي يا بوومة خربتي اليمن بجهلك وتآمرك,EG
...,...,...
29540,لا خلاص حتى نا بنسلم في التعليم ونمشيلهم هو حت...,LY
29541,كل سنه وانت طيب يا ابو قلب طيب,EG
29542,خلف السد شطبوا حضارة صور اخيرة لغرق احد القري ...,EG
29543,كان عزيز عليا قوس قزح دابا وليتي كنكرهو فاش كي...,MA


# TF-IDF

In [4]:
print(Train_df['text'].isna().sum())
print(Test_df['text'].isna().sum())

97
27


In [5]:
print("Rows with NaN values in Train_df:")
Train_df[Train_df['text'].isna()]

Rows with NaN values in Train_df:


Unnamed: 0,text,label
92,,EG
2287,,EG
3172,,MA
3353,,EG
6912,,EG
...,...,...
109103,,EG
109848,,EG
113544,,LB
115905,,MA


In [16]:
print("Rows with NaN values in Test_df:")
Test_df[Test_df['text'].isna()]

Rows with NaN values in Test_df:


Unnamed: 0,text,label
1004,,EG
1026,,EG
1189,,LY
1312,,EG
1793,,EG
2268,,LY
4752,,LY
5540,,EG
6643,,LB
7358,,EG


In [6]:
# Drop rows with NaN values in the 'text' column of Train_df
Train_df = Train_df.dropna(subset=['text'])

# Drop rows with NaN values in the 'text' column of Test_df
Test_df = Test_df.dropna(subset=['text'])

In [7]:
tfidf = TfidfVectorizer(use_idf=True)
X_train_counts = tfidf.fit_transform(Train_df['text'])
X_test_counts = tfidf.transform(Test_df['text'])

# ML Models

In [8]:
clf = LogisticRegression(random_state=42).fit(X_train_counts, Train_df['label'])
y_pred=clf.predict(X_test_counts)
print(confusion_matrix(Test_df['label'],y_pred))
print(classification_report(Test_df['label'],y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[[10695   188   487    34   111]
 [  454  4599   395    26    44]
 [  831   272  6042    74    74]
 [  285   133   358  1475    55]
 [  665   158   398    26  1639]]
              precision    recall  f1-score   support

          EG       0.83      0.93      0.88     11515
          LB       0.86      0.83      0.85      5518
          LY       0.79      0.83      0.81      7293
          MA       0.90      0.64      0.75      2306
          SD       0.85      0.57      0.68      2886

    accuracy                           0.83     29518
   macro avg       0.85      0.76      0.79     29518
weighted avg       0.83      0.83      0.82     29518



In [9]:
clf_balance = LogisticRegression(random_state=42,class_weight='balanced',solver='newton-cg',C=10).fit(X_train_counts, Train_df['label'])
y_pred=clf_balance.predict(X_test_counts)
print(confusion_matrix(Test_df['label'],y_pred))
print(classification_report(Test_df['label'],y_pred))

[[10076   300   577   166   396]
 [  250  4764   293   104   107]
 [  551   307  5967   236   232]
 [  148   120   215  1732    91]
 [  356   144   260    64  2062]]
              precision    recall  f1-score   support

          EG       0.89      0.88      0.88     11515
          LB       0.85      0.86      0.85      5518
          LY       0.82      0.82      0.82      7293
          MA       0.75      0.75      0.75      2306
          SD       0.71      0.71      0.71      2886

    accuracy                           0.83     29518
   macro avg       0.80      0.80      0.80     29518
weighted avg       0.83      0.83      0.83     29518



# save the best model

In [27]:
joblib.dump(clf_balance, 'logistic_regression_model.pkl')
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

# reload the model and test it

In [28]:
%%capture
!pip install tnkeeh

In [15]:
import tnkeeh as tn
import re

def predict_label(text):

    # text preprocessing
    cleander = tn.Tnkeeh(remove_diacritics=True,
                     remove_html_elements=True,
                     remove_twitter_meta=True,
                     remove_links=True,
                     remove_english=True,
                     remove_repeated_chars=True,
                     remove_long_words=True,
                     normalize=True
                     )

    text = cleander.clean_raw_text(text)
    text = text[0]

    text = text.replace(r'[0-9٠-٩]', '')
    text = text.replace("؟", "")
    text = text.replace("@", "")
    text = text.replace("_", "")
    text = text.replace("-", "")

    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F700-\U0001F77F"  # alchemical symbols
                               u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                               u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                               u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                               u"\U0001FA00-\U0001FA6F"  # Chess Symbols
                               u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                               u"\U00002702-\U000027B0"  # Dingbats
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)

    arabic_punctuation_pattern = r'[^\w\s\u0621-\u063A\u0641-\u064A]'
    text = re.sub(arabic_punctuation_pattern,'',text)

    text = re.sub(r'\s+', ' ', text).strip()

    # Load the model and the vectorizer
    clf_balance = joblib.load('logistic_regression_model.pkl')
    tfidf = joblib.load('tfidf_vectorizer.pkl')

    # Transform the input text
    text_transformed = tfidf.transform([text])

    # Predict the label
    predicted_label = clf_balance.predict(text_transformed)

    return predicted_label[0]

# Example prediction
text1 = "بقولك"
predicted_label = predict_label(text1)
print(f"The predicted label for '{text1}' is: {predicted_label}")
text2 = "يازول"
predicted_label = predict_label(text2)
print(f"The predicted label for '{text2}' is: {predicted_label}")
text3 = "ياولد"
predicted_label = predict_label(text3)
print(f"The predicted label for '{text3}' is: {predicted_label}")

The predicted label for 'بقولك' is: EG
The predicted label for 'يازول' is: SD
The predicted label for 'ياولد' is: LY


In [15]:
import tnkeeh as tn
import re

def predict_label(text):

    # text preprocessing
    cleander = tn.Tnkeeh(remove_diacritics=True,
                     remove_html_elements=True,
                     remove_twitter_meta=True,
                     remove_links=True,
                     remove_english=True,
                     remove_repeated_chars=True,
                     remove_long_words=True,
                     normalize=True
                     )

    text = cleander.clean_raw_text(text)
    text = text[0]

    text = text.replace(r'[0-9٠-٩]', '')
    text = text.replace("؟", "")
    text = text.replace("@", "")
    text = text.replace("_", "")
    text = text.replace("-", "")

    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F700-\U0001F77F"  # alchemical symbols
                               u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                               u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                               u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                               u"\U0001FA00-\U0001FA6F"  # Chess Symbols
                               u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                               u"\U00002702-\U000027B0"  # Dingbats
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)

    arabic_punctuation_pattern = r'[^\w\s\u0621-\u063A\u0641-\u064A]'
    text = re.sub(arabic_punctuation_pattern,'',text)

    text = re.sub(r'\s+', ' ', text).strip()

    # Load the model and the vectorizer
    clf_balance = joblib.load('logistic_regression_model.pkl')
    tfidf = joblib.load('tfidf_vectorizer.pkl')

    # Transform the input text
    text_transformed = tfidf.transform([text])

    # Predict the label
    predicted_label = clf_balance.predict(text_transformed)

    return predicted_label[0]

# Example prediction
text1 = "بقولك"
predicted_label = predict_label(text1)
print(f"The predicted label for '{text1}' is: {predicted_label}")
text2 = "يازول"
predicted_label = predict_label(text2)
print(f"The predicted label for '{text2}' is: {predicted_label}")
text3 = "ياولد"
predicted_label = predict_label(text3)
print(f"The predicted label for '{text3}' is: {predicted_label}")

The predicted label for 'بقولك' is: EG
The predicted label for 'يازول' is: SD
The predicted label for 'ياولد' is: LY
