In [1]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib
import pandas as pd
from sklearn.preprocessing import StandardScaler
import time
import ast

In [2]:
# Read the review CSV into a DataFrame, decoding the file explicitly as UTF-8.
df = pd.read_csv(
    'arabic_reviews_sequential_clean.csv',
    encoding='utf-8',
)

In [3]:
# Preview the first rows of the loaded frame as this cell's output.
df.head()

Unnamed: 0,label,lemmatized_content,bigrams,trigrams,fourgrams
0,1,"['نعال', 'مريحه', 'رتد', 'نعال', 'هي', 'دافئ',...","['نعال_مريحه', 'مريحه_رتد', 'رتد_نعال', 'نعال_...","['نعال_مريحه_رتد', 'مريحه_رتد_نعال', 'رتد_نعال...","['نعال_مريحه_رتد_نعال', 'مريحه_رتد_نعال_هي', '..."
1,1,"['منتج', 'جميل', 'خدم', 'ئه', 'قد', 'اشتري', '...","['منتج_جميل', 'جميل_خدم', 'خدم_ئه', 'ئه_قد', '...","['منتج_جميل_خدم', 'جميل_خدم_ئه', 'خدم_ئه_قد', ...","['منتج_جميل_خدم_ئه', 'جميل_خدم_ئه_قد', 'خدم_ئه..."
2,1,"['جيد', 'اشياء', 'صغيره', 'عمل', 'شكل', 'جيد',...","['جيد_اشياء', 'اشياء_صغيره', 'صغيره_عمل', 'عمل...","['جيد_اشياء_صغيره', 'اشياء_صغيره_عمل', 'صغيره_...","['جيد_اشياء_صغيره_عمل', 'اشياء_صغيره_عمل_شكل',..."
3,0,"['هي', 'غايه', 'غايه', 'نت', 'شتر', 'حذر', 'جد...","['هي_غايه', 'غايه_غايه', 'غايه_نت', 'نت_شتر', ...","['هي_غايه_غايه', 'غايه_غايه_نت', 'غايه_نت_شتر'...","['هي_غايه_غايه_نت', 'غايه_غايه_نت_شتر', 'غايه_..."
4,1,"['اشخاص', 'حب', 'ضحك', 'قط', 'ان', 'بتسم', 'عن...","['اشخاص_حب', 'حب_ضحك', 'ضحك_قط', 'قط_ان', 'ان_...","['اشخاص_حب_ضحك', 'حب_ضحك_قط', 'ضحك_قط_ان', 'قط...","['اشخاص_حب_ضحك_قط', 'حب_ضحك_قط_ان', 'ضحك_قط_ان..."


In [4]:
# Columns whose cells were written to the CSV as the text form of a Python list.
list_columns = ['lemmatized_content', 'bigrams', 'trigrams', 'fourgrams']


def _parse_list_cell(value):
    """Turn one CSV cell back into a Python list.

    Values that are already lists are returned untouched so that re-running
    this cell is harmless; anything else is handed to ``ast.literal_eval``,
    which only evaluates Python literals and raises on malformed input.
    """
    if isinstance(value, list):
        return value
    return ast.literal_eval(value)


# Convert each stringified list column back into real lists. Guarding against
# already-parsed values keeps the cell idempotent: the unguarded version raised
# ValueError on a second run because literal_eval does not accept list objects.
for col in list_columns:
    df[col] = df[col].apply(_parse_list_cell)

In [5]:
# Spot-check: first bigram of the row labelled 0, to confirm the cell is now a list.
df.loc[0, 'bigrams'][0]

'نعال_مريحه'

In [6]:
# Columns fed to the model: the token list plus its bigram and trigram lists.
features = ['lemmatized_content', 'bigrams', 'trigrams']

X, y = df[features], df['label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
def add_len_feats(X):
    """Return a copy of ``X`` extended with two length features.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain a ``lemmatized_content`` column whose cells are lists of
        string tokens.

    Returns
    -------
    pandas.DataFrame
        A new frame (``X`` itself is not modified) with two extra columns:
        ``txt_len`` - character length of the tokens joined by single spaces,
        ``word_cnt`` - number of tokens in the list.
    """
    token_lists = X['lemmatized_content']

    return X.assign(
        # Length of the space-joined text, i.e. characters plus separators.
        txt_len=token_lists.apply(lambda toks: len(" ".join(toks))),
        # Number of tokens in each list.
        word_cnt=token_lists.apply(len),
    )

# Add the length columns to both splits so they share the same schema.
X_train, X_test = add_len_feats(X_train), add_len_feats(X_test)

In [8]:
def whitespace_tokenizer(text):
    """Split ``text`` into tokens on runs of whitespace.

    Parameters
    ----------
    text : str or scalar
        The value to tokenize. Missing values (``None``, ``NaN``, ``pd.NA``)
        produce an empty list; any other non-string scalar is converted with
        ``str()`` before splitting.

    Returns
    -------
    list of str
        The whitespace-separated tokens; ``[]`` for missing, empty or
        whitespace-only input.
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return []
        # Coerce before using any string method: calling .strip() on a
        # non-string value (e.g. an int) would raise AttributeError.
        text = str(text)
    # str.split() with no separator already returns [] for empty or
    # whitespace-only strings, so no separate blank check is needed.
    return text.split()

def comma_tokenizer(text):
    """Split ``text`` into tokens on the ``,`` character.

    Parameters
    ----------
    text : str or scalar
        The value to tokenize. Missing values (``None``, ``NaN``, ``pd.NA``)
        produce an empty list; any other non-string scalar is converted with
        ``str()`` before splitting.

    Returns
    -------
    list of str
        The comma-separated pieces, exactly as they appear between commas
        (surrounding spaces are not removed); ``[]`` for missing, empty or
        whitespace-only input.
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return []
        # Coerce before using any string method: calling .strip() on a
        # non-string value (e.g. an int) would raise AttributeError.
        text = str(text)
    # Unlike a bare split(), split(',') yields [''] for blank input, so the
    # blank case has to be handled explicitly.
    if text.strip() == '':
        return []
    return text.split(',')

In [9]:
def identity_tokenizer(tokens):
    """Return ``tokens`` exactly as received.

    Used as the ``tokenizer`` callable of the TF-IDF vectorizers so that
    columns which already hold token lists are passed through untouched
    instead of being re-tokenized.
    """
    return tokens

# Settings shared by all three TF-IDF vectorizers. Every text column already
# holds a list of tokens, so tokenization is a pass-through and no lowercasing,
# accent stripping or regex token pattern is applied.
_pretokenized_tfidf_kwargs = dict(
    tokenizer=identity_tokenizer,
    token_pattern=None,
    lowercase=False,
    strip_accents=None,
    min_df=1,
)

# One TF-IDF block per token column plus standardized length features;
# any column not listed here is dropped.
preprocessor = ColumnTransformer(
    transformers=[
        (
            'lem',
            TfidfVectorizer(max_features=30000, **_pretokenized_tfidf_kwargs),
            'lemmatized_content',
        ),
        (
            'bi',
            TfidfVectorizer(max_features=10000, **_pretokenized_tfidf_kwargs),
            'bigrams',
        ),
        (
            'tri',
            TfidfVectorizer(max_features=10000, **_pretokenized_tfidf_kwargs),
            'trigrams',
        ),
        ('len_feats', StandardScaler(), ['txt_len', 'word_cnt']),
    ],
    remainder='drop',
)

In [10]:
# Candidate classifiers, each paired with the hyper-parameter grid to search.
# Grid keys use the 'model__' prefix so they address the pipeline step named 'model'.
models = {
    'KNN': (
        KNeighborsClassifier(),
        {'model__n_neighbors': [3, 5, 7]},
    ),
    'SVM': (
        SVC(),
        {'model__C': [0.1, 1, 10], 'model__kernel': ['linear', 'rbf']},
    ),
    'LR': (
        LogisticRegression(max_iter=1000),
        {'model__C': [0.1, 1, 10]},
    ),
}

# Running record of the best search so far; updated by the training loop.
best_model, best_score = None, 0

In [11]:
# Grid-search every candidate on the training split, report its cross-validated
# accuracy and held-out classification report, and remember the search with the
# highest cross-validated score.
for name, (model, params) in models.items():
    print(f"Training {name}...")
    pipe = Pipeline(steps=[
        ('prep', preprocessor),
        ('model', model)
    ])
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() follows the wall clock and can jump if the system
    # time is adjusted while a search is running.
    start_time = time.perf_counter()
    clf = GridSearchCV(pipe, params, cv=2, scoring='accuracy', n_jobs=-1)
    clf.fit(X_train, y_train)
    end_time = time.perf_counter()  # End timing

    print(f"{name} Best Score: {clf.best_score_}")
    print(f"{name} Training Time: {end_time - start_time:.2f} seconds")

    # Evaluate on the held-out split using the best parameters found.
    y_pred = clf.predict(X_test)
    print(classification_report(y_test, y_pred))

    # Model selection is based on the cross-validated score, not the test report.
    if clf.best_score_ > best_score:
        best_score = clf.best_score_
        best_model = clf

Training KNN...
KNN Best Score: 0.5689608357280835
KNN Training Time: 6.25 seconds
              precision    recall  f1-score   support

           0       0.60      0.52      0.55      1013
           1       0.56      0.64      0.60       983

    accuracy                           0.58      1996
   macro avg       0.58      0.58      0.58      1996
weighted avg       0.58      0.58      0.58      1996

Training SVM...
SVM Best Score: 0.8275092844169923
SVM Training Time: 53.35 seconds
              precision    recall  f1-score   support

           0       0.84      0.85      0.85      1013
           1       0.84      0.84      0.84       983

    accuracy                           0.84      1996
   macro avg       0.84      0.84      0.84      1996
weighted avg       0.84      0.84      0.84      1996

Training LR...
LR Best Score: 0.8369042331719314
LR Training Time: 4.83 seconds
              precision    recall  f1-score   support

           0       0.85      0.85      0.85 

In [12]:
# Persist the winning grid search, then the fitted preprocessing step taken
# from its best pipeline.
# NOTE(review): the vectorizers reference identity_tokenizer, so that function
# presumably has to be defined/importable wherever these files are loaded - confirm.
joblib.dump(best_model, 'best_model.pkl')
fitted_preprocessor = best_model.best_estimator_.named_steps['prep']
joblib.dump(fitted_preprocessor, 'vectorizer.pkl')

['vectorizer.pkl']