In [1]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib
import pandas as pd
import time
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the dataset CSV (decoded as UTF-8) into the working frame `df`.
dataset_path = 'arabic_pos_ner_tagged.csv'
df = pd.read_csv(dataset_path, encoding='utf-8')

In [3]:
# Threshold applied to the `arabic_ratio` column. Despite the name, the value
# is compared directly against the column as a fraction (0.6), not as a 0-100
# percentage.
MinimumArabicPercentage = 0.6

# NOTE: `df` is overwritten below, so re-running this cell alone reports the
# already-filtered row count as "Original rows".
original_count = len(df)

# Keep only rows at or above the threshold. Rows whose ratio is NaN compare as
# False and are therefore dropped as well.
# `.copy()` makes the result an independent frame: later cells assign columns
# on `df`, and doing that on a boolean-mask slice triggers pandas'
# SettingWithCopyWarning.
df = df[df['arabic_ratio'] >= MinimumArabicPercentage].copy()

filtered_count = len(df)

print(f"Original rows: {original_count:,}")
print(f"Rows after filtering: {filtered_count:,}")
print(f"Rows dropped: {original_count - filtered_count:,}")

Original rows: 10,000
Rows after filtering: 9,764
Rows dropped: 236


In [4]:
# Quick sanity check of the text feature columns: show the first value of each
# column (truncated), its Python type and the NaN count, then count
# whitespace-only strings per column.
print("=== DATA INSPECTION ===")
print(f"Dataset shape: {df.shape}")
print(f"\nFirst few rows of each column:")

preview_columns = ['text_stem', 'char_3grams', 'char_4grams', 'pos_tagged']
for column_name in preview_columns:
    series = df[column_name]
    first_value = series.iloc[0]
    print(f"\n{column_name}:")
    print(first_value[:100])  # only the first 100 characters
    print(f"Type: {type(first_value)}")
    print(f"Has NaN: {series.isna().sum()}")

print("\n=== EMPTY STRING CHECK ===")
# 'ner_tagged' is only included here, not in the preview loop above.
for column_name in preview_columns + ['ner_tagged']:
    is_blank = df[column_name].str.strip() == ''
    print(f"{column_name} - Empty strings: {is_blank.sum()}")

=== DATA INSPECTION ===
Dataset shape: (9764, 7)

First few rows of each column:

text_stem:
نعل ريح ردي هذه نعل كثر فهي دفي ريح سعر عقل لجد ريع زوج ونا علي حد سوا لدي زوج ونح نحب
Type: <class 'str'>
Has NaN: 0

char_3grams:
الن لنع نعا عال ال  ل ا  ال الم لمر مري ريح يحه حه  ه     ا  ار ارت رتد تدي دي  ي ه  هذ هذه ذه  ه ا 
Type: <class 'str'>
Has NaN: 0

char_4grams:
النع لنعا نعال عال  ال ا ل ال  الم المر لمري مريح ريحه يحه  حه   ه  ا   ار  ارت ارتد رتدي تدي  دي ه 
Type: <class 'str'>
Has NaN: 0

pos_tagged:
النعال_NOUN|المريحة_ADJ|:_PUNCT|أرتدي_VERB|هذه_DET|النعال_NOUN|كثيرً_ADJ|ا_PART|!_PUNCT|ف_CCONJ|هي_P
Type: <class 'str'>
Has NaN: 0

=== EMPTY STRING CHECK ===
text_stem - Empty strings: 0
char_3grams - Empty strings: 0
char_4grams - Empty strings: 0
pos_tagged - Empty strings: 0
ner_tagged - Empty strings: 0


In [5]:
def whitespace_tokenizer(text):
    """Split ``text`` into tokens on runs of whitespace.

    Parameters
    ----------
    text : str or scalar
        The document to tokenize. Non-string scalars are converted with
        ``str()`` before splitting.

    Returns
    -------
    list of str
        The whitespace-separated tokens. Missing values (as reported by
        ``pd.isna``) and empty or whitespace-only strings yield ``[]``.
    """
    if pd.isna(text):
        return []
    # Coerce before any string method is used, so a non-string scalar does not
    # raise AttributeError. str.split() with no separator already returns []
    # for empty or whitespace-only input, so no separate blank check is needed.
    return str(text).split()

def pipe_tokenizer(text):
    """Split ``text`` into tokens on the ``|`` character.

    Parameters
    ----------
    text : str or scalar
        The document to tokenize. Non-string scalars are converted with
        ``str()`` before splitting.

    Returns
    -------
    list of str
        The ``|``-separated pieces, unchanged, with empty or whitespace-only
        pieces removed. Missing values (as reported by ``pd.isna``) and blank
        strings yield ``[]``.
    """
    if pd.isna(text):
        return []
    # Doubled or trailing separators ("a||b|") would otherwise produce ''
    # tokens, which end up as a meaningless vocabulary entry; drop them.
    return [piece for piece in str(text).split('|') if piece.strip()]

# Replace missing 'ner_tagged' values with ''. `assign` builds a new frame
# instead of writing a column into `df` in place: `df` is the result of a
# boolean-mask filter, and in-place column assignment on such a slice triggers
# pandas' SettingWithCopyWarning.
df = df.assign(ner_tagged=df['ner_tagged'].fillna(''))

# Text columns fed to the per-column vectorizers.
features = ['text_stem', 'char_3grams', 'char_4grams', 'pos_tagged', 'ner_tagged']

X = df[features]
y = df['label']

# Hold out 25% of the rows for testing; stratifying on the label keeps the
# class proportions the same in both splits, and the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)


def add_len_feats(X):
    """Return a copy of ``X`` with two length columns derived from 'text_stem'.

    ``txt_len`` is the character count of each 'text_stem' value and
    ``word_cnt`` is its number of whitespace-separated tokens. Missing values
    are treated as empty strings. The input frame is not modified.
    """
    stems = X['text_stem'].fillna('')
    char_counts = stems.str.len()
    token_counts = stems.str.split().str.len()
    return X.assign(txt_len=char_counts, word_cnt=token_counts)

# Add the txt_len / word_cnt columns to both splits; add_len_feats returns new
# frames, which replace the previous X_train / X_test.
X_train, X_test = (add_len_feats(split) for split in (X_train, X_test))

In [6]:
# Note carried over from an earlier run: best model LR, best CV score 0.8504.
# (The run recorded further down in this notebook reports 0.8596 for LR, so
# the figure above is stale.)


def _pretokenized_tfidf(tokenizer, max_features, min_df, **extra):
    """Build a TfidfVectorizer for a column that is split by ``tokenizer``.

    Case is left untouched (``lowercase=False``) and ``token_pattern`` is set
    to None because a custom tokenizer is supplied. Any ``extra`` keyword
    arguments are passed straight to TfidfVectorizer.
    """
    return TfidfVectorizer(
        tokenizer=tokenizer,
        token_pattern=None,
        lowercase=False,
        max_features=max_features,
        min_df=min_df,
        **extra,
    )


# One TF-IDF vectorizer per text column plus scaled length features; any
# column not listed here is dropped.
# NOTE(review): the char_3grams / char_4grams samples appear to contain
# n-grams with embedded spaces; splitting on whitespace would fragment those -
# confirm how these columns were generated.
preprocessor = ColumnTransformer(
    transformers=[
        # Stemmed text: whitespace tokens, unigrams and bigrams.
        ('stem',
         _pretokenized_tfidf(whitespace_tokenizer, 20000, 2,
                             ngram_range=(1, 2), strip_accents=None),
         'text_stem'),

        # Character 3-grams and 4-grams: whitespace-separated in the column.
        ('tri', _pretokenized_tfidf(whitespace_tokenizer, 5000, 1), 'char_3grams'),
        ('four', _pretokenized_tfidf(whitespace_tokenizer, 5000, 1), 'char_4grams'),

        # POS- and NER-tagged text: items separated by '|'; min_df=2 drops
        # items seen in only one document.
        ('pos', _pretokenized_tfidf(pipe_tokenizer, 8000, 2), 'pos_tagged'),
        ('ner', _pretokenized_tfidf(pipe_tokenizer, 5000, 2), 'ner_tagged'),

        # Numeric length features are standardised.
        ('len_feats', StandardScaler(), ['txt_len', 'word_cnt']),
    ],
    remainder='drop',
)

In [7]:
# Smoke-test the preprocessor on the training split. If it fails, re-run each
# text transformer on its own column to show which one is responsible, then
# re-raise the original error.
print("\n=== TESTING PREPROCESSOR ===")
try:
    X_train_transformed = preprocessor.fit_transform(X_train, y_train)
    print(f"✓ Preprocessor works! Shape: {X_train_transformed.shape}")
    print(f"  Number of features: {X_train_transformed.shape[1]}")
except Exception as e:
    print(f"✗ Preprocessor failed: {e}")
    print("\nTrying to identify which transformer failed...")

    # Test each transformer individually
    for name, transformer, columns in preprocessor.transformers:
        if name == 'len_feats':
            continue
        # Select the data exactly as the ColumnTransformer does: a string
        # selector yields a 1-D Series (one document per row). Wrapping the
        # name in a list would hand the vectorizer a 2-D frame, which it
        # iterates by column label rather than by row. The values are passed
        # unmodified so the isolated run sees the same input that failed.
        column_data = X_train[columns]
        try:
            transformer.fit_transform(column_data)
            print(f"  ✓ {name} works")
        except Exception as transformer_error:
            print(f"  ✗ {name} failed: {transformer_error}")
            # Show sample data
            print(f"    Sample data: {column_data.iloc[0]!r}")

    # Re-raise the original preprocessor error.
    raise


=== TESTING PREPROCESSOR ===
✓ Preprocessor works! Shape: (7323, 39477)
  Number of features: 39477


In [8]:
# Hyper-parameter grids; the 'model__' prefix targets the pipeline step named
# 'model'.
lr_grid = {'model__C': [0.01, 0.1, 1.0, 10.0, 100.0]}
svm_grid = {'model__C': [0.1, 1.0, 10.0], 'model__kernel': ['linear']}
knn_grid = {
    'model__n_neighbors': [3, 5, 7],
    'model__weights': ['uniform', 'distance'],
}

# Candidate classifiers: short name -> (estimator, parameter grid).
models = {
    'LR': (LogisticRegression(max_iter=1000, random_state=42), lr_grid),
    'SVM': (SVC(random_state=42), svm_grid),
    'KNN': (KNeighborsClassifier(), knn_grid),
}

In [9]:
# Grid-search every candidate with 3-fold CV on the training split, print its
# CV score and test-set report, and keep the search with the highest CV score.
best_model = None
best_score = 0
results = {}

# Settings shared by every grid search.
search_options = dict(cv=3, scoring='accuracy', n_jobs=-1, verbose=1)

for name, (model, params) in models.items():
    print(f"\n{'='*60}")
    print(f"Training {name}...")
    print(f"{'='*60}")

    # Preprocessing and classifier are searched together as one pipeline.
    pipe = Pipeline(steps=[('prep', preprocessor), ('model', model)])

    start_time = time.time()

    try:
        clf = GridSearchCV(pipe, params, **search_options)
        clf.fit(X_train, y_train)
        end_time = time.time()
        elapsed = end_time - start_time

        # Record the outcome of this search under the model's name.
        results[name] = dict(
            best_score=clf.best_score_,
            best_params=clf.best_params_,
            training_time=elapsed,
        )

        print(f"\n{name} Results:")
        print(f"  Best CV Score: {clf.best_score_:.4f}")
        print(f"  Best Parameters: {clf.best_params_}")
        print(f"  Training Time: {elapsed:.2f} seconds")

        # Held-out performance is reported only; selection uses the CV score.
        y_pred = clf.predict(X_test)
        print(f"\n{name} Test Set Performance:")
        print(classification_report(y_test, y_pred))

        # A strict '>' means the earlier model wins a tie.
        if clf.best_score_ > best_score:
            best_score, best_model = clf.best_score_, clf

    except Exception as e:
        # Best effort: report the failure and move on to the next candidate.
        print(f"✗ {name} failed: {e}")


Training LR...
Fitting 3 folds for each of 5 candidates, totalling 15 fits

LR Results:
  Best CV Score: 0.8596
  Best Parameters: {'model__C': 10.0}
  Training Time: 15.95 seconds

LR Test Set Performance:
              precision    recall  f1-score   support

           0       0.86      0.87      0.87      1257
           1       0.86      0.85      0.86      1184

    accuracy                           0.86      2441
   macro avg       0.86      0.86      0.86      2441
weighted avg       0.86      0.86      0.86      2441


Training SVM...
Fitting 3 folds for each of 3 candidates, totalling 9 fits

SVM Results:
  Best CV Score: 0.8518
  Best Parameters: {'model__C': 0.1, 'model__kernel': 'linear'}
  Training Time: 319.85 seconds

SVM Test Set Performance:
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1257
           1       0.88      0.84      0.86      1184

    accuracy                           0.87      2441
   macro a

In [10]:
# Report the winning search and persist it together with its fitted
# preprocessing step.
if best_model is None:
    print("\n✗ No models trained successfully!")
else:
    print(f"\n{'='*60}")
    # Name of the first recorded model whose CV score equals the best score.
    winner = next(k for k, v in results.items() if v['best_score'] == best_score)
    print(f"Best Model: {winner}")
    print(f"Best Parameters: {best_model.best_params_}")
    print(f"Best CV Score: {best_score:.4f}")
    print(f"{'='*60}")

    # Save model and preprocessor
    # NOTE(review): the vectorizers reference tokenizer functions defined in
    # this notebook; loading these pickles elsewhere presumably requires the
    # same functions to be importable - confirm before deploying.
    joblib.dump(best_model, 'best_model.pkl')
    fitted_prep = best_model.best_estimator_.named_steps['prep']
    joblib.dump(fitted_prep, 'vectorizer.pkl')

    print("\n✓ Model saved as 'best_model.pkl'")
    print("✓ Preprocessor saved as 'vectorizer.pkl'")


Best Model: LR
Best Parameters: {'model__C': 10.0}
Best CV Score: 0.8596

✓ Model saved as 'best_model.pkl'
✓ Preprocessor saved as 'vectorizer.pkl'
