In [None]:
!gdown --id 1GH682t9d8UjKtusxRa6bl0V0lkQ0DsVV

Downloading...
From: https://drive.google.com/uc?id=1GH682t9d8UjKtusxRa6bl0V0lkQ0DsVV
To: /content/train.csv
100% 17.5M/17.5M [00:00<00:00, 50.9MB/s]


In [None]:
import pandas as pd
import datetime

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from scipy.stats import randint
import time


In [None]:
train_df = pd.read_csv('train.csv')
train_df.head()

Unnamed: 0,text,dialect
0,حاطينهم فوق التلاجة ولا تحت الدولاب,LY
1,واقعة سيد عبد النعيم بعين طفل عمره سنة عمرو شوقا,EG
2,باقي ايام رفع الحظر وامريكا في المشيه والجيه ت...,SD
3,خالص ما مصدق عم يغني صراحة الارتب حماقي ما بعر...,LB
4,زعما الناس تقدر تطلع وتعتصم قدام بو زي ما دارو...,LY


In [None]:
train_df.dropna(inplace=True)

In [14]:
# Define the parameter grid for random search
param_grid = {
    'tfidf__ngram_range': [(1, 1), (1, 2), (2, 2), (1, 3), (1,4)],  # ngram parameter options
    'tfidf__max_features': randint(1000, 10000),  # max_features parameter options
}

# Create a separate pipeline for each machine learning model
logreg_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('model', LogisticRegression())
])

mnb_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('model', MultinomialNB())
])


# Update the parameter grid for each pipeline
logreg_param_grid = param_grid.copy()
logreg_param_grid['model'] = [LogisticRegression()]
logreg_param_grid['model__C'] = [1,5,10,20,100]

mnb_param_grid = param_grid.copy()
mnb_param_grid['model'] = [MultinomialNB()]
mnb_param_grid['model__alpha'] = [0.1, 0.5, 1.0]

# Create a dictionary of pipelines and parameter grids
pipelines = {
    'logreg': (logreg_pipeline, logreg_param_grid),
    'mnb': (mnb_pipeline, mnb_param_grid),
}


In [15]:
# Run random search for each pipeline
for model_name, (pipeline, param_grid) in pipelines.items():
    print(model_name.center(50,'-'))
    start_time = time.time()
    # Create a RandomizedSearchCV instance
    random_search = RandomizedSearchCV(
        estimator=pipeline, 
        param_distributions=param_grid,
        n_iter=5,  # number of parameter settings that are sampled
        cv=3,  # number of cross-validation folds
        scoring='f1_micro',  # evaluation metric
        verbose=1,  # print progress messages
        n_jobs=-1  # use all available CPU cores
    )

    # Fit the RandomizedSearchCV instance on the text data
    random_search.fit(train_df['text'], train_df['dialect'])
    end_time = time.time()

    print(f'Best estimator for {model_name} = {random_search.best_estimator_}')
    print(f'Best score for {model_name} = {random_search.best_score_}')
    print(f'Time taken for {model_name} = {end_time - start_time:.2f} seconds')



----------------------logreg----------------------
Fitting 3 folds for each of 5 candidates, totalling 15 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Best estimator for logreg = Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_features=9979, ngram_range=(1, 4))),
                ('model', LogisticRegression(C=1))])
Best score for logreg = 0.8043681156336074
Time taken for logreg = 308.54 seconds
-----------------------mnb------------------------
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best estimator for mnb = Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_features=8078, ngram_range=(1, 2))),
                ('model', MultinomialNB(alpha=0.5))])
Best score for mnb = 0.7924167558556879
Time taken for mnb = 170.50 seconds


In [16]:
# Create a separate pipeline for each machine learning model
best_model_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('model', LogisticRegression(C=5))
])


In [17]:
def print_report(pipe, x_test, y_test):
    y_pred = pipe.predict(x_test)
    report = metrics.classification_report(y_test, y_pred)
    print(report)
    print("accuracy: {:0.3f}".format(metrics.accuracy_score(y_test, y_pred)))



In [18]:
X_train, X_test, y_train, y_test = train_test_split(train_df.text, train_df.dialect, test_size=0.1, random_state=42)

In [19]:
from sklearn import metrics
best_model_pipeline.fit(X_train, y_train)

print_report(best_model_pipeline, X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

          EG       0.83      0.92      0.87      5084
          LB       0.84      0.84      0.84      2432
          LY       0.80      0.82      0.81      3341
          MA       0.88      0.66      0.75      1068
          SD       0.85      0.59      0.70      1363

    accuracy                           0.83     13288
   macro avg       0.84      0.77      0.80     13288
weighted avg       0.83      0.83      0.82     13288

accuracy: 0.827


In [20]:
from sklearn.utils import shuffle
train_df = shuffle(train_df,random_state=42)

In [21]:
best_model_pipeline.fit(train_df.text, train_df.dialect);

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
import joblib
joblib.dump(best_model_pipeline, 'TF-IDF-model.joblib')

In [None]:
from google.colab import drive
import shutil
drive.mount('/content/drive')
folder_dir = "/content/drive/MyDrive/Colab Notebooks/NLP/Final-Project/Models/"
shutil.copy2('TF-IDF-model.joblib', folder_dir)