In [13]:
from google.colab import drive
import os

# Mount Google Drive at /content/drive; the model, report and plot paths
# used later in this notebook all live under /content/drive/MyDrive.
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import VotingClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pickle import dump
import itertools
import re
import pandas as pd


In [37]:
def clean_text_unicode(text):
    """Keep only Gujarati-block characters and whitespace in ``text``.

    Every character outside the Unicode range U+0A80-U+0AFF that is not
    whitespace is deleted, runs of whitespace are collapsed to one space,
    and leading/trailing whitespace is trimmed.

    Args:
        text: The raw text. ``None`` and float NaN (e.g. a missing cell in a
            DataFrame column) are treated as empty text; any other
            non-string value is converted with ``str()`` before cleaning.

    Returns:
        The cleaned string (possibly empty).
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        # re.sub would raise TypeError on a missing (non-string) value.
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = re.sub(r'[^\u0A80-\u0AFF\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [36]:
def load_data(csv_path, text_column='Gujarati'):
    """Read the dataset CSV and clean its text column in place.

    Args:
        csv_path: Path of a UTF-8 encoded CSV file.
        text_column: Name of the column holding the raw text. Defaults to
            ``'Gujarati'``, the same default ``preprocess_data`` uses.

    Returns:
        The loaded DataFrame with ``text_column`` replaced by the result of
        applying ``clean_text_unicode`` to every row.

    Raises:
        KeyError: If ``text_column`` is not a column of the CSV.
    """
    data = pd.read_csv(csv_path, encoding='utf-8')
    if text_column not in data.columns:
        raise KeyError(
            f"Column {text_column!r} not found in {csv_path}; "
            f"available columns: {list(data.columns)}"
        )

    # Missing cells come back as NaN (a float); turn them into empty strings
    # so the regex-based cleaner always receives a str.
    data[text_column] = (
        data[text_column].fillna('').astype(str).apply(clean_text_unicode)
    )
    return data

def preprocess_data(data, label_column='Label', text_column='Gujarati'):
    """Pull the text and label columns out of ``data``.

    Args:
        data: Object indexable by column name (a DataFrame in this notebook).
        label_column: Name of the column holding the class labels.
        text_column: Name of the column holding the text.

    Returns:
        A ``(X, y)`` tuple: ``data[text_column]`` and ``data[label_column]``.
    """
    # The text column is looked up first, then the label column.
    return data[text_column], data[label_column]

In [35]:

def get_classifiers(random_state=42):
    """Build the pool of base estimators combined into voting ensembles.

    Args:
        random_state: Seed passed to every estimator that accepts one, so
            repeated runs train the same models. The default of 42 matches
            the seed used for the train/test split and the undersampler in
            ``train_and_save_voting_models``. Pass ``None`` for unseeded
            estimators.

    Returns:
        A dict mapping a short model name to an unfitted estimator.
    """
    return {
        'LR': LogisticRegression(max_iter=1000, random_state=random_state),
        'RF': RandomForestClassifier(n_jobs=-1, random_state=random_state),
        # MultinomialNB takes no seed.
        'MNB': MultinomialNB(),
        'SVM': LinearSVC(random_state=random_state),
        'SGD': SGDClassifier(random_state=random_state),
        'XGB': xgb.XGBClassifier(tree_method='hist', n_jobs=-1, random_state=random_state)
    }

def save_model(clf, model_name, vectorizer, fold_folder):
    """Pickle a classifier and its vectorizer into ``fold_folder``.

    The folder is created if needed. Two files are written:
    ``<model_name>_model.pkl`` and ``<model_name>_vectorizer.pkl``.

    Args:
        clf: The classifier object to pickle.
        model_name: Prefix used for both file names.
        vectorizer: The feature extractor object to pickle.
        fold_folder: Destination directory.
    """
    os.makedirs(fold_folder, exist_ok=True)

    # The model is written first, then the vectorizer.
    artifacts = (
        (f'{model_name}_model.pkl', clf),
        (f'{model_name}_vectorizer.pkl', vectorizer),
    )
    for filename, payload in artifacts:
        with open(os.path.join(fold_folder, filename), 'wb') as handle:
            dump(payload, handle)

    print(f"Model and vectorizer saved at: {fold_folder}")

def save_classification_report_and_confusion_matrix(y_true, y_pred, results_folder, model_name):
    """Write a text classification report and a confusion-matrix image.

    Creates ``results_folder`` if needed, then writes
    ``<model_name>_report.txt`` and ``<model_name>_cm.png`` into it.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        results_folder: Destination directory.
        model_name: Prefix for the output file names and the plot title.
    """
    os.makedirs(results_folder, exist_ok=True)

    report_path = os.path.join(results_folder, f'{model_name}_report.txt')
    class_report = classification_report(y_true, y_pred)
    with open(report_path, 'w', encoding='utf-8') as file:
        file.write(class_report)

    # Fix the label order explicitly so the heatmap ticks show the actual
    # class labels rather than positional indices.
    labels = sorted(set(y_true).union(y_pred))
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    fig, ax = plt.subplots(figsize=(6, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', linewidths=0.5,
                xticklabels=labels, yticklabels=labels, ax=ax)
    ax.set_title(f'Confusion Matrix for {model_name}')
    # confusion_matrix puts true classes on rows and predictions on columns.
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')

    cm_path = os.path.join(results_folder, f'{model_name}_cm.png')
    fig.savefig(cm_path)
    # Close this specific figure so repeated calls do not accumulate figures.
    plt.close(fig)

def plot_class_distribution(y, save_path='/content/drive/MyDrive/AI_Project/Gujarati/EDA/class_distribution.png'):
    """Save a bar chart of how many samples each class has.

    Args:
        y: Label values to count (passed to ``sns.countplot`` as ``x``).
        save_path: Where the PNG is written. Defaults to the notebook's
            Drive EDA location. The parent directory is created if missing.
    """
    out_dir = os.path.dirname(save_path)
    if out_dir:
        # savefig does not create directories; do it here so the function
        # works without the caller preparing the folder first.
        os.makedirs(out_dir, exist_ok=True)

    fig = plt.figure(figsize=(8, 6))
    sns.countplot(x=y)
    plt.title('Class Distribution')
    plt.xlabel('Classes')
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(save_path)
    plt.close(fig)


In [34]:

def train_and_save_voting_models(X, y, base_classifiers,
                                 output_root='/content/drive/MyDrive/AI_Project/Gujarati'):
    """Train hard-voting ensembles for every 3- and 5-model combination.

    The data is split 80/20 (stratified, seed 42). Word-level and
    character-level (2-5 gram) TF-IDF features are fitted on the training
    split only, and the training split is randomly undersampled (seed 42).
    For each combination of 3 and of 5 base classifiers a
    ``VotingClassifier`` is trained, evaluated on the test split, and saved
    together with its report and confusion matrix. A summary CSV sorted by
    macro F1 is written at the end.

    Args:
        X: Text samples.
        y: Class labels aligned with ``X``.
        base_classifiers: Dict mapping a short name to an unfitted
            estimator (see ``get_classifiers``).
        output_root: Directory under which ``3m/``, ``5m/`` and the summary
            ``voting_classifier_results.csv`` are written.

    Returns:
        The summary DataFrame (one row per ensemble), sorted by macro F1
        in descending order. It is empty if there are fewer than three
        base classifiers.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y)

    # Feature extraction and undersampling do not depend on which ensemble
    # is being trained and are deterministic (fixed seed), so they are done
    # once here instead of once per combination.
    word_vect = TfidfVectorizer(analyzer='word', ngram_range=(1, 1))
    char_vect = TfidfVectorizer(analyzer='char', ngram_range=(2, 5))
    feature_union = FeatureUnion([
        ('word_tfidf', word_vect),
        ('char_tfidf', char_vect)
    ])
    X_train_tfidf_combined = feature_union.fit_transform(X_train)
    X_test_tfidf_combined = feature_union.transform(X_test)

    rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)
    X_res, y_res = rus.fit_resample(X_train_tfidf_combined, y_train)

    final_results = []

    for ensemble_size, folder_prefix in [(3, '3m'), (5, '5m')]:
        for comb in itertools.combinations(base_classifiers.items(), ensemble_size):
            models = list(comb)
            model_name = "_".join(name for name, _ in models)
            print(f"Training VotingClassifier: {model_name}")

            # The same estimator objects are passed to several ensembles;
            # VotingClassifier fits clones, so they do not share state.
            voting_clf = VotingClassifier(estimators=models, voting='hard', n_jobs=-1)
            voting_clf.fit(X_res, y_res)

            y_pred = voting_clf.predict(X_test_tfidf_combined)

            fold_folder = os.path.join(output_root, folder_prefix, model_name)
            results_folder = os.path.join(fold_folder, 'reports')
            save_model(voting_clf, model_name, feature_union, fold_folder)
            save_classification_report_and_confusion_matrix(y_test, y_pred, results_folder, model_name)

            precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='macro')
            final_results.append({
                'Feature': 'word_ngram+char_ngram',
                'Model': model_name,
                'Precision': precision,
                'Recall': recall,
                'Macro F1': f1
            })

    # Explicit columns keep sort_values from raising KeyError when no
    # ensemble was trained (fewer than three base classifiers).
    results_df = pd.DataFrame(
        final_results,
        columns=['Feature', 'Model', 'Precision', 'Recall', 'Macro F1'])
    results_df = results_df.sort_values(by='Macro F1', ascending=False)

    os.makedirs(output_root, exist_ok=True)
    results_df.to_csv(os.path.join(output_root, 'voting_classifier_results.csv'), index=False)
    print('Voting Classifier Results saved.')
    return results_df

In [33]:
def main():
    """Run the whole pipeline: load, inspect, plot, then train ensembles.

    Loads the training CSV, prints the per-class sample counts, saves the
    class-distribution plot under the Drive EDA folder, and trains and
    saves every voting ensemble.
    """
    dataset = load_data('/content/Gujarati_Query_Train.csv')
    texts, labels = preprocess_data(dataset)
    print(labels.value_counts())

    # The plot helper writes into this folder, so make sure it exists.
    os.makedirs('/content/drive/MyDrive/AI_Project/Gujarati/EDA', exist_ok=True)
    plot_class_distribution(labels)

    train_and_save_voting_models(texts, labels, get_classifiers())


In [38]:
# Run the full pipeline when this code is executed directly (not imported).
if __name__ == '__main__':
    main()

Label
6    18960
8    11614
5     4423
0     4210
1     2602
4     2039
3     2019
9     1503
7     1350
2     1280
Name: count, dtype: int64
Training VotingClassifier: LR_RF_MNB
Model and vectorizer saved at: /content/drive/MyDrive/AI_Project/Gujarati/3m/LR_RF_MNB
Training VotingClassifier: LR_RF_SVM
Model and vectorizer saved at: /content/drive/MyDrive/AI_Project/Gujarati/3m/LR_RF_SVM
Training VotingClassifier: LR_RF_SGD
Model and vectorizer saved at: /content/drive/MyDrive/AI_Project/Gujarati/3m/LR_RF_SGD
Training VotingClassifier: LR_RF_XGB
Model and vectorizer saved at: /content/drive/MyDrive/AI_Project/Gujarati/3m/LR_RF_XGB
Training VotingClassifier: LR_MNB_SVM
Model and vectorizer saved at: /content/drive/MyDrive/AI_Project/Gujarati/3m/LR_MNB_SVM
Training VotingClassifier: LR_MNB_SGD
Model and vectorizer saved at: /content/drive/MyDrive/AI_Project/Gujarati/3m/LR_MNB_SGD
Training VotingClassifier: LR_MNB_XGB
Model and vectorizer saved at: /content/drive/MyDrive/AI_Project/Gujarat