In [None]:
"""Gujarati Voting Classifier"""

from google.colab import drive
import os
import pandas as pd
import re
import itertools
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
from imblearn.under_sampling import RandomUnderSampler
import xgboost as xgb
from pickle import dump


# Mount Google Drive at /content/drive (Colab only); the models, reports and
# plots produced below are written to paths under this mount point.
drive.mount('/content/drive')


def clean_text_unicode(text):
    """Reduce *text* to Gujarati-block characters separated by single spaces.

    The value is first coerced with ``str()``. Every character that is
    neither whitespace nor inside U+0A80-U+0AFF is dropped, runs of
    whitespace are collapsed to one space, and leading/trailing whitespace
    is trimmed.
    """
    outside_gujarati = re.compile(r'[^\u0A80-\u0AFF\s]')  # Keep only Gujarati characters
    whitespace_run = re.compile(r'\s+')

    gujarati_only = outside_gujarati.sub('', str(text))
    single_spaced = whitespace_run.sub(' ', gujarati_only)
    return single_spaced.strip()


def load_data(csv_path, text_column='Gujarati'):
    """Read a UTF-8 CSV and clean its text column in place.

    Args:
        csv_path: Path of the CSV file to read.
        text_column: Name of the column holding the Gujarati text. Defaults
            to ``'Gujarati'`` so existing callers are unaffected; the
            parameter mirrors ``preprocess_data``'s ``text_column``.

    Returns:
        The loaded DataFrame with ``text_column`` replaced by the result of
        applying ``clean_text_unicode`` to every cell.

    Raises:
        KeyError: If ``text_column`` is not a column of the CSV.
    """
    data = pd.read_csv(csv_path, encoding='utf-8')
    if text_column not in data.columns:
        # Same exception type pandas would raise, but with the file and the
        # available columns spelled out.
        raise KeyError(
            f"Column {text_column!r} not found in {csv_path!r}; "
            f"available columns: {list(data.columns)}"
        )
    data[text_column] = data[text_column].apply(clean_text_unicode)
    return data

# -------------------------------
# Split features and labels
# -------------------------------
def preprocess_data(data, label_column='Label', text_column='Gujarati'):
    """Pick the text and label columns out of *data*.

    Returns:
        A ``(X, y)`` tuple where ``X`` is ``data[text_column]`` and ``y`` is
        ``data[label_column]``.
    """
    texts, labels = (data[column] for column in (text_column, label_column))
    return texts, labels


def get_classifiers():
    """Return a new dict mapping short names to base estimators.

    Each estimator is instantiated with its default arguments. The keys, in
    insertion order, are 'LR', 'RF', 'MNB', 'SVM', 'SGD' and 'XGB'.
    """
    estimator_types = (
        ('LR', LogisticRegression),
        ('RF', RandomForestClassifier),
        ('MNB', MultinomialNB),
        ('SVM', LinearSVC),
        ('SGD', SGDClassifier),
        ('XGB', xgb.XGBClassifier),
    )
    return {short_name: estimator_cls() for short_name, estimator_cls in estimator_types}


def save_model(clf, model_name, vectorizer, fold_folder):
    """Pickle a classifier and its vectorizer into *fold_folder*.

    The folder is created if needed. Two files are written:
    ``<model_name>_model.pkl`` (the classifier) and
    ``<model_name>_vectorizer.pkl`` (the vectorizer). A confirmation line is
    printed afterwards.
    """
    os.makedirs(fold_folder, exist_ok=True)

    # (file name, object) pairs, written in this order.
    artifacts = (
        (f'{model_name}_model.pkl', clf),
        (f'{model_name}_vectorizer.pkl', vectorizer),
    )
    for filename, payload in artifacts:
        with open(os.path.join(fold_folder, filename), 'wb') as handle:
            dump(payload, handle)

    print(f"Saved model and vectorizer to {fold_folder}")


def save_classification_report_and_confusion_matrix(y_true, y_pred, results_folder, model_name, class_names):
    """Write a text classification report and a confusion-matrix heatmap.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        results_folder: Directory for the outputs; created if missing.
        model_name: Prefix of the output files and part of the plot title.
        class_names: Labels used for the heatmap ticks. When they cover every
            label found in ``y_true``/``y_pred`` they also fix the row/column
            order of the matrix.

    Writes ``<model_name>_report.txt`` and ``<model_name>_cm.png`` into
    ``results_folder``.
    """
    os.makedirs(results_folder, exist_ok=True)

    report_path = os.path.join(results_folder, f'{model_name}_report.txt')
    # Explicit encoding: the report embeds the label values, and the platform
    # default encoding is not guaranteed to be UTF-8.
    with open(report_path, 'w', encoding='utf-8') as file:
        file.write(classification_report(y_true, y_pred))

    # Without ``labels`` confusion_matrix orders rows/columns by sorted label,
    # which silently disagrees with the tick labels whenever ``class_names``
    # arrives in another order. Pin the matrix to the tick order when
    # class_names are real label values; otherwise (e.g. display names) keep
    # the library's default ordering as before.
    tick_labels = list(class_names)
    observed_labels = set(y_true) | set(y_pred)
    if observed_labels.issubset(tick_labels):
        cm = confusion_matrix(y_true, y_pred, labels=tick_labels)
    else:
        cm = confusion_matrix(y_true, y_pred)

    plt.figure(figsize=(8, 6))
    try:
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', linewidths=0.5,
                    xticklabels=tick_labels, yticklabels=tick_labels)
        plt.title(f'Confusion Matrix: {model_name}')
        plt.xlabel('Predicted Labels')
        plt.ylabel('True Labels')
        plt.tight_layout()

        cm_path = os.path.join(results_folder, f'{model_name}_cm.png')
        plt.savefig(cm_path)
    finally:
        # Always release the figure, even if plotting or saving fails, so
        # repeated calls do not accumulate open figures.
        plt.close()


def plot_class_distribution(y, save_path='/content/drive/MyDrive/AI_Project/Gujarati/EDA/class_distribution.png'):
    """Save a count plot of the labels in *y*.

    Args:
        y: Label values to count (passed to ``sns.countplot`` as ``x``).
        save_path: Destination image path. Defaults to the location this
            function has always used, so existing callers are unaffected.

    The parent directory of ``save_path`` is created if it does not exist.
    """
    parent_dir = os.path.dirname(save_path)
    if parent_dir:
        # Do not rely on the caller having created the target directory.
        os.makedirs(parent_dir, exist_ok=True)

    plt.figure(figsize=(8,6))
    try:
        sns.countplot(x=y)
        plt.title('Class Distribution')
        plt.xlabel('Class Name')
        plt.ylabel('Count')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig(save_path)
    finally:
        # Release the figure even if plotting or saving raises.
        plt.close()


def train_and_save_voting_models(X, y, base_classifiers,
                                 output_root='/content/drive/MyDrive/AI_Project/Gujarati'):
    """Train hard-voting ensembles for every 3- and 5-model combination.

    The data is split 80/20 (stratified, ``random_state=42``), vectorised
    with a union of word 1-gram and char 2-5-gram TF-IDF, and the training
    part is randomly undersampled. For each combination of entries from
    ``base_classifiers`` a ``VotingClassifier`` is fitted, pickled together
    with the vectorizer, evaluated on the held-out split, and its report and
    confusion matrix are saved.

    Args:
        X: Text samples (passed to ``train_test_split`` and the vectorizers).
        y: Labels; must provide ``.unique()``.
        base_classifiers: Mapping of short name -> estimator.
        output_root: Directory under which ``3m/``, ``5m/`` and
            ``voting_classifier_results.csv`` are written. Defaults to the
            location this function has always used.

    Side effects:
        Writes model/vectorizer pickles, reports and plots below
        ``output_root`` and a CSV of micro-F1 scores sorted best first.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

    # The features and the resampled training set do not depend on which
    # estimators are combined (same data, fixed seeds), so build them once
    # instead of refitting TF-IDF for every ensemble.
    word_vect = TfidfVectorizer(analyzer='word', ngram_range=(1,1))
    char_vect = TfidfVectorizer(analyzer='char', ngram_range=(2,5))
    feature_union = FeatureUnion([
        ('word_tfidf', word_vect),
        ('char_tfidf', char_vect)
    ])
    X_train_tfidf = feature_union.fit_transform(X_train)
    X_test_tfidf = feature_union.transform(X_test)

    rus = RandomUnderSampler(random_state=42)
    X_res, y_res = rus.fit_resample(X_train_tfidf, y_train)

    # confusion_matrix orders its rows/columns by sorted label, whereas
    # y.unique() is in order of first appearance; sort so the tick names
    # handed to the plot line up with the matrix.
    class_names = sorted(y.unique())

    final_results = []

    for combo_size, folder_prefix in ((3, '3m'), (5, '5m')):
        for comb in itertools.combinations(base_classifiers.items(), combo_size):
            models = list(comb)
            model_name = "_".join(name for name, _ in models)

            print(f"Training VotingClassifier: {model_name}")

            voting_clf = VotingClassifier(estimators=models, voting='hard', n_jobs=-1)
            voting_clf.fit(X_res, y_res)
            y_pred = voting_clf.predict(X_test_tfidf)

            fold_folder = f"{output_root}/{folder_prefix}/{model_name}"
            results_folder = os.path.join(fold_folder, 'reports')
            save_model(voting_clf, model_name, feature_union, fold_folder)

            save_classification_report_and_confusion_matrix(y_test, y_pred, results_folder, model_name, class_names)

            micro_f1 = precision_recall_fscore_support(y_test, y_pred, average='micro')[2]
            final_results.append({
                'Model': model_name,
                'Feature': 'word(1gram)+char(2-5gram)',
                'Micro F1': micro_f1
            })

    # Explicit columns keep sort_values working when no combination was
    # trained (fewer than three base classifiers).
    results_df = pd.DataFrame(final_results, columns=['Model', 'Feature', 'Micro F1'])
    results_df = results_df.sort_values(by='Micro F1', ascending=False)
    results_df.to_csv(f'{output_root}/voting_classifier_results.csv', index=False)
    print("Voting classifier results saved.")


def main():
    """Run the full pipeline: load, summarise, plot, then train ensembles.

    Loads the training CSV, prints the label counts, saves the class
    distribution plot (creating the EDA folder first), and trains and saves
    the voting classifiers built from ``get_classifiers()``.
    """
    dataset = load_data('/content/Gujarati_Query_Train.csv')
    X, y = preprocess_data(dataset)

    print("Class distribution:")
    print(y.value_counts())

    eda_folder = '/content/drive/MyDrive/AI_Project/Gujarati/EDA'
    os.makedirs(eda_folder, exist_ok=True)
    plot_class_distribution(y)

    base_estimators = get_classifiers()
    train_and_save_voting_models(X, y, base_estimators)

# Run the pipeline only when this file is executed as a script (or notebook
# cell), not when it is imported as a module.
if __name__ == '__main__':
    main()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Class distribution:
Label
Plant Protection                   18959
Weather                            11614
Nutrient Management                 4423
Cultural Practices                  4210
Fertilizer Use and Availability     2602
Market Information                  2039
Government Schemes                  2019
Weed Management                     1503
Varieties                           1350
Field Preparation                   1280
Name: count, dtype: int64
Training VotingClassifier: LR_RF_MNB
Saved model and vectorizer to /content/drive/MyDrive/AI_Project/Gujarati/3m/LR_RF_MNB
Training VotingClassifier: LR_RF_SVM
Saved model and vectorizer to /content/drive/MyDrive/AI_Project/Gujarati/3m/LR_RF_SVM
Training VotingClassifier: LR_RF_SGD
Saved model and vectorizer to /content/drive/MyDrive/AI_Project/Gujarati/3m/LR_RF_SGD
Training VotingClassifier: LR_RF_XGB
Sav