In [2]:
import logging
import sys
import numpy as np
import pandas as pd
import os
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# ✅ Import the clean_text function from external module
from text_utils import clean_text


def load_data(filepath: str, question_col: str, label_col: str):
    """
    Load questions and labels from an Excel file and clean the question text.

    Rows with a missing question or a missing label are dropped: a missing
    question would otherwise become the literal text "nan" through
    ``astype(str)``, and a missing label cannot serve as a training target.

    Args:
        filepath: Path of the Excel workbook to read.
        question_col: Name of the column holding the question text.
        label_col: Name of the column holding the labels.

    Returns:
        A ``(questions, labels)`` tuple of pandas Series that share one index.
        The questions have been passed through ``clean_text``.

    Raises:
        SystemExit: With status 1, after logging an error, when the file
            cannot be read or when either column is absent. Errors raised by
            ``clean_text`` itself are not intercepted and propagate as-is.
    """
    # Keep the try body to the read alone so that "Error reading the Excel
    # file" is only ever logged for an actual read failure.
    try:
        df = pd.read_excel(filepath)
    except Exception as e:
        logging.error("Error reading the Excel file: %s", e)
        sys.exit(1)

    missing_cols = [col for col in (question_col, label_col) if col not in df.columns]
    if missing_cols:
        logging.error(
            "Columns '%s' and '%s' must be present; missing: %s",
            question_col, label_col, missing_cols,
        )
        sys.exit(1)

    label_missing = df[label_col].isnull()
    if label_missing.any():
        logging.warning("There are %d missing labels.", label_missing.sum())
    else:
        logging.info("All questions have labels.")

    # Detect missing questions BEFORE astype(str), which would hide them as "nan".
    unusable = label_missing | df[question_col].isnull()
    if unusable.any():
        logging.warning("Dropping %d rows with a missing question or label.", unusable.sum())
        df = df.loc[~unusable]

    questions = df[question_col].astype(str).apply(clean_text)
    logging.info("Sample cleaned questions:\n%s", questions.head().to_string())
    return questions, df[label_col]


def create_pipeline(model):
    """
    Build a two-step pipeline: a TF-IDF vectorizer followed by the given classifier.

    The vectorizer removes English stop words and uses ``clean_text`` as its
    preprocessor. The classifier is registered under the step name 'clf' and
    the vectorizer under 'tfidf'.
    """
    vectorizer = TfidfVectorizer(preprocessor=clean_text, stop_words='english')
    steps = [('tfidf', vectorizer), ('clf', model)]
    return Pipeline(steps)


def evaluate_model(pipeline, X, y, cv=5) -> float:
    """
    Cross-validate a pipeline on (X, y) and return its mean accuracy.

    ``cv`` is forwarded to ``cross_val_score``; scoring is 'accuracy' and the
    folds are evaluated with ``n_jobs=-1``.
    """
    fold_accuracies = cross_val_score(
        pipeline,
        X,
        y,
        scoring='accuracy',
        cv=cv,
        n_jobs=-1,
    )
    return fold_accuracies.mean()


def main():
    """
    Run the full workflow: load the data, compare candidate models with
    cross-validation, tune the best one with a grid search, report its
    accuracy on the held-out test split, and save it with joblib.
    """
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

    # Input location and column names
    filepath = r"C:\Users\Wangari Kimani\Downloads\sample questions.xlsx"
    question_col, label_col = 'Question', 'Question Type'

    logging.info("Loading data from %s", filepath)
    X, y = load_data(filepath, question_col, label_col)
    print("Data loaded successfully!")

    # Hold out 20% of the rows for the final evaluation
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Candidate classifiers, keyed by display name
    models = {
        'Logistic Regression': LogisticRegression(max_iter=1000),
        'SVM': LinearSVC(max_iter=1000),
        'Random Forest': RandomForestClassifier(n_estimators=100),
        'Naive Bayes': MultinomialNB(),
    }

    # Search grid for each candidate; 'clf__' addresses the pipeline's classifier step
    param_grids = {
        'Logistic Regression': {'clf__C': [0.1, 1, 10]},
        'SVM': {'clf__C': [0.1, 1, 10]},
        'Random Forest': {'clf__n_estimators': [50, 100, 200]},
        'Naive Bayes': {'clf__alpha': [0.5, 1.0, 1.5]},
    }

    # Score every candidate with cross-validation on the training split
    logging.info("Starting base model evaluation using cross-validation.")
    base_results = {}
    for name, candidate in models.items():
        logging.info("Evaluating base model: %s", name)
        cv_accuracy = evaluate_model(create_pipeline(candidate), X_train, y_train)
        base_results[name] = cv_accuracy
        logging.info("Base %s Cross-Validation Accuracy: %.4f", name, cv_accuracy)

    print("Base model evaluation results:")
    for label, cv_accuracy in base_results.items():
        print(f"{label}: {cv_accuracy:.4f}")

    # Pick the candidate with the highest mean accuracy (first one wins a tie)
    best_model_name, _ = max(base_results.items(), key=lambda entry: entry[1])
    best_model = models[best_model_name]
    print("Best base model:", best_model_name)

    param_grid = param_grids.get(best_model_name, {})
    pipeline = create_pipeline(best_model)

    if not param_grid:
        # Nothing to tune: fit the plain pipeline on the training split
        best_pipeline = pipeline
        best_pipeline.fit(X_train, y_train)
    else:
        logging.info("Starting hyperparameter tuning for %s using GridSearchCV.", best_model_name)
        grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        tuned_score = grid_search.best_score_
        best_params = grid_search.best_params_
        logging.info("Best hyperparameters for %s: %s", best_model_name, best_params)
        print("Best hyperparameters:", best_params)
        print(f"CV Accuracy after tuning: {tuned_score:.4f}")
        best_pipeline = grid_search.best_estimator_

    # Score the chosen pipeline on the held-out test split
    y_pred = best_pipeline.predict(X_test)
    print("Test Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))

    # Persist the fitted pipeline and report where it was written
    model_path = 'best_model.pkl'
    joblib.dump(best_pipeline, model_path)
    saved_to = os.path.abspath(model_path)
    logging.info("Best model saved to %s", saved_to)
    print("Best model saved to", saved_to)


# Run the workflow only when this file is executed directly, not when it is imported.
if __name__ == '__main__':
    main()

ModuleNotFoundError: No module named 'text_utils'