In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import pickle
import pandas as pd

# Function to preprocess the CSV data
def preprocess_kaggle_data(file_path):
    # Specify the encoding to avoid UnicodeDecodeError
    df = pd.read_csv(file_path, encoding='latin1')  # Change 'latin1' to 'Windows-1252' if needed

    # Display initial rows to understand the structure
    print("Initial Dataset Preview:")
    print(df.head())

    # Rename columns for clarity (assume columns are 'QuestionText' and 'Difficulty')
    df = df.rename(columns={
        'Question': 'Question',  # Adjust as needed, 'Question' is likely correct
        'DifficultyFromQuestioner': 'Difficulty'  # Use the actual column name for difficulty
    })
    
    # Map textual difficulty levels to numeric labels
    difficulty_map = {'easy': 0, 'medium': 1, 'hard': 2}
    df['Difficulty'] = df['Difficulty'].map(difficulty_map)
    
    # Drop rows with missing values in either column
    df = df.dropna(subset=['Question', 'Difficulty'])

    # Filter out rows where 'Difficulty' mapping failed
    df = df[df['Difficulty'].notna()]

    # Display the shape and a preview of the cleaned dataset
    print("Cleaned Dataset Shape:", df.shape)
    print("Cleaned Dataset Preview:")
    print(df.head())

    return df

# Train the model
def train_model(csv_path, model_path):
    data = preprocess_kaggle_data(csv_path)
    print(data.shape)
    X_train, X_test, y_train, y_test = train_test_split(data['Question'], data['Difficulty'], test_size=0.2, random_state=42)
    print("Training set size:", len(X_train))
    print("Test set size:", len(X_test))
    
    # Create a pipeline for vectorization and classification
    model_pipeline = Pipeline([
        ('vectorizer', TfidfVectorizer()),  # Using TfidfVectorizer here
        ('classifier', LogisticRegression())  # Using LogisticRegression here
    ])
    
    # Define hyperparameter grid
    param_grid = {
        'classifier__C': [0.1, 1, 10]  # Hyperparameter tuning for Logistic Regression
    }

    # Use GridSearchCV for hyperparameter tuning
    grid_search = GridSearchCV(model_pipeline, param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    print(f"Best Parameters: {grid_search.best_params_}")

    # Use the best model from grid search
    best_model = grid_search.best_estimator_

    # Evaluate the model
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model Accuracy: {accuracy * 100:.2f}%")

    # Save the trained model
    with open(model_path, 'wb') as file:
        pickle.dump(best_model, file)

    print(f"Model saved to {model_path}")

# Run this script to train the model
if __name__ == "__main__":
    csv_file = "combined_question_answer_pairs2.csv"
    model_file = "question_classifier.pkl"
    train_model(csv_file, model_file)


Initial Dataset Preview:
       ArticleTitle                                           Question Answer  \
0  Alessandro_Volta                    Was Volta an Italian physicist?    yes   
1  Alessandro_Volta                    Was Volta an Italian physicist?    yes   
2  Alessandro_Volta         Is Volta buried in the city of Pittsburgh?     no   
3  Alessandro_Volta         Is Volta buried in the city of Pittsburgh?     no   
4  Alessandro_Volta  Did Volta have a passion for the study of elec...    yes   

  DifficultyFromQuestioner DifficultyFromAnswerer   ArticleFile Unnamed: 6  
0                     easy                   easy  S09_set4_a10        NaN  
1                     easy                   easy  S09_set4_a10        NaN  
2                     easy                   easy  S09_set4_a10        NaN  
3                     easy                   easy  S09_set4_a10        NaN  
4                     easy                 medium  S09_set4_a10        NaN  
Cleaned Dataset Shape: (30

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Parameters: {'classifier__C': 10}
Model Accuracy: 90.25%
Model saved to question_classifier.pkl


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
