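# Difficulty Level

Train a TF-IDF + linear SVM text classifier that predicts the `Difficulty Level` of a question from its text, tune it with a grid search, evaluate it on a held-out test set, and save the best model to `../Models/Difficulty_model.pkl`.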

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.exceptions import ConvergenceWarning
import warnings

In [2]:
# Read the question dataset from the CSV file (path is relative to the
# directory the notebook is run from).
DATA_CSV_PATH = '../Data/Data.csv'
df = pd.read_csv(DATA_CSV_PATH)

In [3]:
# Preview the first five rows to check the columns that were loaded.
df.head(5)

Unnamed: 0,Questions,Difficulty Level,Bloom's Taxonomy,Category,Sub Category
0,how did serfdom develop in and then leave russia,Hard,Understanding,Description,Manner
1,what films featured the character popeye doyle,Medium,Understanding,Entity,"Currency, monetary value, or gem"
2,how can i find a list of celebrities real names,Hard,Understanding,Description,Manner
3,what fowl grabs the spotlight after the chines...,Medium,Understanding,Entity,Animal
4,what is the full form of com,Easy,Remembering,Abbreviation,Expression


In [4]:
# Model input is the raw question text; the target is the difficulty label.
X, y_difficulty = df['Questions'], df['Difficulty Level']

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical between runs.
(
    X_train,
    X_test,
    y_difficulty_train,
    y_difficulty_test,
) = train_test_split(
    X,
    y_difficulty,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Define the pipeline for difficulty-level prediction: TF-IDF text features
# feeding a one-vs-rest linear SVM.
difficulty_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', OneVsRestClassifier(
        # max_iter is increased to 10000 to give the solver more iterations
        # to converge. random_state seeds the solver's internal data
        # shuffling so that re-running the notebook reproduces the same
        # fitted model (same seed as the train/test split).
        LinearSVC(max_iter=10000, random_state=42)
    )),
])

In [6]:
# Hyperparameter grid for the search. Keys use the pipeline's
# `<step>__<param>` naming; `clf__estimator__C` reaches the C parameter of
# the estimator wrapped inside the 'clf' step.
parameters = dict(
    # (1, 1) = unigrams only; (1, 2) = unigrams and bigrams together
    tfidf__ngram_range=[(1, 1), (1, 2)],
    # Candidate values for the penalty parameter C of the error term
    clf__estimator__C=[1, 10, 100],
)

In [7]:
# Evaluate every combination in `parameters` with 5-fold cross-validation
# on the training split.
grid_search = GridSearchCV(
    estimator=difficulty_pipeline,
    param_grid=parameters,
    cv=5,
)
grid_search.fit(X_train, y_difficulty_train)

In [8]:
# Get the best performing model: the pipeline configured with the
# hyperparameter combination that scored best in the grid search.
# NOTE(review): with GridSearchCV's default refit behaviour this estimator
# has been refit on the whole training split — confirm against the installed
# scikit-learn version.
best_difficulty_pipeline = grid_search.best_estimator_

In [9]:
# Report the winning hyperparameter combination (heading and dict on
# separate lines).
print("Best Hyperparameters:", grid_search.best_params_, sep="\n")

Best Hyperparameters:
{'clf__estimator__C': 1, 'tfidf__ngram_range': (1, 2)}


In [10]:
# Predict difficulty labels for the held-out test questions
y_difficulty_pred = best_difficulty_pipeline.predict(X_test)

# Score the predictions against the true labels and report accuracy as a
# percentage with two decimals
difficulty_accuracy = accuracy_score(
    y_true=y_difficulty_test,
    y_pred=y_difficulty_pred,
)
accuracy_message = "Accuracy for difficulty prediction: {:.2f}%"
print(accuracy_message.format(difficulty_accuracy * 100))

Accuracy for difficulty prediction: 92.39%


# Testing

In [11]:
# Spot-check the tuned pipeline on a question typed in by hand
new_question = "Who painted the Mona Lisa?"

# predict() takes a collection of documents, so the single question is
# wrapped in a one-element list; the result holds one label per input.
question_batch = [new_question]
predicted_difficulty = best_difficulty_pipeline.predict(question_batch)

print("Predicted difficulty:", predicted_difficulty[0])

Predicted difficulty: Easy


In [12]:
import os
import pickle

# Define the path to the "Models" folder
models_folder = '../Models'

# Create the "Models" folder if it doesn't exist. exist_ok=True makes this a
# single idempotent call, so there is no window between checking for the
# folder and creating it, and re-running the cell is harmless.
os.makedirs(models_folder, exist_ok=True)

# Save the tuned pipeline (vectorizer + classifier) to a file inside the
# "Models" folder.
# NOTE: unpickling can execute arbitrary code — only load this file back
# from a location you trust.
model_filename = os.path.join(models_folder, 'Difficulty_model.pkl')
with open(model_filename, 'wb') as file:
    pickle.dump(best_difficulty_pipeline, file)