In [28]:
!pip install pandas
!pip install numpy
!pip install xgboost imbalanced-learn

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [29]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE


In [30]:
file_path = 'Dataset/raw_exam_data.csv'
df = pd.read_csv(file_path)
df

Unnamed: 0,Question_Text,Subject_Domain,Topic_Subdomain,Bloom_Taxonomy,Historical_Pass_Rate,Difficulty_Level
0,Outline Quantum Physics for a specific case.,Physics,Quantum Physics,Apply,63.2,Medium
1,Define Mechanics according to basic principles?,Physics,Mechanics,Remember,90.3,Easy
2,Outline Optics in a standard system.,Physics,Optics,Apply,63.4,Medium
3,Describe Networking in a standard system.,Computer Science,Networking,Apply,53.7,Medium
4,Derive the formula for Probability considering...,Mathematics,Probability,Evaluate,17.3,Hard
...,...,...,...,...,...,...
6195,Outline Relativity in a standard system.,Physics,Relativity,Apply,68.9,Medium
6196,Propose a solution for Statistics with mathema...,Mathematics,Statistics,Evaluate,59.5,Medium
6197,Which of these is Optics,Physics,Optics,Remember,86.9,Easy
6198,Identify Database briefly?,Computer Science,Database,Remember,75.4,Easy


In [31]:

import re


df = df.dropna(subset=['Question_Text', 'Difficulty_Level'])


def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'<[^>]+>', '', text) 
    text = re.sub(r'[^\w\s]', '', text) 
    return text

df['Question_Text'] = df['Question_Text'].apply(clean_text)



final_df = df[['Question_Text', 'Difficulty_Level']].drop_duplicates()


final_df = final_df[final_df['Question_Text'].str.strip() != '']

print(f"Dataframe shape after preprocessing: {final_df.shape}")
final_df.head()


Dataframe shape after preprocessing: (1996, 2)


Unnamed: 0,Question_Text,Difficulty_Level
0,outline quantum physics for a specific case,Medium
1,define mechanics according to basic principles,Easy
2,outline optics in a standard system,Medium
3,describe networking in a standard system,Medium
4,derive the formula for probability considering...,Hard


In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MaxAbsScaler, LabelEncoder
from scipy.sparse import hstack

final_df['Word_Count'] = final_df['Question_Text'].apply(lambda x: len(x.split()))
final_df['Char_Length'] = final_df['Question_Text'].apply(lambda x: len(x))

X_text = final_df['Question_Text']
X_num = final_df[['Word_Count', 'Char_Length']]

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(final_df['Difficulty_Level'])

scaler = MaxAbsScaler()
X_num_scaled = scaler.fit_transform(X_num)

vectorizer = TfidfVectorizer(max_features=2500, stop_words='english', ngram_range=(1, 2))
X_text_tfidf = vectorizer.fit_transform(X_text)

X = hstack([X_text_tfidf, X_num_scaled])

print("Feature Matrix X shape:", X.shape)
print("Target y shape:", y.shape)
print("Class mapping:", dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))


Feature Matrix X shape: (1996, 539)
Target y shape: (1996,)
Class mapping: {'Easy': np.int64(0), 'Hard': np.int64(1), 'Medium': np.int64(2)}


In [33]:
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
import warnings

warnings.filterwarnings('ignore')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
}

best_model = None
best_acc = 0
best_model_name = ""

for name, model in models.items():
    model.fit(X_train_res, y_train_res)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    
    print(f"\n--- {name} ---")
    print(f"Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))
    
    if acc > best_acc:
        best_acc = acc
        best_model = model
        best_model_name = name

print(f"\nüèÜ Best Model automatically selected: {best_model_name} with Accuracy = {best_acc:.4f}")

os.makedirs('artifacts', exist_ok=True)
joblib.dump(vectorizer, 'artifacts/vectorizer.pkl')
joblib.dump(scaler, 'artifacts/scaler.pkl')
joblib.dump(label_encoder, 'artifacts/label_encoder.pkl')
joblib.dump(best_model, 'artifacts/best_model.pkl')
print("‚úÖ Successfully exported vectorizer, scaler, encoder, and Best Model to the 'artifacts/' folder.")



--- Logistic Regression ---
Accuracy: 0.7100
              precision    recall  f1-score   support

        Easy       0.81      0.80      0.80       120
        Hard       0.64      0.82      0.72       109
      Medium       0.69      0.58      0.63       171

    accuracy                           0.71       400
   macro avg       0.71      0.73      0.72       400
weighted avg       0.71      0.71      0.71       400


--- Random Forest ---
Accuracy: 0.5175
              precision    recall  f1-score   support

        Easy       0.66      0.69      0.68       120
        Hard       0.48      0.47      0.47       109
      Medium       0.43      0.43      0.43       171

    accuracy                           0.52       400
   macro avg       0.53      0.53      0.53       400
weighted avg       0.51      0.52      0.52       400


--- XGBoost ---
Accuracy: 0.5450
              precision    recall  f1-score   support

        Easy       0.68      0.73      0.71       120
        H

In [36]:
%%writefile app.py
import streamlit as st
import joblib
import pandas as pd
import numpy as np
import re
from scipy.sparse import hstack
import os

st.set_page_config(
    page_title="Intelligent Exam Question Analyzer", 
    page_icon="üéì", 
    layout="centered"
)

st.markdown("""
<style>
    /* Styling the main container */
    .block-container {
        padding-top: 3rem;
        padding-bottom: 3rem;
    }
    
    /* Input Text Area Styling */
    .stTextArea textarea {
        background-color: #ffffff !important;
        color: #1f2937 !important;
        border-radius: 12px;
        border: 2px solid #e0e0e0;
        box-shadow: 0 4px 6px rgba(0,0,0,0.05);
        font-size: 16px;
        padding: 15px;
        transition: all 0.3s ease;
                cursor: text;
        caret-color: #1f2937 !important;

    }
    .stTextArea textarea::placeholder {
        color: #a0aec0 !important;
    }
    .stTextArea textarea:focus {
        border-color: #4CAF50 !important;
        box-shadow: 0 4px 12px rgba(76, 175, 80, 0.2) !important;
    }
    
    /* Button Styling */
    .stButton>button {
        width: 100%;
        border-radius: 12px;
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
        color: white !important;
        font-weight: 600;
        font-size: 18px;
        padding: 12px 24px;
        border: none;
        transition: all 0.3s ease;
        box-shadow: 0 4px 15px rgba(0,0,0,0.1);
    }
    .stButton>button:hover {
        transform: translateY(-2px);
        box-shadow: 0 6px 20px rgba(0,0,0,0.15);
    }
    .stButton>button:active {
        transform: translateY(0);
    }
    
    /* Result Widget Styling */
    .result-widget {
        background: #ffffff !important;
        padding: 30px;
        border-radius: 20px;
        box-shadow: 0 10px 30px rgba(0,0,0,0.08);
        text-align: center;
        margin-top: 30px;
        margin-bottom: 20px;
        animation: fadeIn 0.5s ease-out;
    }
    
    @keyframes fadeIn {
        from { opacity: 0; transform: translateY(10px); }
        to { opacity: 1; transform: translateY(0); }
    }
    
    /* Difficulty Colors */
    .diff-Easy { color: #10b981 !important; }
    .diff-Medium { color: #f59e0b !important; }
    .diff-Hard { color: #ef4444 !important; }
    
    /* Fix for text colors in the white widget */
    .result-widget p {
        color: #718096 !important; 
    }
    
    /* Subheaders */
    h1 {
        font-weight: 800;
        text-align: center;
        margin-bottom: 5px;
    }
    .subtitle {
        text-align: center;
        color: #a0aec0 !important;
        font-size: 18px;
        margin-bottom: 40px;
    }
</style>
""", unsafe_allow_html=True)

@st.cache_resource
def load_artifacts():
    try:
        vectorizer = joblib.load('artifacts/vectorizer.pkl')
        scaler = joblib.load('artifacts/scaler.pkl')
        label_encoder = joblib.load('artifacts/label_encoder.pkl')
        best_model = joblib.load('artifacts/best_model.pkl')
        return vectorizer, scaler, label_encoder, best_model
    except Exception as e:
        st.error(f"Error loading models: {str(e)}")
        return None, None, None, None

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'<[^>]+>', '', text) 
    text = re.sub(r'[^\w\s]', '', text) 
    return text

st.markdown("<h1>üéì Question Difficulty Analyzer</h1>", unsafe_allow_html=True)
st.markdown("<p class='subtitle'>Powered by Classical Machine Learning</p>", unsafe_allow_html=True)

vectorizer, scaler, label_encoder, best_model = load_artifacts()

if best_model is None:
    st.error("Model artifacts not found. Please ensure that the 'artifacts' folder exists with vectorizer.pkl, scaler.pkl, label_encoder.pkl, and best_model.pkl.")
else:
    question = st.text_area(
        "", 
        height=160, 
        placeholder="Type or paste your exam question here...\n\ne.g. Synthesize asymptotic complexity with mathematical proofs."
    )
    
    if st.button("Analyze Question"):
        if not question.strip():
            st.warning("‚ö†Ô∏è Please enter a question to analyze.")
        else:
            with st.spinner("Analyzing complexity and patterns..."):
                cleaned_text = clean_text(question)
                word_count = len(cleaned_text.split())
                char_length = len(cleaned_text)
                
                X_text_tfidf = vectorizer.transform([cleaned_text])
                X_num_scaled = scaler.transform([[word_count, char_length]])
                X_final = hstack([X_text_tfidf, X_num_scaled])
                
                pred_idx = best_model.predict(X_final)[0]
                difficulty = label_encoder.inverse_transform([pred_idx])[0]
                
                conf_text = ""
                if hasattr(best_model, "predict_proba"):
                    probs = best_model.predict_proba(X_final)[0]
                    confidence = np.max(probs) * 100
                    conf_text = f"Confidence Score: {confidence:.1f}%"
                
                emoji_map = {"Easy": "üü¢", "Medium": "üü†", "Hard": "üî¥"}
                emoji = emoji_map.get(difficulty, "‚ö™")
                
                st.markdown(f"""
                <div class="result-widget">
                    <p style="color: #718096; font-size: 14px; text-transform: uppercase; letter-spacing: 1px; margin-bottom: 5px;">Predicted Difficulty</p>
                    <h2 class="diff-{difficulty}" style="font-size: 48px; margin: 10px 0;">{emoji} {difficulty}</h2>
                    <p style="color: #a0aec0; font-size: 16px; font-weight: 500;">{conf_text}</p>
                </div>
                """, unsafe_allow_html=True)
                
                st.markdown("### üìä Text Statistics")
                col1, col2 = st.columns(2)
                
                with col1:
                    st.metric("Total Words", word_count)
                
                with col2:
                    st.metric("Total Characters", char_length)


Overwriting app.py


In [None]:
!streamlit run app.py


[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://10.7.21.148:8501[0m
[0m
[34m[1m  For better performance, install the Watchdog module:[0m

  $ xcode-select --install
  $ pip install watchdog
            [0m
2026-02-28 20:34:31.475 `label` got an empty value. This is discouraged for accessibility reasons and may be disallowed in the future by raising an exception. Please provide a non-empty label and hide it with label_visibility if needed.
Stack (most recent call last):
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/threading.py", line 930, in _bootstrap
    self._bootstrap_inner()
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/threading.py", line 973, in _bootstrap_inner
    self.run()
  File "/Library/Developer/CommandLineTools/Library/Framewor