In [63]:
import os
import json
import numpy as np
import pandas as pd
import pickle
import re
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack
# Load the problem dataset from a JSON Lines file (one JSON object per line)
# into a list of dicts, then into a DataFrame.
file_path="data/problems_data.jsonl"
data=[]
with open(file_path,"r", encoding="utf-8") as f:
    for line in f:
        line=line.strip()
        # A blank line is not valid JSON; skip it instead of letting
        # json.loads raise JSONDecodeError.
        if not line:
            continue
        data.append(json.loads(line))
# Fail with a clear message rather than an IndexError on data[0] below.
if not data:
    raise ValueError(f"No records found in {file_path}")
print(f"Total data points: {len(data)}")
print("Sample data:", data[0].keys())
df=pd.DataFrame(data)
print("Data loaded successfully!")
df.head()

Total data points: 4112
Sample data: dict_keys(['title', 'description', 'input_description', 'output_description', 'sample_io', 'problem_class', 'problem_score', 'url'])
Data loaded successfully!


Unnamed: 0,title,description,input_description,output_description,sample_io,problem_class,problem_score,url
0,Uuu,Unununium (Uuu) was the name of the chemical\n...,The input consists of one line with two intege...,The output consists of $M$ lines where the $i$...,"[{'input': '7 10', 'output': '1 2 2 3 1 3 3 4 ...",hard,9.7,https://open.kattis.com/problems/uuu
1,House Building,A number of eccentrics from central New York h...,"The input consists of $10$ test cases, which a...",Print $K$ lines with\n the positions of the...,"[{'input': '0 2 3 2 50 60 50 30 50 40', 'outpu...",hard,9.7,https://open.kattis.com/problems/husbygge
2,Mario or Luigi,Mario and Luigi are playing a game where they ...,,,"[{'input': '', 'output': ''}]",hard,9.6,https://open.kattis.com/problems/marioorluigi
3,The Wire Ghost,Žofka is bending a copper wire. She starts wit...,The first line contains two integers $L$ and $...,The output consists of a single line consistin...,"[{'input': '4 3 3 C 2 C 1 C', 'output': 'GHOST...",hard,9.6,https://open.kattis.com/problems/thewireghost
4,Barking Up The Wrong Tree,"Your dog Spot is let loose in the park. Well, ...",The first line of input consists of two intege...,Write a single line containing the length need...,"[{'input': '2 0 10 0 10 10', 'output': '14.14'...",hard,9.6,https://open.kattis.com/problems/barktree


In [64]:
# Display the second raw record (a dict) to inspect its fields and values.
data[1]

{'title': 'House Building',
 'description': 'A number of eccentrics from central New York have decided\n    that they have had enough of modern society, and want to move\n    from there. Together they have bought a rectangular piece of\n    land far away, and will now settle there.\nThe land consists of $N \\times\n    M$ squares, and it is possible to build a maximum of one\n    house on a given square. Each square has value $a_{x,y}$ that describes how nice it\n    is, on a scale between $0$\n    and $100$.\nThe goal of the eccentrics is to get as far away as possible\n    from everyone else, including each other. The happiness an\n    eccentric experiences from building his house on square\n    $(x,y)$ is thus\n    $a_{x,y}\\cdot d$, where\n    $d$ is the smallest\n    distance to another person.\nOut of habit, the eccentrics use Manhattan\n    distance to measure this; $d$ is defined as $\\min |x - x_2| + |y - y_2|$ over all\n    other people’s squares $(x_2,\n    y_2)$.\nThe eccen

In [65]:
# Show the field names of the first raw record.
data[0].keys()

dict_keys(['title', 'description', 'input_description', 'output_description', 'sample_io', 'problem_class', 'problem_score', 'url'])

In [66]:
# Rebuild the DataFrame from the raw records and preview the first rows.
# NOTE(review): this repeats the construction already done in the loading cell;
# re-running it resets `df`, dropping any columns added to it afterwards.
df=pd.DataFrame(data)
df.head()

Unnamed: 0,title,description,input_description,output_description,sample_io,problem_class,problem_score,url
0,Uuu,Unununium (Uuu) was the name of the chemical\n...,The input consists of one line with two intege...,The output consists of $M$ lines where the $i$...,"[{'input': '7 10', 'output': '1 2 2 3 1 3 3 4 ...",hard,9.7,https://open.kattis.com/problems/uuu
1,House Building,A number of eccentrics from central New York h...,"The input consists of $10$ test cases, which a...",Print $K$ lines with\n the positions of the...,"[{'input': '0 2 3 2 50 60 50 30 50 40', 'outpu...",hard,9.7,https://open.kattis.com/problems/husbygge
2,Mario or Luigi,Mario and Luigi are playing a game where they ...,,,"[{'input': '', 'output': ''}]",hard,9.6,https://open.kattis.com/problems/marioorluigi
3,The Wire Ghost,Žofka is bending a copper wire. She starts wit...,The first line contains two integers $L$ and $...,The output consists of a single line consistin...,"[{'input': '4 3 3 C 2 C 1 C', 'output': 'GHOST...",hard,9.6,https://open.kattis.com/problems/thewireghost
4,Barking Up The Wrong Tree,"Your dog Spot is let loose in the park. Well, ...",The first line of input consists of two intege...,Write a single line containing the length need...,"[{'input': '2 0 10 0 10 10', 'output': '14.14'...",hard,9.6,https://open.kattis.com/problems/barktree


In [84]:
# Summarise the classification target: dataset size, number of distinct
# classes, absolute class counts and relative class proportions.
target_col = "problem_class"

print("=== DATA ANALYSIS ===")
print("Total samples: ", df.shape[0])
print("Number of unique classes: ", df[target_col].nunique())

print("\nClass distribution:")
class_counts = df[target_col].value_counts()
print(class_counts)

print("\nClass proportions:")
class_proportions = df[target_col].value_counts(normalize=True).round(3)
print(class_proportions)


=== DATA ANALYSIS ===
Total samples:  4112
Number of unique classes:  3

Class distribution:
problem_class
hard      1941
medium    1405
easy       766
Name: count, dtype: int64

Class proportions:
problem_class
hard      0.472
medium    0.342
easy      0.186
Name: proportion, dtype: float64


In [85]:
# Report column dtypes and per-column null counts, replace every missing
# value with an empty string, then report the null counts again.
print("\nData types:\n", df.dtypes)

missing_before = df.isna().sum()
print("\nMissing values before cleaning:\n", missing_before)

df = df.fillna("")

missing_after = df.isna().sum()
print("\nMissing values after cleaning:\n", missing_after)


Data types:
 title                  object
description            object
input_description      object
output_description     object
sample_io              object
problem_class          object
problem_score         float64
url                    object
text                   object
dtype: object

Missing values before cleaning:
 title                 0
description           0
input_description     0
output_description    0
sample_io             0
problem_class         0
problem_score         0
url                   0
text                  0
dtype: int64

Missing values after cleaning:
 title                 0
description           0
input_description     0
output_description    0
sample_io             0
problem_class         0
problem_score         0
url                   0
text                  0
dtype: int64


In [86]:
# Text preprocessing: lowercase, replace non-letter characters with spaces, collapse whitespace, and drop words of two characters or fewer.
def text_preprocessing(text):
    """Normalise raw text into a space-separated string of lowercase words.

    Steps:
      1. Lowercase the text.
      2. Replace every character that is not an ASCII letter or whitespace
         with a space (digits, punctuation, LaTeX markup and non-ASCII
         letters are all removed).
      3. Split on whitespace and keep only words longer than two characters.

    Args:
        text: The text to clean. ``None`` and float NaN are treated as empty
            text; any other non-string value is converted with ``str()``.

    Returns:
        The cleaned text, with words joined by single spaces ("" if no word
        survives).
    """
    # Missing values would otherwise crash on .lower() (None) or be turned
    # into the literal token "nan" (float NaN).
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    # str.split() with no argument already collapses whitespace runs and
    # ignores leading/trailing whitespace.
    return ' '.join(word for word in text.split() if len(word) > 2)

# Hand-crafted features: text-length statistics plus keyword flags related to competitive programming.
def extract_features(df_input):
    """Build hand-crafted numeric features from the ``text`` column.

    Args:
        df_input: DataFrame with a string column named ``text``.

    Returns:
        A DataFrame sharing ``df_input``'s index, with columns:
          - ``text_length``: number of characters in the text.
          - ``word_count``: number of whitespace-separated words.
          - ``avg_word_length``: ``text_length / word_count`` (the numerator
            includes separator spaces); 0 when there are no words.
          - ``has_algorithm_keywords``, ``has_data_structures``,
            ``has_math_keywords``: 1 if the text contains any of the listed
            keywords, else 0. Matching is a case-insensitive regex search,
            so keywords also match inside longer words.
        Any remaining missing values are filled with 0.
    """
    text = df_input['text']
    features_df = pd.DataFrame(index=df_input.index)
    features_df['text_length'] = text.str.len()
    features_df['word_count'] = text.str.split().str.len()

    # A zero word count with a non-zero length (whitespace-only text) would
    # produce inf, which fillna(0) does not catch. Mask zero counts so the
    # ratio becomes NaN and is filled with 0 below.
    word_count = features_df['word_count']
    safe_word_count = word_count.where(word_count > 0)
    features_df['avg_word_length'] = features_df['text_length'] / safe_word_count

    keyword_patterns = {
        'has_algorithm_keywords':
            'algorithm|complexity|optimization|dynamic|recursive|greedy|divide|conquer',
        'has_data_structures':
            'array|tree|graph|stack|queue|heap|linked|list|hash|map',
        'has_math_keywords':
            'matrix|probability|combinatorics|number|prime|fibonacci|factorial',
    }
    for column, pattern in keyword_patterns.items():
        features_df[column] = text.str.contains(
            pattern, case=False, na=False
        ).astype(int)

    return features_df.fillna(0)

In [87]:
# Concatenate every free-text field (each cast to str, separated by a single
# space) into one document per problem, clean it, then derive the
# hand-crafted features from the cleaned text.
text_columns = ["title", "description", "input_description",
                "output_description", "sample_io"]
combined = df[text_columns[0]].astype(str)
for column in text_columns[1:]:
    combined = combined + " " + df[column].astype(str)

df["text"] = combined
df["text"] = df["text"].map(text_preprocessing)

add_features = extract_features(df)
print("Additional extracted features:\n", add_features)
display(df["text"])

Additional extracted features:
       text_length  word_count  avg_word_length  has_algorithm_keywords  \
0            1310         215         6.093023                       1   
1            1072         175         6.125714                       0   
2             965         160         6.031250                       0   
3            1121         186         6.026882                       0   
4            1775         307         5.781759                       0   
...           ...         ...              ...                     ...   
4107          366          55         6.654545                       0   
4108          136          17         8.000000                       0   
4109          201          24         8.375000                       0   
4110          439          77         5.701299                       0   
4111         1834         287         6.390244                       0   

      has_data_structures  has_math_keywords  
0                       1       

0       uuu unununium uuu was the name the chemical el...
1       house building number eccentrics from central ...
2       mario luigi mario and luigi are playing game w...
3       the wire ghost ofka bending copper wire she st...
4       barking the wrong tree your dog spot let loose...
                              ...                        
4107    lvunarfr ingar telja computer scientists count...
4108    velkomin welcome forritunarkeppni framhaldssk ...
4109    til hamingju there input this problem print si...
4110    hipp hipp there input this problem print lines...
4111    advanced causal measurements causality very im...
Name: text, Length: 4112, dtype: object

In [71]:
# Show the number of problems in each difficulty class.
df["problem_class"].value_counts()

problem_class
hard      1941
medium    1405
easy       766
Name: count, dtype: int64

In [88]:
# Features and the two targets: difficulty class (classification) and
# difficulty score (regression).
X=df["text"]
y_class=df["problem_class"]
y_score=df["problem_score"]

# Split the features and BOTH targets in a single call so every train/test row
# keeps its matching class label and score. Splitting the score separately
# without `stratify` selects different rows, so a regressor fitted on features
# derived from X_train would be paired with the scores of other problems.
(X_train, X_test,
 y_class_train, y_class_test,
 y_score_train, y_score_test) = train_test_split(
    X, y_class, y_score, test_size=0.2, random_state=42, stratify=y_class
)
# Kept so code that refers to the regression-specific names still works; the
# regression task uses the same rows as the classification task.
X_train_reg, X_test_reg = X_train, X_test

# Select the hand-crafted feature rows by index label so they line up with
# the text split.
add_features_train = add_features.loc[X_train.index]
add_features_test = add_features.loc[X_test.index]

# Guard against any future misalignment between features and targets.
assert y_class_train.index.equals(X_train.index)
assert y_score_train.index.equals(X_train.index)
assert y_score_test.index.equals(X_test.index)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")
print("Train class distribution:")
print(y_class_train.value_counts())


Training set size: 3289
Test set size: 823
Train class distribution:
problem_class
hard      1552
medium    1124
easy       613
Name: count, dtype: int64


In [91]:
# TF-IDF over unigrams and bigrams of the cleaned text. The vectoriser is
# fitted on the training texts only and then applied to the test texts.
tfidf_params = dict(
    max_features=10000,
    min_df=2,
    max_df=0.95,
    ngram_range=(1, 2),
    stop_words="english",
    sublinear_tf=True,
)
vectorizer = TfidfVectorizer(**tfidf_params)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Standardise the hand-crafted features, again fitting on the training rows only.
scaler = StandardScaler()
add_train_scaled = scaler.fit_transform(add_features_train)
add_test_scaled = scaler.transform(add_features_test)

# Final design matrices: sparse TF-IDF columns followed by the scaled
# hand-crafted feature columns.
X_train_combined = hstack([X_train_tfidf, add_train_scaled])
X_test_combined = hstack([X_test_tfidf, add_test_scaled])

for label, matrix in (
    ("TF-IDF features", X_train_tfidf),
    ("Additional features", add_train_scaled),
    ("Combined features", X_train_combined),
):
    print(f"{label} shape: {matrix.shape}")


TF-IDF features shape: (3289, 10000)
Additional features shape: (3289, 6)
Combined features shape: (3289, 10006)


In [98]:
# Fit a logistic-regression classifier for the difficulty class on the
# combined features and evaluate it on the held-out test rows.
clf = LogisticRegression(max_iter=500).fit(X_train_combined, y_class_train)
y_class_pred = clf.predict(X_test_combined)

test_accuracy = accuracy_score(y_class_test, y_class_pred)
report_text = classification_report(y_class_test, y_class_pred)
conf_matrix = confusion_matrix(y_class_test, y_class_pred)

print("Accuracy: ", test_accuracy)
print("\nClassification Report:\n", report_text)
print("\nConfusion Matrix:\n", conf_matrix)

Accuracy:  0.5151883353584447

Classification Report:
               precision    recall  f1-score   support

        easy       0.61      0.33      0.43       153
        hard       0.55      0.79      0.65       389
      medium       0.37      0.23      0.28       281

    accuracy                           0.52       823
   macro avg       0.51      0.45      0.45       823
weighted avg       0.50      0.52      0.48       823


Confusion Matrix:
 [[ 51  57  45]
 [ 13 308  68]
 [ 19 197  65]]


In [1]:
# Fit a ridge regressor for the difficulty score on the combined features and
# report MAE, RMSE and R2 on the held-out test rows.
# Requires the import cell and the feature-building cells to have been run in
# the current kernel. y_score_train / y_score_test must be row-aligned with
# X_train_combined / X_test_combined.
ridge = Ridge(alpha=1.0).fit(X_train_combined, y_score_train)
y_score_pred = ridge.predict(X_test_combined)

mae = mean_absolute_error(y_score_test, y_score_pred)
rmse = np.sqrt(mean_squared_error(y_score_test, y_score_pred))
r2 = r2_score(y_score_test, y_score_pred)

for metric_name, metric_value in (("MAE", mae), ("RMSE", rmse), ("R2", r2)):
    print(f"{metric_name}:", metric_value)



NameError: name 'Ridge' is not defined

In [102]:
# Persist the fitted preprocessing objects and models for later inference.
# Create the output directory first so open() does not fail on a fresh checkout.
os.makedirs("models", exist_ok=True)

artifacts = {
    "models/tfidf_vectorizer.pkl": vectorizer,
    "models/probclass_model.pkl": clf,
    "models/probscore_model.pkl": ridge,
    "models/feature_scaler.pkl": scaler,
}
for artifact_path, artifact in artifacts.items():
    # The context manager closes (and therefore flushes) each file handle;
    # a bare open() passed to pickle.dump leaves the handle open.
    with open(artifact_path, "wb") as artifact_file:
        pickle.dump(artifact, artifact_file)