In [38]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/acm-new-project/problems_data.jsonl.txt


**Loading Data**

In [39]:
import pandas as pd
import json

data = []

with open("/kaggle/input/acm-new-project/problems_data.jsonl.txt", "r", encoding="utf-8") as f:
    for line in f:
        data.append(json.loads(line))

df = pd.DataFrame(data)
df.head()


Unnamed: 0,title,description,input_description,output_description,sample_io,problem_class,problem_score,url
0,Uuu,Unununium (Uuu) was the name of the chemical\n...,The input consists of one line with two intege...,The output consists of $M$ lines where the $i$...,"[{'input': '7 10', 'output': '1 2 2 3 1 3 3 4 ...",hard,9.7,https://open.kattis.com/problems/uuu
1,House Building,A number of eccentrics from central New York h...,"The input consists of $10$ test cases, which a...",Print $K$ lines with\n the positions of the...,"[{'input': '0 2 3 2 50 60 50 30 50 40', 'outpu...",hard,9.7,https://open.kattis.com/problems/husbygge
2,Mario or Luigi,Mario and Luigi are playing a game where they ...,,,"[{'input': '', 'output': ''}]",hard,9.6,https://open.kattis.com/problems/marioorluigi
3,The Wire Ghost,Žofka is bending a copper wire. She starts wit...,The first line contains two integers $L$ and $...,The output consists of a single line consistin...,"[{'input': '4 3 3 C 2 C 1 C', 'output': 'GHOST...",hard,9.6,https://open.kattis.com/problems/thewireghost
4,Barking Up The Wrong Tree,"Your dog Spot is let loose in the park. Well, ...",The first line of input consists of two intege...,Write a single line containing the length need...,"[{'input': '2 0 10 0 10 10', 'output': '14.14'...",hard,9.6,https://open.kattis.com/problems/barktree


**Verifying Dataset Shape**

In [40]:
df.shape


(4112, 8)

**Inspecting Column Names**

In [41]:
df.columns


Index(['title', 'description', 'input_description', 'output_description',
       'sample_io', 'problem_class', 'problem_score', 'url'],
      dtype='object')

**Selecting needed columns**

In [42]:
df = df[
    [
        "title",
        "description",
        "input_description",
        "output_description",
        "problem_class",
        "problem_score"
    ]
]


**Handling missing values**

In [43]:
text_columns = ["title", "description", "input_description", "output_description"]
df[text_columns] = df[text_columns].fillna("")


In [44]:
df.isnull().sum()


title                 0
description           0
input_description     0
output_description    0
problem_class         0
problem_score         0
dtype: int64

**Combining text fields**

In [45]:
df["combined_text"] = (
    df["title"] + " " +
    df["description"] + " " +
    df["input_description"] + " " +
    df["output_description"]
)


In [46]:
df["combined_text"].str.len().describe()


count    4112.000000
mean     1625.107490
std       756.724479
min        10.000000
25%      1114.000000
50%      1515.000000
75%      2001.500000
max      7582.000000
Name: combined_text, dtype: float64

**TF-IDF Vectorization**

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    max_features=5000,
    stop_words="english"
)

X = tfidf.fit_transform(df["combined_text"])


**Targets**

In [48]:
y_class = df["problem_class"]
y_score = df["problem_score"]


**Train-test split**

In [49]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_class_train, y_class_test, y_score_train, y_score_test = train_test_split(
    X, y_class, y_score, test_size=0.2, random_state=42
)


**CLASSIFICATION MODEL**

In [50]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

clf_model = LogisticRegression(max_iter=1000)
clf_model.fit(X_train, y_class_train)

y_class_pred = clf_model.predict(X_test)

print("Accuracy:", accuracy_score(y_class_test, y_class_pred))
print("Confusion Matrix:\n", confusion_matrix(y_class_test, y_class_pred))
print("\nClassification Report:\n", classification_report(y_class_test, y_class_pred))


Accuracy: 0.5054678007290401
Confusion Matrix:
 [[ 24  64  48]
 [  7 314 104]
 [ 16 168  78]]

Classification Report:
               precision    recall  f1-score   support

        easy       0.51      0.18      0.26       136
        hard       0.58      0.74      0.65       425
      medium       0.34      0.30      0.32       262

    accuracy                           0.51       823
   macro avg       0.47      0.40      0.41       823
weighted avg       0.49      0.51      0.48       823



**REGRESSION MODEL**

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

reg_model = RandomForestRegressor(
    n_estimators=50,     
    random_state=42,
    n_jobs=-1           
)

reg_model.fit(X_train, y_score_train)

y_score_pred = reg_model.predict(X_test)

print("MAE:", mean_absolute_error(y_score_test, y_score_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_score_test, y_score_pred)))


MAE: 1.7303718104495747
RMSE: 2.0723962990733993


In [52]:
import joblib
import os

os.makedirs("models", exist_ok=True)

joblib.dump(tfidf, "models/tfidf_vectorizer.pkl")
joblib.dump(clf_model, "models/difficulty_classifier.pkl")
joblib.dump(reg_model, "models/difficulty_regressor.pkl")

os.listdir("models")


['difficulty_regressor.pkl',
 'difficulty_classifier.pkl',
 'tfidf_vectorizer.pkl']