In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from scipy.sparse import hstack
import joblib

In [4]:
df = pd.read_csv("/Smart_Essay_Scorer/data/processed/essays_featured.csv")
df.head()

Unnamed: 0,score,word_count,sent_count,word_count_capped,text_clean,words_per_sentence,char_count,avg_word_length
0,4,396,17,396,author suggests studying venus worthy dangerou...,23.294118,1331,3.361111
1,2,200,13,200,nasa fighting alble venus researching diffrent...,15.384615,516,2.58
2,3,371,31,371,evening star brightest point light sky night v...,11.967742,1221,3.291105
3,2,224,10,224,author support idea reading passage suggests v...,22.4,704,3.142857
4,2,219,7,219,author support idea state text strivivng meet ...,31.285714,776,3.543379


In [5]:
numeric_features = ['word_count', 'sent_count', 'word_count_capped', 'words_per_sentence', 'char_count', 'avg_word_length']
x_numeric = df[numeric_features].values
y = df['score'].values

In [6]:
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
x_text = tfidf.fit_transform(df['text_clean'])

In [7]:
x_final = hstack([x_numeric, x_text])

In [8]:
x_train, x_test, y_train, y_test = train_test_split(x_final, y, test_size=0.2, random_state=2)
print(f"Train size: {x_train.shape}, Test size: {x_test.shape}")

Train size: (19782, 5006), Test size: (4946, 5006)


In [9]:
model = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=2)
model.fit(x_train, y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [10]:
y_pred = model.predict(x_test)
print(f"RMSE: {mean_squared_error(y_test, y_pred):.2f}")
print(f"R2 Score: {r2_score(y_test, y_pred):.2f}")

RMSE: 0.38
R2 Score: 0.65


In [17]:
joblib.dump(model, "/Smart_Essay_Scorer/models/Random_forest_model.pkl")
joblib.dump(tfidf, "/Smart_Essay_Scorer/models/tfidf_vectorizer.pkl")
print("✅ Model and TF-IDF saved!")

✅ Model and TF-IDF saved!


In [12]:
from xgboost import XGBRegressor

model_xg = XGBRegressor(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    tree_method='hist',
    random_state=2
)

model_xg.fit(x_train, y_train)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [13]:
y_pred = model_xg.predict(x_test)
print(f"RMSE: {mean_squared_error(y_test, y_pred):.2f}")
print(f"R2 Score: {r2_score(y_test, y_pred):.2f}")

RMSE: 0.36
R2 Score: 0.66


In [14]:
import lightgbm as lgb

model_lgb = lgb.LGBMRegressor(
    n_estimators=200,
    max_depth=-1,
    learning_rate=0.1,
    n_jobs=-1,
    random_state=42
)
model_lgb.fit(x_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.447551 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 370380
[LightGBM] [Info] Number of data points in the train set: 19782, number of used features: 4960
[LightGBM] [Info] Start training from score 2.940249


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.1
,n_estimators,200
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [15]:
y_pred = model_lgb.predict(x_test)
print(f"RMSE: {mean_squared_error(y_test, y_pred):.2f}")
print(f"R2 Score: {r2_score(y_test, y_pred):.2f}")

RMSE: 0.35
R2 Score: 0.68




In [39]:
import re
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

def clean_essay(text, remove_stopwords=True, lemmatize=True):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = text.strip()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^a-z\s]', '', text)
    if remove_stopwords:
        text = ' '.join([w for w in text.split() if w not in ENGLISH_STOP_WORDS])
    if lemmatize:
        text = ' '.join([lemmatizer.lemmatize(w) for w in text.split()])
    return text

new_essay = "Renewable energy sources like solar and wind power are essential for a sustainable future. By reducing dependence on fossil fuels, we can lower greenhouse gas emissions and protect the environment. Governments and individuals must invest in clean energy solutions to ensure a healthier planet for future generations."
new_essay_clean = clean_essay(new_essay)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Faiz\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Load model & vectorizer
model = joblib.load("E:/Smart_Essay_Scorer/models/Random_forest_model.pkl")
tfidf = joblib.load("E:/Smart_essay_Scorer/models/tfidf_vectorizer.pkl")

# Numeric features
word_count = len(new_essay_clean.split())
sent_count = new_essay_clean.count('.') + 1
words_per_sentence = word_count / max(sent_count,1)
char_count = len(new_essay_clean)
avg_word_length = char_count / max(word_count,1)

X_numeric_new = np.array([[word_count, sent_count, word_count, words_per_sentence, char_count, avg_word_length]])
X_text_new = tfidf.transform([new_essay_clean])
X_new_final = hstack([X_numeric_new, X_text_new])

# Predict
predicted_score = model_lgb.predict(X_new_final)
print("Predicted Score:", predicted_score[0])
predicted_score_rounded = round(predicted_score[0])
print("Rounded Predicted Score:", predicted_score_rounded)

Predicted Score: 1.747526327728755
Rounded Predicted Score: 2




In [22]:
joblib.dump(model_xg, "/Smart_Essay_Scorer/models/model_xg.pkl")
joblib.dump(model_lgb, "/Smart_Essay_Scorer/models/model_lgb.pkl")

['/Smart_Essay_Scorer/models/model_lgb.pkl']