In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [13]:
df = pd.read_csv("/Smart_Essay_Scorer/data/processed/cleaned_essays.csv")

In [28]:
df.head()

Unnamed: 0,score,word_count,sent_count,word_count_capped,text_clean,words_per_sentence,char_count,avg_word_length
0,4,396,17,396,author suggests studying venus worthy dangerou...,23.294118,1331,3.361111
1,2,200,13,200,nasa fighting alble venus researching diffrent...,15.384615,516,2.58
2,3,371,31,371,evening star brightest point light sky night v...,11.967742,1221,3.291105
3,2,224,10,224,author support idea reading passage suggests v...,22.4,704,3.142857
4,2,219,7,219,author support idea state text strivivng meet ...,31.285714,776,3.543379


In [22]:
df['text_clean'] = df['text_clean'].str.strip().str.replace(r'\s+', ' ', regex=True)

In [None]:
import re
df['text_clean'] = df['text_clean'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))

In [None]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
df['text_clean'] = df['text_clean'].apply(lambda x: ' '.join([w for w in x.split() if w not in ENGLISH_STOP_WORDS]))

In [25]:
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

df['text_clean'] = df['text_clean'].apply(lambda x: ' '.join([lemmatizer.lemmatize(w) for w in x.split()]))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Faiz\AppData\Roaming\nltk_data...


In [26]:
# Average words per sentence
df['words_per_sentence'] = df['word_count'] / df['sent_count'].replace(0,1)

# Character count (optional)
df['char_count'] = df['text_clean'].apply(len)

# Average word length
df['avg_word_length'] = df['char_count'] / df['word_count'].replace(0,1)


In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
x_text = tfidf.fit_transform(df['text_clean'])
print("TF-IDF shape:", x_text.shape)

TF-IDF shape: (24728, 5000)


### Combine numeric + text features

In [29]:
from scipy.sparse import hstack

# Numeric features
x_numeric = df[['word_count', 'sent_count', 'word_count_capped', 'words_per_sentence', 'avg_word_length']].values

# Combine with TF-IDF
X_final = hstack([x_numeric, x_text])

In [31]:
from scipy.sparse import save_npz
import numpy as np

save_npz("/Smart_Essay_Scorer/data/processed/X_final.npz", X_final)
np.save("/Smart_Essay_Scorer/data/processed/y.npy", df['score'].values)


In [32]:
df.head()

Unnamed: 0,score,word_count,sent_count,word_count_capped,text_clean,words_per_sentence,char_count,avg_word_length
0,4,396,17,396,author suggests studying venus worthy dangerou...,23.294118,1331,3.361111
1,2,200,13,200,nasa fighting alble venus researching diffrent...,15.384615,516,2.58
2,3,371,31,371,evening star brightest point light sky night v...,11.967742,1221,3.291105
3,2,224,10,224,author support idea reading passage suggests v...,22.4,704,3.142857
4,2,219,7,219,author support idea state text strivivng meet ...,31.285714,776,3.543379


In [34]:
df.to_csv("/Smart_Essay_Scorer/data/processed/essays_featured.csv", index=False)
print("✅ Processed dataset saved successfully!")

✅ Processed dataset saved successfully!
