In [16]:
import re
import spacy
import en_core_web_sm

In [17]:

nlp = spacy.load("en_core_web_sm")


In [18]:

def clean_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text.strip().lower()

def tokenize_pos(text):
    doc = nlp(text)
    tokens = [token.text for token in doc]
    pos_tags = [token.pos_ for token in doc]
    return tokens, pos_tags

def spacy_summary(text):
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    noun_chunks = [chunk.text for chunk in doc.noun_chunks]
    dependencies = [(token.text, token.dep_) for token in doc]
    return {"entities": entities, "noun_chunks": noun_chunks, "dependencies": dependencies}


In [19]:
from textstat import flesch_reading_ease, gunning_fog
from sentence_transformers import SentenceTransformer
import numpy as np

embedder = SentenceTransformer('all-MiniLM-L6-v2')

def readability_scores(text):
    return {
        "flesch": flesch_reading_ease(text),
        "fog": gunning_fog(text)
    }

def get_embeddings(text):
    return embedder.encode(text)

def build_feature_vector(text):
    scores = readability_scores(text)
    embedding = get_embeddings(text)
    return np.concatenate([list(scores.values()), embedding])





In [20]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import spacy
nlp = spacy.load("en_core_web_sm")

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import joblib


In [21]:
def build_feature_vector(text):
    tokens, pos = tokenize_pos(text)
    num_tokens = len(tokens)
    unique_tokens = len(set(tokens))
    avg_word_len = np.mean([len(t) for t in tokens]) if tokens else 0
    noun_count = pos.count("NOUN")
    verb_count = pos.count("VERB")

    return [
        num_tokens,
        unique_tokens,
        avg_word_len,
        noun_count,
        verb_count
    ]


In [22]:
df = pd.read_csv("../data/training_set_rel3.tsv", sep='\t', encoding='ISO-8859-1')
print("Shape:", df.shape)
df.head(3)


Shape: (12976, 28)


Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,...,rater2_trait3,rater2_trait4,rater2_trait5,rater2_trait6,rater3_trait1,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6
0,1,1,"Dear local newspaper, I think effects computer...",4,4,,8,,,,...,,,,,,,,,,
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,,9,,,,...,,,,,,,,,,
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4,3,,7,,,,...,,,,,,,,,,


In [23]:
df["cleaned_essay"] = df["essay"].apply(clean_text)
df["cleaned_essay"].head()

0    dear local newspaper i think effects computers...
1    dear caps1 caps2 i believe that using computer...
2    dear caps1 caps2 caps3 more and more people us...
3    dear local newspaper caps1 i have found that m...
4    dear location1 i know having computers has a p...
Name: cleaned_essay, dtype: object

In [24]:
from tqdm import tqdm
tqdm.pandas()

def extract_features(text):
    doc = nlp(text)

    tokens = [token.text for token in doc if not token.is_punct and not token.is_space]
    num_tokens = len(tokens)
    num_sentences = len(list(doc.sents))

    unique_tokens = set(tokens)
    type_token_ratio = len(unique_tokens) / num_tokens if num_tokens > 0 else 0

    pos_counts = {
        "noun_count": sum(1 for token in doc if token.pos_ == "NOUN"),
        "verb_count": sum(1 for token in doc if token.pos_ == "VERB"),
        "adj_count": sum(1 for token in doc if token.pos_ == "ADJ"),
        "adv_count": sum(1 for token in doc if token.pos_ == "ADV"),
    }

    return pd.Series({
        "word_count": num_tokens,
        "sentence_count": num_sentences,
        "type_token_ratio": type_token_ratio,
        **pos_counts
    })

features_df = df["cleaned_essay"].progress_apply(extract_features)
df = pd.concat([df, features_df], axis=1)
df.head(3)


100%|█████████████████████████████████████████████████████████████████████████| 12976/12976 [26:55:44<00:00,  7.47s/it]


Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,...,rater3_trait5,rater3_trait6,cleaned_essay,word_count,sentence_count,type_token_ratio,noun_count,verb_count,adj_count,adv_count
0,1,1,"Dear local newspaper, I think effects computer...",4,4,,8,,,,...,,,dear local newspaper i think effects computers...,341.0,2.0,0.483871,73.0,49.0,21.0,16.0
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,,9,,,,...,,,dear caps1 caps2 i believe that using computer...,422.0,2.0,0.459716,103.0,70.0,20.0,16.0
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4,3,,7,,,,...,,,dear caps1 caps2 caps3 more and more people us...,282.0,2.0,0.524823,75.0,40.0,20.0,11.0


In [25]:
pip install -U sentence-transformers


Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.

Collecting sentence-transformers
  Using cached sentence_transformers-4.0.2-py3-none-any.whl.metadata (13 kB)
Using cached sentence_transformers-4.0.2-py3-none-any.whl (340 kB)
Installing collected packages: sentence-transformers
  Attempting uninstall: sentence-transformers
    Found existing installation: sentence-transformers 3.3.1
    Uninstalling sentence-transformers-3.3.1:
      Successfully uninstalled sentence-transformers-3.3.1
Successfully installed sentence-transformers-4.0.2


DEPRECATION: Loading egg at c:\program files\python311\lib\site-packages\vboxapi-1.0-py3.11.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330

[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [26]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')
essay_embeddings = model.encode(df["cleaned_essay"].tolist(), show_progress_bar=True)


Batches:   0%|          | 0/406 [00:00<?, ?it/s]

In [27]:
import numpy as np
X = np.hstack([essay_embeddings, df[[
    "word_count", "sentence_count", "type_token_ratio",
    "noun_count", "verb_count", "adj_count", "adv_count"
]].values])
y = df["domain1_score"].values


In [28]:
 pip install lightgbm


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


DEPRECATION: Loading egg at c:\program files\python311\lib\site-packages\vboxapi-1.0-py3.11.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330

[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [32]:
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error

# Define and train the model
model = LGBMRegressor(
    objective='regression',
    learning_rate=0.05,
    num_leaves=64,
    n_estimators=1000,
    random_state=42
)

model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    
)

# Predict and evaluate
y_pred = model.predict(X_val)
rmse = mean_squared_error(y_val, y_pred, squared=False)
print(f"Validation RMSE: {rmse:.3f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.110713 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 98944
[LightGBM] [Info] Number of data points in the train set: 10380, number of used features: 391
[LightGBM] [Info] Start training from score 6.832563
Validation RMSE: 2.447




In [34]:
import joblib

joblib.dump(model, "C:/Users/DELL/Documents/aes/api/saved_model/essay_scorer.pkl")


['C:/Users/DELL/Documents/aes/api/saved_model/essay_scorer.pkl']