# **Install necessary libraries:**

In [None]:
!pip install -q pandas scikit-learn nltk spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m70.5 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


# **Load the Dataset and Data Preprocessing:**

In [None]:
import pandas as pd

# Load dataset
# NOTE(review): this path assumes Google Drive is already mounted at
# /content/drive; no mount call is visible in this notebook -- confirm.
DATA_PATH = '/content/drive/MyDrive/bigdata/ASAP2_train.csv'
df = pd.read_csv(DATA_PATH)

# Check the column names in the DataFrame
print(df.columns)

# Show the first few rows. A bare `df.head()` is only rendered when it is the
# last expression of a cell, so it must be displayed explicitly here.
display(df.head())

# Keep only rows that have both an essay ('full_text') and a label ('score'):
# casting a column that still contains NaN to int would raise. Building the
# cleaned frame in one chained expression (instead of assigning a column on
# the result of dropna) also avoids SettingWithCopyWarning.
df = (
    df.dropna(subset=['full_text', 'score'])
      .astype({'score': int})
)

# Check the data types
df.info()

Index(['essay_id', 'score', 'full_text', 'assignment', 'prompt_name',
       'economically_disadvantaged', 'student_disability_status', 'ell_status',
       'race_ethnicity', 'gender', 'grade_level', 'essay_word_count'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17307 entries, 0 to 17306
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   essay_id                    17307 non-null  object 
 1   score                       17307 non-null  int64  
 2   full_text                   17307 non-null  object 
 3   assignment                  17289 non-null  object 
 4   prompt_name                 17289 non-null  object 
 5   economically_disadvantaged  13288 non-null  object 
 6   student_disability_status   13288 non-null  object 
 7   ell_status                  17067 non-null  object 
 8   race_ethnicity              17288 non-null  object 
 9   gender          

# **Text Preprocessing (Tokenization, Lemmatization):**

In [None]:
import spacy
# Only lemmas and the stop-word / punctuation / whitespace flags are read
# below, so the dependency parser and the named-entity recognizer are skipped
# to avoid running them on every essay.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])


# Text cleaning function
def preprocess_text(text):
    """Lower-case, lemmatize and de-noise a single essay.

    Parameters
    ----------
    text : str
        Raw essay text.

    Returns
    -------
    str
        Space-separated lemmas of every token that is not a stop word,
        punctuation, or pure whitespace (e.g. paragraph breaks).
    """
    doc = nlp(text.lower())
    tokens = [
        token.lemma_
        for token in doc
        # Whitespace tokens are neither stop words nor punctuation, so they
        # have to be filtered explicitly or they end up in the joined output.
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    return " ".join(tokens)

# Run the cleaning function over every essay and store the result alongside
# the raw text in a new column.
cleaned_essays = df['full_text'].map(preprocess_text)
df['cleaned_full_text'] = cleaned_essays

# Preview the frame with the new column
df.head()


Unnamed: 0,essay_id,score,full_text,assignment,prompt_name,economically_disadvantaged,student_disability_status,ell_status,race_ethnicity,gender,grade_level,essay_word_count,cleaned_full_text
0,AAAVUP14319000159574,4,The author suggests that studying Venus is wor...,"In ""The Challenge of Exploring Venus,"" the aut...",Exploring Venus,Economically disadvantaged,Identified as having disability,No,Black/African American,F,10.0,409.0,author suggest study venus worthy dangerous au...
1,AAAVUP14319000159542,2,NASA is fighting to be alble to to go to Venus...,"In ""The Challenge of Exploring Venus,"" the aut...",Exploring Venus,Not economically disadvantaged,Not identified as having disability,No,Hispanic/Latino,F,10.0,197.0,nasa fight alble venus research diffrent metho...
2,AAAVUP14319000159420,2,The author supports this idea because from rea...,"In ""The Challenge of Exploring Venus,"" the aut...",Exploring Venus,Economically disadvantaged,Not identified as having disability,Yes,Hispanic/Latino,F,10.0,209.0,author support idea read passage suggest venus...
3,AAAVUP14319000159419,2,How the author supports this idea is that he s...,"In ""The Challenge of Exploring Venus,"" the aut...",Exploring Venus,Economically disadvantaged,Not identified as having disability,Yes,Hispanic/Latino,M,10.0,214.0,author support idea state text strivivng meet ...
4,AAAVUP14319000159395,1,In the story of ¨The Challenge of Exploring Ve...,"In ""The Challenge of Exploring Venus,"" the aut...",Exploring Venus,Economically disadvantaged,Identified as having disability,No,Black/African American,M,10.0,284.0,story ¨the challenge explore venus¨ venus brig...


# **Feature Extraction using TF-IDF:**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer whose vocabulary is capped at 5000 terms
vectorizer = TfidfVectorizer(max_features=5000)

# Learn the vocabulary / IDF weights from the cleaned essays and encode them.
# NOTE(review): the vectorizer is fitted on every row of `df`, i.e. before the
# train/test split done in a later cell, so held-out essays contribute to the
# vocabulary and IDF statistics -- confirm whether that is acceptable.
tfidf_matrix = vectorizer.fit_transform(df['cleaned_full_text'])

# Dense feature matrix (one row per essay, one column per term)
X = tfidf_matrix.toarray()

# Regression target: the essay scores
y = df['score']

# **Model Training (Random Forest Regressor):**

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Hold out 20% of the essays for evaluation; the fixed seed keeps the split
# reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Random Forest regressor with 100 trees, seeded for reproducibility
model = RandomForestRegressor(random_state=42, n_estimators=100)

# Fit on the training portion only
model.fit(X_train, y_train)


# **Model Evaluation:**

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the held-out essays with the trained model
y_pred = model.predict(X_test)

# Compare predictions against the true scores
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

# Report both metrics, one per line
print(f'Mean Squared Error: {mse}', f'R2 Score: {r2}', sep='\n')


Mean Squared Error: 0.509033679953784
R2 Score: 0.5187995149343433


# **Making Predictions:**

In [None]:
# Example: score a previously unseen essay.
# The text goes through the same cleaning function and the already-fitted
# vectorizer (transform only, no re-fitting) as the training essays did.
raw_essay = "The quick brown fox jumps over the lazy dog."
new_essay = preprocess_text(raw_essay)

encoded_essay = vectorizer.transform([new_essay])
new_essay_vector = encoded_essay.toarray()

# predict() returns one value per input row; a single essay was passed in
predicted_score = model.predict(new_essay_vector)
print(f"Predicted Score: {predicted_score[0]}")


Predicted Score: 2.0
