In [1]:
# Re-import libraries and redefine file paths
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

#  Preprocessing and Text Normalization


# Locations of the competition's train and test CSV files.
# NOTE(review): these raw strings keep their doubled backslashes and the
# leading space verbatim -- confirm they resolve on the target machine.
train_file_path = r' Kaggle_Competitions\\Learning Agency Lab - Automated Essay Scoring 2.0\\learning-agency-lab-automated-essay-scoring-2\\train.csv'
test_file_path = r' Kaggle_Competitions\\Learning Agency Lab - Automated Essay Scoring 2.0\\learning-agency-lab-automated-essay-scoring-2\\test.csv'

# Read both CSVs into DataFrames (train first, then test)
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (train_file_path, test_file_path)
)

# Preview the leading rows and the column index of each DataFrame
(train_data.head(), train_data.columns, test_data.head(), test_data.columns)

(  essay_id                                          full_text  score
 0  000d118  Many people have car where they live. The thin...      3
 1  000fe60  I am a scientist at NASA that is discussing th...      3
 2  001ab80  People always wish they had the same technolog...      4
 3  001bdc0  We all heard about Venus, the planet without a...      4
 4  002ba53  Dear, State Senator\n\nThis is a letter to arg...      3,
 Index(['essay_id', 'full_text', 'score'], dtype='object'),
   essay_id                                          full_text
 0  000d118  Many people have car where they live. The thin...
 1  000fe60  I am a scientist at NASA that is discussing th...
 2  001ab80  People always wish they had the same technolog...,
 Index(['essay_id', 'full_text'], dtype='object'))

### 1. Preprocessing and Text Normalization
* Cleaning Text: Remove or normalize text artifacts like punctuation, capitalization, and special characters that might not contribute to essay scoring.
* Tokenization and Lemmatization: Break down text into tokens (words or phrases) and reduce them to their base or dictionary form.
* Stopword Removal: Consider the impact of removing common words that may not contribute to the overall meaning of the essay.

In [2]:
# Setting up NLTK with local resources
# One directory serves as both the download target and an NLTK search path,
# so every resource fetched below is stored where NLTK will look for it.
NLTK_DATA_DIR = 'Kaggle_Competitions\\Learning Agency Lab - Automated Essay Scoring 2.0\\'

# Register the directory only once, so re-running this cell does not keep
# appending duplicate entries to the search path.
if NLTK_DATA_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_DIR)

# Load NLTK resources necessary for the tasks:
#   punkt     -> tokenizers
#   stopwords -> stopword lists
#   wordnet   -> data used by the lemmatizer
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource, download_dir=NLTK_DATA_DIR)

# Initialize the WordNetLemmatizer (shared by preprocess_text)
lemmatizer = WordNetLemmatizer()

# Patterns are compiled once and written as raw strings, so `\s` reaches the
# regex engine as a whitespace class instead of being an invalid string escape.
_NON_LETTER_RE = re.compile(r'[^a-z\s]')
_SPACE_RUN_RE = re.compile(r' +')

# English stopwords; filled in on first use and reused by every later call.
_STOP_WORDS = None


def _english_stop_words():
    """Return the English stopword set, reading the corpus only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(text):
    """Clean one essay and return it as a space-separated string of lemmas.

    Steps, in order: lower-case the text, replace every character that is
    not a-z or whitespace with a space, collapse runs of spaces and strip
    the ends, tokenize with ``word_tokenize``, drop English stopwords,
    lemmatize each remaining token with the module-level ``lemmatizer``,
    and join the tokens with single spaces.

    Args:
        text: The raw essay text. ``None`` and NaN (missing values) are
            accepted and treated as an empty essay.

    Returns:
        The normalized text; ``''`` when the input is missing or contains
        nothing but non-letter characters.
    """
    # Missing values: None, or a float NaN (the only float with x != x).
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Convert text to lower case
    text = text.lower()
    # Remove non-alphabetic characters and extra spaces
    text = _NON_LETTER_RE.sub(' ', text)
    text = _SPACE_RUN_RE.sub(' ', text).strip()
    if not text:
        # Nothing left to tokenize; the result would be empty anyway.
        return ''

    stop_words = _english_stop_words()
    # Tokenize, drop stopwords, then lemmatize what remains
    tokens = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words
    ]
    # Join tokens back to string
    return ' '.join(tokens)

# Run every essay in both datasets through preprocess_text and store the
# result in a new 'clean_text' column
train_data['clean_text'], test_data['clean_text'] = (
    frame['full_text'].apply(preprocess_text)
    for frame in (train_data, test_data)
)

# Show raw and cleaned text side by side for the leading rows of each dataset
tuple(
    frame[['full_text', 'clean_text']].head()
    for frame in (train_data, test_data)
)


[nltk_data] Downloading package punkt to C:\Users\nickr\OneDrive\Υπολο
[nltk_data]     γιστής\Repositories\Kaggle_Competitions\Learning
[nltk_data]     Agency Lab - Automated Essay Scoring 2.0\...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\nickr\OneDrive\Υ
[nltk_data]     πολογιστής\Repositories\Kaggle_Competitions\Learning
[nltk_data]     Agency Lab - Automated Essay Scoring 2.0\...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\nickr\OneDrive\Υπο
[nltk_data]     λογιστής\Repositories\Kaggle_Competitions\Learning
[nltk_data]     Agency Lab - Automated Essay Scoring 2.0\...
[nltk_data]   Package wordnet is already up-to-date!


### 2. Feature Engineering
* Linguistic Features: Extract features that represent the quality of writing, such as sentence complexity, vocabulary richness, grammar correctness, and coherence. Tools like the Natural Language Toolkit (NLTK) or spaCy can be helpful.
* Text Embeddings: Use embeddings like Word2Vec, GloVe, or fastText to capture semantic relationships between words. Sentence and paragraph embeddings (e.g., from BERT or Sentence-BERT) can capture contextual nuances.
* Syntactic Features: Parse trees and dependency graphs can help understand the syntactic structures of sentences, potentially indicating more complex writing abilities.