In [None]:
# Install required libraries
# Each `!` line is executed as a shell command by the notebook, not as Python.
!pip install nltk pyspellchecker tensorflow numpy pandas autocorrect
# Upgrade nltk to the newest release, bypassing pip's local download cache.
!pip install --upgrade --no-cache-dir nltk
# Upgrade nltk again, forcing a reinstall even if it is already up to date.
# NOTE(review): this overlaps with the previous command; presumably a workaround
# for a stale install — confirm whether both are still needed.
!pip install --upgrade --force-reinstall nltk


In [None]:

# Import required libraries
import nltk
import statistics
import numpy as np
import pandas as pd
import tensorflow as tf
from spellchecker import SpellChecker
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Flatten
from tensorflow.keras.models import Sequential
import tensorflow.keras.backend as K


# Download the NLTK data files used by the tokenizers and the POS tagger.
# NLTK >= 3.9 loads the tagger from 'averaged_perceptron_tagger_eng' (the same
# kind of rename as 'punkt' -> 'punkt_tab'); the legacy id is kept so older
# NLTK releases keep working too.
NLTK_RESOURCES = (
    'punkt',
    'punkt_tab',
    'stopwords',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'tagsets',
)
for resource in NLTK_RESOURCES:
    nltk.download(resource)

# Path of the dataset CSV file (the file itself is read in a later cell)
DATASET_PATH = '/content/dataset.csv'

In [None]:

# Load the dataset
# NOTE(review): ISO-8859-1 is presumably the encoding of the source CSV — confirm.
X1 = pd.read_csv(DATASET_PATH, encoding='ISO-8859-1')

# Target variable: taken from the 'Overall' column of the loaded frame.
# If your dataset stores the score under a different column name, change 'Overall' below.
y1 = X1['Overall']  # target column

# Feature Engineering Functions
def char_count(data):
    """Return the number of characters in ``data`` once spaces are left out.

    The text is lower-cased before counting. Only the plain space character
    (' ') is excluded; tabs, newlines and punctuation are still counted.
    """
    lowered = data.lower()
    return len(lowered) - lowered.count(' ')

def word_count(data):
    """Return how many tokens ``nltk.word_tokenize`` produces for ``data``."""
    tokens = nltk.word_tokenize(data)
    return len(tokens)

def sent_count(data):
    """Return how many sentences NLTK's sentence tokenizer finds in ``data``."""
    sentences = nltk.tokenize.sent_tokenize(data)
    return len(sentences)

def avg_word_sentence(data):
    """Return the mean number of word tokens per sentence in ``data``.

    The text is split into sentences with ``nltk.sent_tokenize`` and each
    sentence into tokens with ``nltk.word_tokenize``.

    Returns 0.0 for empty or whitespace-only text.
    """
    # Blank text has no sentences; np.mean([]) would give NaN (plus a
    # RuntimeWarning), and that NaN would end up in the feature column.
    if not data.strip():
        return 0.0
    words_per_sentence = [len(nltk.word_tokenize(sent)) for sent in nltk.sent_tokenize(data)]
    if not words_per_sentence:
        return 0.0
    return np.mean(words_per_sentence)

def avg_length(words):
    """Return the mean token length (in characters) of the text ``words``.

    The text is tokenized with ``nltk.word_tokenize``; every token it
    returns contributes to the average.

    Returns 0.0 for empty or whitespace-only text instead of raising
    ZeroDivisionError.
    """
    # Blank text yields no tokens, which would make the division below fail.
    if not words.strip():
        return 0.0
    words_list = nltk.word_tokenize(words)
    if not words_list:
        return 0.0
    return sum(len(word) for word in words_list) / len(words_list)

# Shared SpellChecker instance, created lazily by _get_spell_checker().
_SPELL_CHECKER = None


def _get_spell_checker():
    """Return the shared SpellChecker, constructing it on first use."""
    global _SPELL_CHECKER
    if _SPELL_CHECKER is None:
        _SPELL_CHECKER = SpellChecker()
    return _SPELL_CHECKER


def spelling_errors(data):
    """Return how many distinct tokens of ``data`` the spell checker does not know.

    The text is tokenized with ``nltk.word_tokenize`` and the tokens are
    passed to ``SpellChecker.unknown``; the size of the returned collection
    is the result.
    """
    words = nltk.word_tokenize(data)
    # Reuse one checker: building a SpellChecker for every essay repeats the
    # dictionary load on each call without changing the result.
    misspelled = _get_spell_checker().unknown(words)
    return len(misspelled)

# POS Tagging for Nouns, Verbs, Adjectives, Adverbs
def pos_tags(data):
    """Count nouns, adjectives, verbs and adverbs in the text ``data``.

    The text is tokenized with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``; tags are classified by their prefix.

    Returns:
        tuple: ``(noun_count, adj_count, verb_count, adv_count)``.
    """
    tokens = nltk.word_tokenize(data)
    tagged = nltk.pos_tag(tokens)  # list of (token, tag) pairs
    noun_count = adj_count = verb_count = adv_count = 0
    # Single pass over the tags; the prefixes are mutually exclusive.
    for _, tag in tagged:
        if tag.startswith('N'):
            noun_count += 1
        elif tag.startswith('J'):
            adj_count += 1
        elif tag.startswith('V'):
            verb_count += 1
        elif tag.startswith('RB'):
            # 'RB' rather than 'R': in the Penn Treebank tagset 'RP' is a
            # particle, not an adverb, and must not be counted here.
            adv_count += 1
    return noun_count, adj_count, verb_count, adv_count

# Apply the feature-engineering functions to every essay.
# Scalar features: output column name -> function applied to each essay text.
scalar_features = {
    'num_chars': char_count,
    'num_words': word_count,
    'num_sents': sent_count,
    'avg_word_length': avg_length,
    'avg_word_sent': avg_word_sentence,
    'spelling_errors': spelling_errors,
}
for column_name, extractor in scalar_features.items():
    X1[column_name] = X1['Essay'].apply(extractor)

# pos_tags returns one (noun, adj, verb, adv) tuple per essay; zip(*...)
# transposes those tuples into four per-column sequences.
noun_counts, adj_counts, verb_counts, adv_counts = zip(*X1['Essay'].map(pos_tags))
X1['noun_count'] = noun_counts
X1['adj_count'] = adj_counts
X1['verb_count'] = verb_counts
X1['adv_count'] = adv_counts
