In [None]:
# Mount Google Drive at /content/drive (Colab only) so the dataset path used
# further down in this notebook can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import os
import re
import nltk
import tensorflow
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Dropout
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
# NLTK resources needed by the stop-word filter and by word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases load the Punkt tokenizer tables from 'punkt_tab'
# instead of 'punkt'; fetching both keeps word_tokenize working either way.
nltk.download('punkt_tab')


# Load only the columns used below. Rows with a missing value in any of them
# are dropped; dropping *columns* (axis=1) would silently remove a required
# column such as 'domain1_score' as soon as a single value is missing.
train_df = pd.read_csv(
    '/content/drive/MyDrive/346 project/training_set_rel3.tsv',
    sep='\t',
    encoding='ISO-8859-1',
    usecols=['essay_id', 'essay_set', 'essay', 'domain1_score'],
).dropna(axis=0)


stemmer = PorterStemmer()

# Built once instead of on every call: stopwords.words() re-reads the corpus
# each time it is invoked, and the functions below run once per essay.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

# Everything except ASCII letters, whitespace, '.', ',' and the apostrophe.
_DISALLOWED_CHARS = re.compile(r'[^a-zA-Z\s.,\']')


def preprocess_text(text):
    """Return the preprocessed essay as a single space-joined string.

    Applies exactly the same pipeline as ``preprocess_text2`` and joins the
    resulting tokens with single spaces.

    Args:
        text: Raw essay text (``str``).

    Returns:
        str: Stemmed, stop-word-free tokens separated by one space each.
    """
    return ' '.join(preprocess_text2(text))


def preprocess_text2(text):
    """Return the preprocessed essay as a list of tokens.

    Steps: lowercase, strip characters other than letters / whitespace /
    ``.`` / ``,`` / ``'``, tokenize with NLTK's ``word_tokenize``, drop
    English stop words, and Porter-stem what remains.

    Args:
        text: Raw essay text (``str``).

    Returns:
        list[str]: Stemmed tokens in their original order.
    """
    cleaned = _DISALLOWED_CHARS.sub('', text.lower())
    return [
        stemmer.stem(word)
        for word in word_tokenize(cleaned)
        if word not in _ENGLISH_STOPWORDS
    ]

# Preprocess every essay once; the result is a space-joined token string.
train_df["essay_prepro"] = train_df["essay"].apply(preprocess_text)
X = train_df.drop(columns=["essay_id", "essay_set", "essay", "domain1_score"])
y = np.asarray(train_df["domain1_score"])

# "essay_prepro" is already lowercased, filtered and stemmed, so recover its
# tokens with a plain whitespace split. Running the preprocessing a second
# time would stem and stop-word-filter the text again, producing a Word2Vec
# vocabulary that differs from the `doc.split()` tokens looked up later when
# the document vectors are built (mismatches there are silently skipped).
tokenized_documents = [doc.split() for doc in X["essay_prepro"]]

ukuran_vektor = 100  # embedding dimensionality ("ukuran vektor" = vector size)
# NOTE(review): the embeddings are fit on every essay, including the ones that
# are held out for validation just below; labels are not involved, but confirm
# this is acceptable for the evaluation.
word2vec_model = Word2Vec(sentences=tokenized_documents, min_count=1, vector_size=ukuran_vektor, sg=1)
all_words = word2vec_model.wv.index_to_key

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=92)

def document_vector(word2vec_model, doc_tokens):
    """Average the embeddings of the tokens that the model knows.

    Args:
        word2vec_model: Object exposing ``vector_size`` and a ``wv`` mapping
            from token to vector that raises ``KeyError`` for unknown tokens.
        doc_tokens: Iterable of tokens making up one document.

    Returns:
        numpy.ndarray: Vector of length ``vector_size`` holding the mean of
        the known tokens' vectors, or all zeros when no token is known.
    """
    total = np.zeros(word2vec_model.vector_size)
    known = 0
    for token in doc_tokens:
        try:
            embedding = word2vec_model.wv[token]
        except KeyError:
            # Token is not in the vocabulary: it contributes nothing.
            continue
        total += embedding
        known += 1
    if known == 0:
        return total
    total /= known
    return total

# Turn each preprocessed essay into one averaged word-embedding vector.
train_tokens = X_train["essay_prepro"].str.split()
val_tokens = X_val["essay_prepro"].str.split()
X_train_vec = np.array([document_vector(word2vec_model, tokens) for tokens in train_tokens])
X_val_vec = np.array([document_vector(word2vec_model, tokens) for tokens in val_tokens])

# One-hot encode the essay set of each row; the encoder is fit on the
# training rows only and then reused for the validation rows.
train_sets = train_df.loc[X_train.index, 'essay_set'].to_numpy().reshape(-1, 1)
val_sets = train_df.loc[X_val.index, 'essay_set'].to_numpy().reshape(-1, 1)
encoder = OneHotEncoder()
essay_set_train_encoded = encoder.fit_transform(train_sets).toarray()
essay_set_val_encoded = encoder.transform(val_sets).toarray()

# Final feature matrix: [document embedding | essay-set one-hot].
X_train_combined = np.hstack((X_train_vec, essay_set_train_encoded))
X_val_combined = np.hstack((X_val_vec, essay_set_val_encoded))

# Seed TensorFlow's global random generator (same value as the
# train/validation split) so repeated runs are more comparable.
tensorflow.random.set_seed(92)

# Size the network from the data rather than hard-coding it: the input width
# is embedding size + number of one-hot essay-set columns actually produced,
# and sparse_categorical_crossentropy needs one output unit per integer label
# in 0..max(label).
n_features = X_train_combined.shape[1]
n_classes = int(max(np.max(y_train), np.max(y_val))) + 1

model = Sequential()
model.add(Input(shape=(n_features,)))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(n_classes, activation='softmax'))

optimizer = Adam(learning_rate=0.0001)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Stop once validation loss has not improved for 15 epochs and roll back to
# the best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)
history = model.fit(
    X_train_combined,
    y_train,
    epochs=500,
    batch_size=256,
    validation_data=(X_val_combined, y_val),
    callbacks=[early_stopping],
)

loss, accuracy = model.evaluate(X_val_combined, y_val, verbose=0)
print(f'Accuracy: {accuracy*100:.2f}%')

# Training vs. validation loss per epoch.
_, loss_ax = plt.subplots()
loss_ax.plot(history.history['loss'], label='train')
loss_ax.plot(history.history['val_loss'], label='val')
loss_ax.set_title('Model Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend()
plt.show()

# Training vs. validation accuracy per epoch.
accuracy = history.history['accuracy']
val_accuracy = history.history['val_accuracy']
_, acc_ax = plt.subplots()
acc_ax.plot(accuracy, label='Training Accuracy')
acc_ax.plot(val_accuracy, label='Validation Accuracy')
acc_ax.set_title('Training and Validation Accuracy')
acc_ax.set_xlabel('Epochs')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend()
plt.show()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78