In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split as TTS,  GridSearchCV  
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB as NB


import nltk
from nltk.corpus import stopwords, sentiwordnet, wordnet
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

import spacy

from typing import List
from pprint import pprint


import gensim
from gensim.utils import simple_preprocess
import gensim.corpora as corpora
from gensim.models import CoherenceModel
import pyLDAvis.gensim

import tqdm

import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization
from keras.callbacks import EarlyStopping


import logging
# Configure the root logger: timestamped format, and only records at ERROR
# level or above are emitted.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
import warnings
# Blanket-suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation and numerical warnings from the
# libraries imported above — confirm that is intended.
warnings.filterwarnings("ignore")

# Fetch the NLTK stopwords corpus; the download is skipped when the local
# copy is already up to date.
nltk.download('stopwords')




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\marcu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Speaker labels that are kept; rows for any other speaker are dropped by
# common_label_removal.
imp_char = [
    "FRODO", "SAM", "GANDALF", "PIPPIN", "MERRY", "GOLLUM",
    "GIMLI", "THEODEN", "FARAMIR", "SAURON", "ARAGORN", "SMEAGOL",
]

# Read the train and test CSVs from the current working directory.
lotr_train, lotr_test = pd.read_csv('lotr_train.csv'), pd.read_csv('lotr_test.csv')

def common_label_removal(data, keep_chars=None):
    """Return only the rows whose ``char`` label is one of the kept characters.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``char`` column holding the speaker label.
    keep_chars : list-like of str, optional
        Labels to keep. Defaults to the module-level ``imp_char`` list.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the matching rows under their original index
        labels. ``data`` itself is left unmodified.
    """
    if keep_chars is None:
        # Looked up at call time so the current value of ``imp_char`` is used.
        keep_chars = imp_char

    # Filter directly instead of first overwriting every other label with
    # "Rest": that in-place write silently modified the caller's frame.
    # ``.copy()`` detaches the result from ``data`` so later assignments on it
    # do not raise SettingWithCopyWarning.
    return data[data["char"].isin(keep_chars)].copy()

# Restrict both splits to the rows spoken by the characters in imp_char.
lotr_train, lotr_test = map(common_label_removal, (lotr_train, lotr_test))

In [6]:
# Shared spaCy pipeline; find_max_length and word2vec_df below both call it.
# NOTE(review): the "sm" English models are documented as shipping without
# static word vectors — confirm token.vector_norm is meaningful for this use.
nlp = spacy.load("en_core_web_sm")

def find_max_length(data, nlp_model=None):
    """Return the largest token count of any single dialogue in ``data``.

    Parameters
    ----------
    data : iterable of iterables of str
        Collections of dialogue strings (e.g. the train and test columns).
    nlp_model : callable, optional
        Maps a string to a sized sequence of tokens. Defaults to the
        module-level spaCy pipeline ``nlp``.

    Returns
    -------
    int
        Maximum number of tokens in one dialogue, or 0 when ``data`` holds no
        dialogues at all.
    """
    if nlp_model is None:
        nlp_model = nlp

    token_counts = (
        len(nlp_model(dialogue))
        for dialogues in data  # was named ``set``, shadowing the builtin
        for dialogue in dialogues
    )
    # ``default`` avoids the ValueError ``max`` raises on an empty iterable,
    # which previously happened for empty ``data`` or any empty collection.
    return max(token_counts, default=0)

# Longest dialogue (in tokens) across both splits; the same width is then
# passed to word2vec_df for train and test so their columns line up.
max_length = find_max_length([lotr_train['dialog'], lotr_test['dialog']])

def word2vec_df(data, max_length, pad_value=100, nlp_model=None):
    """Encode each dialogue as a fixed-width row of per-token vector norms.

    Parameters
    ----------
    data : iterable of str
        Dialogue strings, one per output row.
    max_length : int
        Number of feature columns. Dialogues with fewer tokens are padded on
        the right; dialogues with more tokens are truncated.
    pad_value : number, optional
        Filler written into the unused trailing columns (default 100).
    nlp_model : callable, optional
        Maps a string to an iterable of tokens exposing ``vector_norm``.
        Defaults to the module-level spaCy pipeline ``nlp``.

    Returns
    -------
    pandas.DataFrame
        One row per dialogue, in input order, with columns ``word_1`` ...
        ``word_<max_length>``. The frame gets a fresh default index; the index
        of ``data`` is not carried over.
    """
    if nlp_model is None:
        nlp_model = nlp

    rows = []
    for dialogue in data:
        # Truncate so a dialogue longer than ``max_length`` cannot produce a
        # row wider than the column list (previously a length-mismatch error).
        norms = [token.vector_norm for token in nlp_model(dialogue)][:max_length]
        norms.extend([pad_value] * (max_length - len(norms)))
        rows.append(norms)

    # Passing ``columns`` to the constructor also yields correctly labelled
    # (empty) columns when ``data`` contains no dialogues.
    columns = [f"word_{i}" for i in range(1, max_length + 1)]
    return pd.DataFrame(rows, columns=columns)

# Build the feature frames for both splits using the shared column width.
train_w2v, test_w2v = (
    word2vec_df(split['dialog'], max_length) for split in (lotr_train, lotr_test)
)

In [7]:
# Persist both feature frames to the current working directory.
# index=False leaves the row index out of the written CSV files.
train_w2v.to_csv('train_w2v.csv', index=False)
test_w2v.to_csv('test_w2v.csv', index=False)