# Character Prediction

## Word to Vector and Bag of Words dataframe creation

In [1]:
import pandas as pd
import spacy
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries
# imported above — consider narrowing the filter once the notebook is stable.
warnings.filterwarnings("ignore")

# Fetch the NLTK stopword corpus used by the Bag of Words preprocessing.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\marcu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Load the train / validation / test CSVs from the working directory.
# The cells below read the 'dialog' column of each frame.
train_df = pd.read_csv('train_df.csv')
val_df = pd.read_csv('val_df.csv')
test_df = pd.read_csv('test_df.csv')


## Word to Vector Dataset

In [3]:
# Shared spaCy pipeline: used below to tokenise dialogues and to read each
# token's `vector_norm`.
# NOTE(review): the small English model is documented as shipping without
# static word vectors — confirm the norms it produces are the intended feature.
nlp = spacy.load("en_core_web_sm")

def find_max_length(data):
    """Return the token count of the longest dialogue in any collection.

    Args:
        data: Iterable of dialogue collections (e.g. one Series per split);
            each collection yields dialogue strings.

    Returns:
        The largest number of tokens the module-level ``nlp`` pipeline
        produces for any single dialogue, or 0 when there are no dialogues
        at all (empty ``data`` or only empty collections).
    """
    # Flatten the collections into one stream so a single ``max`` with a
    # default covers both "no collections" and "an empty collection".
    return max(
        (len(nlp(dialogue)) for dialogues in data for dialogue in dialogues),
        default=0,
    )

# Pad length = longest tokenised dialogue across all three splits, so the
# train / val / test frames built below share the same set of columns.
max_length = find_max_length([train_df['dialog'], val_df['dialog'], test_df['dialog']])

def word2vec_seq(data, max_length, pad_value=100):
    """Encode each dialogue as a fixed-width row of spaCy token vector norms.

    Every dialogue is run through the module-level ``nlp`` pipeline and each
    token is replaced by its ``vector_norm``. Rows shorter than
    ``max_length`` are right-padded with ``pad_value`` so all rows have the
    same width.

    Args:
        data: Iterable of dialogue strings.
        max_length: Number of columns in the result; must be at least the
            token count of the longest dialogue in ``data``.
        pad_value: Filler for positions past the end of a dialogue.
            Defaults to 100, the value previously hard-coded here.

    Returns:
        A DataFrame with one row per dialogue and columns ``word_1`` ..
        ``word_<max_length>``. Empty ``data`` yields an empty frame that
        still carries those columns.

    Raises:
        ValueError: If a dialogue has more than ``max_length`` tokens.
    """
    columns = [f"word_{i}" for i in range(1, max_length + 1)]

    rows = []
    for dialogue in data:
        norms = [token.vector_norm for token in nlp(dialogue)]
        if len(norms) > max_length:
            # Fail with a clear message instead of a pandas shape error.
            raise ValueError(
                f"dialogue has {len(norms)} tokens, "
                f"more than max_length={max_length}"
            )
        norms.extend([pad_value] * (max_length - len(norms)))
        rows.append(norms)

    # Passing the columns to the constructor keeps the schema intact even
    # when there are no rows.
    return pd.DataFrame(rows, columns=columns)

# Encode every split with the same max_length so all three frames end up
# with identical word_1..word_<max_length> columns.
train_B = word2vec_seq(train_df['dialog'], max_length)
val_B = word2vec_seq(val_df['dialog'], max_length)
test_B = word2vec_seq(test_df['dialog'], max_length)

In [4]:
# Write the vector-norm datasets to the working directory, without the
# DataFrame index column.
train_B.to_csv('train_B.csv', index=False)
val_B.to_csv('val_B.csv', index=False)
test_B.to_csv('test_B.csv', index=False)

## Bag of Words Dataset

In [5]:

# English stopwords held in a set so the per-token membership test in
# bow_preprocess is a hash lookup.
stopwords_set = set(stopwords.words("english"))
# Snowball stemmer configured for English; applied to every kept token.
stemmer = SnowballStemmer("english")

# Remove punctuation and stopwords, then stem the remaining words
def bow_preprocess(df):
    """Turn each dialogue into a list of stemmed, stopword-free tokens.

    Args:
        df: Iterable of dialogue strings (e.g. a DataFrame column).

    Returns:
        A list with one entry per dialogue, each a list of stems in their
        original order.
    """

    def _stems(text):
        # Lower-case, replace every non-alphanumeric character with a space,
        # then split on whitespace.
        words = re.sub(r"[^a-zA-Z0-9]", " ", text.lower()).split()
        # Drop stopwords before stemming, as stems may no longer match the list.
        return [stemmer.stem(w) for w in words if w not in stopwords_set]

    return [_stems(dialog) for dialog in df]

# Tokenise, filter and stem the dialogues of every split.
train_C = bow_preprocess(train_df['dialog'])
val_C = bow_preprocess(val_df['dialog'])
test_C = bow_preprocess(test_df['dialog'])

# Vocabulary: every distinct stem seen in any dialogue of any split.
all_dialogs = train_C + val_C + test_C
wordset = {word for dialog in all_dialogs for word in dialog}

# Creating dataframe
def BOW_df(wordset, df):
    """Build a Bag of Words count matrix for a collection of dialogues.

    Args:
        wordset: Vocabulary of words to count. A set is laid out in sorted
            order so the column order is reproducible between runs; any
            other iterable keeps its own order.
        df: Iterable of tokenised dialogues, each an iterable of words.

    Returns:
        A DataFrame with one row per dialogue and one column per vocabulary
        word, holding how many times the word occurs in that dialogue.
        Words outside the vocabulary are ignored.
    """
    from collections import Counter

    # Set iteration order depends on string hashing, which varies between
    # interpreter runs; sorting pins the column layout.
    if isinstance(wordset, (set, frozenset)):
        columns = sorted(wordset)
    else:
        columns = list(wordset)

    # Collect all rows first and build the frame once: growing a DataFrame
    # with ``.loc[i] = ...`` gets slower with every added row and leaves
    # object-dtype columns.
    rows = []
    for dialog in df:
        counts = Counter(dialog)
        # Counter yields 0 for absent words, so no membership test is needed.
        rows.append([counts[word] for word in columns])

    return pd.DataFrame(rows, columns=columns)

# NOTE: rebinds train_C / val_C / test_C from token lists to count frames.
# All three use the same wordset, so they share the same columns.
train_C = BOW_df(wordset, train_C)
val_C = BOW_df(wordset, val_C)
test_C = BOW_df(wordset, test_C)

In [6]:
# Write the Bag of Words datasets to the working directory, without the
# DataFrame index column.
train_C.to_csv('train_C.csv', index=False)
val_C.to_csv('val_C.csv', index=False)
test_C.to_csv('test_C.csv', index=False)