# Character Prediction

## Word to Vector and Bag of Words dataframe creation

In [1]:
import pandas as pd
import numpy as np
import spacy
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import warnings
warnings.filterwarnings("ignore")

nltk.download('stopwords')




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\marcu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Importing Data 
train_df = pd.read_csv('train_df.csv')
val_df = pd.read_csv('val_df.csv')
test_df = pd.read_csv('test_df.csv')


In [3]:
# Preprocessing for y dataset and common labels
imp_char = ["FRODO", "SAM", "GANDALF", "PIPPIN", "MERRY", "GOLLUM", "GIMLI", "THEODEN", "FARAMIR", "ARAGORN"]

def common_label_removal(data):
    mask = data["char"].isin(imp_char)
    data.loc[~ mask, "char"] = "Rest"
    mask2 = data['char'] == 'Rest'
    data = data[~mask2]
    return data

def y_split(data):
    y_data = data['char']
    return y_data

def char_2_num(y_data):
    encoder = LabelEncoder()
    y_data = y_data.values.reshape(-1, 1)
    encoded_data = encoder.fit_transform(y_data)
    names = list(encoder.inverse_transform(np.unique(encoded_data)))
    print(names)
    print(np.unique(encoded_data))
    return encoded_data


def preprocessing(data):
    data = common_label_removal(data)
    y_data = y_split(data)
    y_data = char_2_num(y_data)
    return pd.Series(y_data)

train_y = preprocessing(train_df)
val_y = preprocessing(val_df)
test_y = preprocessing(test_df)

train_y.to_csv('train_y.csv', index=False)
val_y.to_csv('val_y.csv', index=False)
test_y.to_csv('test_y.csv', index=False)

['ARAGORN', 'FARAMIR', 'FRODO', 'GANDALF', 'GIMLI', 'GOLLUM', 'MERRY', 'PIPPIN', 'SAM', 'THEODEN']
[0 1 2 3 4 5 6 7 8 9]
['ARAGORN', 'FARAMIR', 'FRODO', 'GANDALF', 'GIMLI', 'GOLLUM', 'MERRY', 'PIPPIN', 'SAM', 'THEODEN']
[0 1 2 3 4 5 6 7 8 9]
['ARAGORN', 'FARAMIR', 'FRODO', 'GANDALF', 'GIMLI', 'GOLLUM', 'MERRY', 'PIPPIN', 'SAM', 'THEODEN']
[0 1 2 3 4 5 6 7 8 9]


## Word 2 Vector Dataset

This idea was ultimatrly not used but was still apart of our research process.

In [4]:
# nlp = spacy.load("en_core_web_sm")

# def find_max_length(data):
#     return max(max(len(nlp(dialogue)) for dialogue in set) for set in data)

# max_length = find_max_length([train_df['dialog'], val_df['dialog'], test_df['dialog']])

# def word2vec_seq(data, max_length):
#     # Extracting norm vector values
#     word_vectors = []
#     for dialogue in data['dialog']:
#         tokens = nlp(dialogue)
#         dialogue_vectors = [token.vector_norm for token in tokens]
#         word_vectors.append(dialogue_vectors)

#     # Padding 
#     for i in range(len(word_vectors)):
#         word_vectors[i] += [100] * (max_length - len(word_vectors[i]))

#     df = pd.DataFrame(word_vectors)
#     df.columns = [f"word_{i}" for i in range(1, max_length + 1)]

#     df = pd.concat([data, df], axis=1)
#     return df  

# train_B = word2vec_seq(train_df, max_length)
# val_B = word2vec_seq(val_df, max_length)
# test_B = word2vec_seq(test_df, max_length)

In [5]:
# train_B.to_csv('train_B.csv', index=False)
# val_B.to_csv('val_B.csv', index=False)
# test_B.to_csv('test_B.csv', index=False)

### Word Embedding Dataset

In [6]:
train_B = common_label_removal(train_df).reset_index(drop=True)
val_B = common_label_removal(val_df).reset_index(drop=True)
test_B = common_label_removal(test_df).reset_index(drop=True)

In [7]:
def quote_list(X):
    quote_list = []
    for quote in range(len(X)):
        splitted_quote =  X['dialog'][quote].split()
        sequence_list = []
        for split in range(len(splitted_quote)):
            splitted_word = splitted_quote[split]

            word_list = str()
            i=0
            while i < (len(splitted_word)):
                # print(splitted_word[i])|
                if splitted_word[i].isalpha() == True:
                    word_list += splitted_word[i]
                i+=1
            sequence_list.append(word_list)
        quote_list.append(sequence_list)
    return quote_list

B1 = quote_list(train_B)
B2 = quote_list(val_B)
B3 = quote_list(test_B)

def maxlen(X):
    uni = []
    for i in range(len(X)):
        for j in range(len(X[i])):
            if X[i][j] not in uni:
                uni.append(X[i][j])
    return len(uni)
max_length = maxlen(B1)

In [8]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(B1)
B1_seq = tokenizer.texts_to_sequences(B1)
B2_seq = tokenizer.texts_to_sequences(B2)
B3_seq = tokenizer.texts_to_sequences(B3)
maxlen = max([len(seq) for seq in B1_seq])

B1_padseq = pad_sequences(B1_seq, maxlen=max_length,padding='post')
B2_padseq = pad_sequences(B2_seq, maxlen=maxlen,padding='post')
B3_padseq = pad_sequences(B3_seq, maxlen=maxlen,padding='post')

In [9]:
train_B = pd.DataFrame(B1_padseq)
val_B = pd.DataFrame(B2_padseq)
test_B = pd.DataFrame(B3_padseq)

train_B.to_csv('train_B.csv', index=False)
val_B.to_csv('val_B.csv', index=False)
test_B.to_csv('test_B.csv', index=False)

## Bag of Words Dataset

In [10]:

stopwords_set = set(stopwords.words("english"))
stemmer = SnowballStemmer("english")

# Removing; punctuation, stopwords and stemming words
def bow_preprocess(df):
    preprocessed = []
    
    for dialog in df['dialog']:
        tokens = re.sub(r"[^a-zA-Z0-9]", " ", dialog).split()
        alfa = [stemmer.stem(word.lower()) for word in tokens if word not in stopwords_set]
        bravo = str()
        for i in range(len(alfa)):
            bravo += str(alfa[i])
            bravo += ' '
        preprocessed.append(bravo)

    df['dialog'] = preprocessed
    return df


train_C = bow_preprocess(train_df)
val_C = bow_preprocess(val_df)
test_C = bow_preprocess(test_df)

# Creating a set of unique words amongst all dialoges
all_dialogs = pd.concat([train_C['dialog'], val_C['dialog'], test_C['dialog']], ignore_index=True)
wordset = set([word for dialog in all_dialogs for word in dialog.split()])

# Creating dataframe
def BOW_df(wordset, df):

    bow_df = pd.DataFrame(columns=list(wordset))
    
    for i, dialog in enumerate(df['dialog']):
        dialog_tf_diz = {}
        for word in dialog.split():
            if word in wordset:
                if word in dialog_tf_diz:
                    dialog_tf_diz[word] += 1
                else:
                    dialog_tf_diz[word] = 1
        
        bow_df.loc[i] = [dialog_tf_diz.get(word, 0) for word in wordset]
    return pd.concat([df, bow_df], axis=1)

train_C = BOW_df(wordset, train_C)
val_C = BOW_df(wordset, val_C)
test_C = BOW_df(wordset, test_C)

In [11]:
train_C = common_label_removal(train_C).reset_index(drop=True).iloc[:,2:]
val_C = common_label_removal(val_C).reset_index(drop=True).iloc[:,2:]
test_C = common_label_removal(test_C).reset_index(drop=True).iloc[:,2:]

train_C.to_csv('train_C.csv', index=False)
val_C.to_csv('val_C.csv', index=False)
test_C.to_csv('test_C.csv', index=False)

## TF_IDF Dataset

In [12]:

def tf_idf__df(df):
    preprocessed_dialogs = bow_preprocess(df)
    tfidf_vectorizer = TfidfVectorizer(vocabulary=wordset)
    tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_dialogs['dialog'])
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=list(wordset))
    return pd.concat([df, tfidf_df], axis=1)

train_D = tf_idf__df(train_df)
val_D = tf_idf__df(val_df)
test_D = tf_idf__df(test_df)

In [13]:
train_D = common_label_removal(train_D).reset_index(drop=True).iloc[:,2:]
val_D = common_label_removal(val_D).reset_index(drop=True).iloc[:,2:]
test_D = common_label_removal(test_D).reset_index(drop=True).iloc[:,2:]

train_D.to_csv('train_D.csv', index=False)
val_D.to_csv('val_D.csv', index=False)
test_D.to_csv('test_D.csv', index=False)

In [14]:
train_D

Unnamed: 0,draught,broil,ra,flee,holiday,ril,say,youshouldn,town,start,...,outof,heathen,pike,speed,wonder,7,withdraw,gold,green,silenc
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1142,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1144,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1145,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
