In [1]:
import os
os.chdir('..') # this resolves ImportError: attempted relative import with no known parent package
import pandas as pd
import numpy as np
import spacy
from tqdm import tqdm
from src.processing.text_cleaning import normalize_text, process_contractions, remove_all_punctuation, remove_emojis, remove_html_unescape, remove_digits
from src.processing.text_processing import tokenize_comment, lemmatize_comment, remove_stop_words

pd.set_option('display.max_colwidth', None)

In [2]:
# import data from csv
raw_data = pd.read_csv("data/raw/new_character_reveal_comments.csv")
data = raw_data.copy()
df = pd.DataFrame(data)
df.head()

Unnamed: 0,videoId,authorDisplayName,publishedAt,updatedAt,likeCount,totalReplyCount,textDisplay
0,rDxrpSqYHD8,@silveriver9,2023-11-01 16:09:58+00:00,2023-11-01 16:10:43+00:00,4,4,First. Now where is LEI WULONG?!
1,rDxrpSqYHD8,@faizaanjaved7150,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,1,1,Already seen it. Ur getting less view&#39;s now bamco
2,rDxrpSqYHD8,@TS-rw4lk,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,0,0,wow
3,rDxrpSqYHD8,@ALONCAK,2023-11-01 16:10:06+00:00,2023-11-01 16:10:06+00:00,0,0,Oww yeaah
4,rDxrpSqYHD8,@Rough_Estimates,2023-11-01 16:10:06+00:00,2023-11-01 16:10:06+00:00,135,14,I hope we get an angel version of Jin


### Clean comments

In [3]:
nlp = spacy.load("en_core_web_sm")

In [4]:
df_cleaned = df.copy()
# clean the text
df_cleaned['textDisplay'] = df_cleaned['textDisplay'].apply(remove_html_unescape)
df_cleaned['textDisplay'] = df_cleaned['textDisplay'].apply(remove_emojis)
df_cleaned['textDisplay'] = df_cleaned['textDisplay'].apply(remove_all_punctuation)
df_cleaned['textDisplay'] = df_cleaned['textDisplay'].apply(process_contractions)
df_cleaned['textDisplay'] = df_cleaned['textDisplay'].apply(normalize_text)
df_cleaned['textDisplay'] = df_cleaned['textDisplay'].apply(remove_digits)

df.head()

Unnamed: 0,videoId,authorDisplayName,publishedAt,updatedAt,likeCount,totalReplyCount,textDisplay
0,rDxrpSqYHD8,@silveriver9,2023-11-01 16:09:58+00:00,2023-11-01 16:10:43+00:00,4,4,First. Now where is LEI WULONG?!
1,rDxrpSqYHD8,@faizaanjaved7150,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,1,1,Already seen it. Ur getting less view&#39;s now bamco
2,rDxrpSqYHD8,@TS-rw4lk,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,0,0,wow
3,rDxrpSqYHD8,@ALONCAK,2023-11-01 16:10:06+00:00,2023-11-01 16:10:06+00:00,0,0,Oww yeaah
4,rDxrpSqYHD8,@Rough_Estimates,2023-11-01 16:10:06+00:00,2023-11-01 16:10:06+00:00,135,14,I hope we get an angel version of Jin


### Process comments

In [5]:
# remove stop words, tokenize, and lemmatize the text
df_processed = df_cleaned.copy()
df_processed["textStopWordsRemoved"] = df_processed["textDisplay"].apply(remove_stop_words)
df_processed["textTokenized"] = df_processed['textStopWordsRemoved'].apply(tokenize_comment)
df_processed["textLemmatized"] = df_processed["textStopWordsRemoved"].apply(lemmatize_comment)
df_processed.head()

Unnamed: 0,videoId,authorDisplayName,publishedAt,updatedAt,likeCount,totalReplyCount,textDisplay,textStopWordsRemoved,textTokenized,textLemmatized
0,rDxrpSqYHD8,@silveriver9,2023-11-01 16:09:58+00:00,2023-11-01 16:10:43+00:00,4,4,first now where is lei wulong,lei wulong,"[lei, wulong]","[lei, wulong]"
1,rDxrpSqYHD8,@faizaanjaved7150,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,1,1,already seen it you are getting less views now bamco,seen getting views bamco,"[seen, getting, views, bamco]","[see, get, view, bamco]"
2,rDxrpSqYHD8,@TS-rw4lk,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,0,0,wow,wow,[wow],[wow]
3,rDxrpSqYHD8,@ALONCAK,2023-11-01 16:10:06+00:00,2023-11-01 16:10:06+00:00,0,0,oww yeaah,oww yeaah,"[oww, yeaah]","[oww, yeaah]"
4,rDxrpSqYHD8,@Rough_Estimates,2023-11-01 16:10:06+00:00,2023-11-01 16:10:06+00:00,135,14,i hope we get an angel version of jin,hope angel version jin,"[hope, angel, version, jin]","[hope, angel, version, jin]"


# remove character names

In [33]:
df_processed.head()

Unnamed: 0,videoId,authorDisplayName,publishedAt,updatedAt,likeCount,totalReplyCount,textDisplay,textStopWordsRemoved,textTokenized
0,rDxrpSqYHD8,@silveriver9,2023-11-01 16:09:58+00:00,2023-11-01 16:10:43+00:00,4,4,first now where is lei wulong,lei wulong,"[lei, wulong]"
1,rDxrpSqYHD8,@faizaanjaved7150,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,1,1,already seen it you are getting less views now bamco,seen getting views bamco,"[seen, getting, views, bamco]"
2,rDxrpSqYHD8,@TS-rw4lk,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,0,0,wow,wow,[wow]
3,rDxrpSqYHD8,@ALONCAK,2023-11-01 16:10:06+00:00,2023-11-01 16:10:06+00:00,0,0,oww yeaah,oww yeaah,"[oww, yeaah]"
4,rDxrpSqYHD8,@Rough_Estimates,2023-11-01 16:10:06+00:00,2023-11-01 16:10:06+00:00,135,14,i hope we get an angel version of jin,hope angel version jin,"[hope, angel, version, jin]"


In [None]:
tekken_8_characters = [
"Alisa Bosconovich", "Asuka Kazama", "Azucena Ortiz (New)", "Bryan Fury", "Claudio Serafino", "Devil Jin",
"Feng Wei", "Hwoarang", "Jack-8", "Jin Kazama", "Jun Kazama", "Kazuya Mishima", "King", "Kuma", "Lars Alexandersson",
"Lee Chaolan", "Leo Kliesen", "Leroy Smith", "Lili De Rochefort", "Ling Xiaoyu", "Marshall Law", "Nina Williams",
"Panda", "Raven", "Reina (New)", "Sergei Dragunov", "Shaheen", "Steve Fox", "Victor Chevalier (New)", "Yoshimitsu",
"Zafina"]

In [48]:
all_tekken_characters = ['Alex', 'Alisa Bosconovich', 'Angel', 'Anna Williams', 'Armor King', 'Asuka Kazama', 
                         'Ayane', 'Azazel', 'Baek Doo San', 'Bruce Irvin', 'Bryan Fury', 'Christie Monteiro', 
                         'Claudio Serafino', 'Combot', 'Cyclops', 'Debug', 
                         'Devil Jin', 'Eddy Gordo', 'Eliza', 'Feng Wei', 'Forest Law', 'Ganryu', 'Gigas', 
                         'Gon', 'Heihachi Mishima', 'Hwoarang', 'Jack', 'Jack-7', 'Jack-8', 'Jin Kazama', 'Julia Chang', 
                         'Jun Kazama', 'Kazumi Mishima', 'King', 'Kuma', 'Kunimitsu', 'Doctor Bosconovitch', 
                         'Dragunov, Sergei', 'Eddy Gordo', 'Fahkumram', 'Geese Howard', 'Jinpachi Mishima', 'Josie Rizal', 
                         'Katarina Alves', 'Lee Chaolan', 'Leo Kliesen', 'Lili De Rochefort', 'Ling Xiaoyu', 'Lucky Chloe', 'Leroy Smith', 
                         'Lidia Sobieska', 'Master Raven', 'Michelle Chang', 'Miguel Rojo', 'Mokujin', 'Nancy-MI847J', 
                         'Negan', 'Nina Williams', 'Noctis', 'Ogre', 'True Ogre', 
                         'Marshall Law', 'Panda', 'Paul Phoenix', 'Rachel', 'Roger Jr', "Sake", 
                         'Steve Fox', 'Tekken Force Soldier', 
                         "Violet", 'Wang Jinrei', 'Yoshimitsu', 'Zafina', 'Azucena Ortiz', 'Reina', 'Victor Chevalier']

In [49]:
all_tekken_characters = sorted(all_tekken_characters)
print(all_tekken_characters)

['Alex', 'Alisa Bosconovich', 'Angel', 'Anna Williams', 'Armor King', 'Asuka Kazama', 'Ayane', 'Azazel', 'Azucena Ortiz', 'Baek Doo San', 'Bruce Irvin', 'Bryan Fury', 'Christie Monteiro', 'Claudio Serafino', 'Combot', 'Cyclops', 'Debug', 'Devil Jin', 'Doctor Bosconovitch', 'Dragunov, Sergei', 'Eddy Gordo', 'Eddy Gordo', 'Eliza', 'Fahkumram', 'Feng Wei', 'Forest Law', 'Ganryu', 'Geese Howard', 'Gigas', 'Gon', 'Heihachi Mishima', 'Hwoarang', 'Jack', 'Jack-7', 'Jack-8', 'Jin Kazama', 'Jinpachi Mishima', 'Josie Rizal', 'Julia Chang', 'Jun Kazama', 'Katarina Alves', 'Kazumi Mishima', 'King', 'Kuma', 'Kunimitsu', 'Lee Chaolan', 'Leo Kliesen', 'Leroy Smith', 'Lidia Sobieska', 'Lili De Rochefort', 'Ling Xiaoyu', 'Lucky Chloe', 'Marshall Law', 'Master Raven', 'Michelle Chang', 'Miguel Rojo', 'Mokujin', 'Nancy-MI847J', 'Negan', 'Nina Williams', 'Noctis', 'Ogre', 'Panda', 'Paul Phoenix', 'Rachel', 'Reina', 'Roger Jr', 'Sake', 'Steve Fox', 'Tekken Force Soldier', 'True Ogre', 'Victor Chevalier', '

In [55]:
len(all_tekken_characters)

76

In [52]:
characters_split_names = []
for index, character in enumerate(all_tekken_characters):
    split_name = character.split(" ")
    characters_split_names.extend(split_name)

print(characters_split_names)

['Alex', 'Alisa', 'Bosconovich', 'Angel', 'Anna', 'Williams', 'Armor', 'King', 'Asuka', 'Kazama', 'Ayane', 'Azazel', 'Azucena', 'Ortiz', 'Baek', 'Doo', 'San', 'Bruce', 'Irvin', 'Bryan', 'Fury', 'Christie', 'Monteiro', 'Claudio', 'Serafino', 'Combot', 'Cyclops', 'Debug', 'Devil', 'Jin', 'Doctor', 'Bosconovitch', 'Dragunov,', 'Sergei', 'Eddy', 'Gordo', 'Eddy', 'Gordo', 'Eliza', 'Fahkumram', 'Feng', 'Wei', 'Forest', 'Law', 'Ganryu', 'Geese', 'Howard', 'Gigas', 'Gon', 'Heihachi', 'Mishima', 'Hwoarang', 'Jack', 'Jack-7', 'Jack-8', 'Jin', 'Kazama', 'Jinpachi', 'Mishima', 'Josie', 'Rizal', 'Julia', 'Chang', 'Jun', 'Kazama', 'Katarina', 'Alves', 'Kazumi', 'Mishima', 'King', 'Kuma', 'Kunimitsu', 'Lee', 'Chaolan', 'Leo', 'Kliesen', 'Leroy', 'Smith', 'Lidia', 'Sobieska', 'Lili', 'De', 'Rochefort', 'Ling', 'Xiaoyu', 'Lucky', 'Chloe', 'Marshall', 'Law', 'Master', 'Raven', 'Michelle', 'Chang', 'Miguel', 'Rojo', 'Mokujin', 'Nancy-MI847J', 'Negan', 'Nina', 'Williams', 'Noctis', 'Ogre', 'Panda', 'Paul'

In [53]:
characters_lower = []
for i in characters_split_names:
    lower_cased_name = i.lower()
    characters_lower.append(lower_cased_name)
print(characters_lower)

['alex', 'alisa', 'bosconovich', 'angel', 'anna', 'williams', 'armor', 'king', 'asuka', 'kazama', 'ayane', 'azazel', 'azucena', 'ortiz', 'baek', 'doo', 'san', 'bruce', 'irvin', 'bryan', 'fury', 'christie', 'monteiro', 'claudio', 'serafino', 'combot', 'cyclops', 'debug', 'devil', 'jin', 'doctor', 'bosconovitch', 'dragunov,', 'sergei', 'eddy', 'gordo', 'eddy', 'gordo', 'eliza', 'fahkumram', 'feng', 'wei', 'forest', 'law', 'ganryu', 'geese', 'howard', 'gigas', 'gon', 'heihachi', 'mishima', 'hwoarang', 'jack', 'jack-7', 'jack-8', 'jin', 'kazama', 'jinpachi', 'mishima', 'josie', 'rizal', 'julia', 'chang', 'jun', 'kazama', 'katarina', 'alves', 'kazumi', 'mishima', 'king', 'kuma', 'kunimitsu', 'lee', 'chaolan', 'leo', 'kliesen', 'leroy', 'smith', 'lidia', 'sobieska', 'lili', 'de', 'rochefort', 'ling', 'xiaoyu', 'lucky', 'chloe', 'marshall', 'law', 'master', 'raven', 'michelle', 'chang', 'miguel', 'rojo', 'mokujin', 'nancy-mi847j', 'negan', 'nina', 'williams', 'noctis', 'ogre', 'panda', 'paul'

In [54]:
len(characters_lower)

124

In [37]:
def remove_tekken_character_names_from_tokens(tokens: list):
    """Removes Tekken character names from the comments.

    Parameters
    ----------
    df : pandas dataframe

    Returns
    -------
    df : pandas dataframe
        The input dataframe
    
    """
    tokens_without_character_names = []
    
    tekken_character_names = ['alex', 'alisa', 'bosconovich', 'angel', 'anna', 'williams', 'armor', 'king', 'asuka', 'kazama', 'ayane', 
                              'azazel', 'azucena', 'bob', 'Richard', 'ortiz', 'baek', 'doo', 'san', 'bruce', 'irvin', 'bryan', 'fury', 'christie', 
                              'monteiro', 'claudio', 'serafino', 'combot', 'cyclops', 'debug', 'devil', 'jin', 'doctor', 
                              'bosconovitch', 'dragunov,', 'sergei', 'eddy', 'gordo', 'eddy', 'gordo', 'eliza', 'fahkumram', 'feng', 'wei', 'forest', 
                              'law', 'ganryu', 'geese', 'howard', 'gigas', 'gon', 'heihachi', 'mishima', 'hwoarang', 'jack', 'jack-7', 
                              'jack-8', 'jin', 'kazama', 'jinpachi', 'pachi', 'mishima', 'josie', 'rizal', 'julia', 'chang', 'jun', 'kazama', 
                              'katarina', 'alves', 'kazumi', 'mishima', 'king', 'kuma', 'kunimitsu', 'lee', 'chaolan', 'leo','kliesen', 
                              'leroy', 'smith', 'lidia', 'sobieska', 'lili', 'de', 'rochefort', 'ling', 'xiaoyu', 'lucky', 'chloe', 
                              'marshall', 'law', 'master', 'raven', 'michelle', 'chang', 'miguel', 'rojo', 'mokujin', 'nancy-mi847j', 
                              'negan', 'nina', 'williams', 'noctis', 'ogre', 'panda', 'paul', 'phoenix', 'rachel', 'reina', 'roger', 'jr', 
                              'sake', 'steve', 'fox', 'tekken', 'force', 'soldier', 'true', 'ogre', 'trueogre', 'victor', 'chevalier', 'violet', 
                              'wang', 'jinrei', 'yoshimitsu', "yoshi", 'zafina', 'lei', 'wulong', 'craig', 'marduk']

    
    filtered_tokens = [word for word in tokens if word not in tekken_character_names]
    
    tokens_without_character_names.extend(filtered_tokens)

    return filtered_tokens

In [41]:
df_processed["textTekkenCharactersRemoved"] = df_processed["textLemmatized"].apply(remove_character_names_from_tokens)
df_processed.head()

Unnamed: 0,videoId,authorDisplayName,publishedAt,updatedAt,likeCount,totalReplyCount,textDisplay,textStopWordsRemoved,textTokenized,textLemmatized,textTekkenCharactersRemoved
0,rDxrpSqYHD8,@silveriver9,2023-11-01 16:09:58+00:00,2023-11-01 16:10:43+00:00,4,4,first now where is lei wulong,lei wulong,"[lei, wulong]","[lei, wulong]",[]
1,rDxrpSqYHD8,@faizaanjaved7150,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,1,1,already seen it you are getting less views now bamco,seen getting views bamco,"[seen, getting, views, bamco]","[see, get, view, bamco]","[see, get, view, bamco]"
2,rDxrpSqYHD8,@TS-rw4lk,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,0,0,wow,wow,[wow],[wow],[wow]
3,rDxrpSqYHD8,@ALONCAK,2023-11-01 16:10:06+00:00,2023-11-01 16:10:06+00:00,0,0,oww yeaah,oww yeaah,"[oww, yeaah]","[oww, yeaah]","[oww, yeaah]"
4,rDxrpSqYHD8,@Rough_Estimates,2023-11-01 16:10:06+00:00,2023-11-01 16:10:06+00:00,135,14,i hope we get an angel version of jin,hope angel version jin,"[hope, angel, version, jin]","[hope, angel, version, jin]","[hope, version]"


# Parts of Speech - 
### tagging, dependency, shape, is_alpha, is_stop

In [6]:
def part_of_speech(text):
    """Uses spaCy to return the simple (universal) Part of Speech tag (noun, adjective, verb etc.) for a given text input.

    NOTE: "nlp = spacy.load("en_core_web_sm")" needs to be defined outside of the function.

    Parameters
    ----------
    text : str
        A string of text.

    Returns
    -------
    pos : list
        The simple part of speech for each token of the input text.
    
    """
    doc = nlp(text)
    pos = [token.pos_ for token in doc]
    
    return pos

In [7]:
def part_of_speech_tag(text):
    """Uses spaCy to return the detailed part-of-speech tag Part of Speech tag for a given text input.

    NOTE: "nlp = spacy.load("en_core_web_sm")" needs to be defined outside of the function.

    Parameters
    ----------
    text : str
        A string of text.

    Returns
    -------
    pos_tags : list
        The part of speech tags for each token of the input text.
    
    """
    doc = nlp(text)
    pos_tags = [token.tag_ for token in doc]
    
    return pos_tags

In [8]:
def part_of_speech_dependency(text):
    """Uses spaCy to return the syntactic dependency, i.e. the relation between tokens
    for a given text input.

    NOTE: "nlp = spacy.load("en_core_web_sm")" needs to be defined outside of the function.

    Parameters
    ----------
    text : str
        A string of text.

    Returns
    -------
    dep_tags : list
        The dependency tags for each token of the input text.
    
    """
    doc = nlp(text)
    dep_tags = [token.dep_ for token in doc]
    
    return dep_tags

In [9]:
def part_of_speech_shape(text):
    """Uses spaCy to return the word shape – capitalization, punctuation, digits.

    NOTE: "nlp = spacy.load("en_core_web_sm")" needs to be defined outside of the function.

    Parameters
    ----------
    text : str
        A string of text.

    Returns
    -------
    shape : list
        The shape of the token e.g., xxxxx for lower case (e.g., apple), Xxxxx for capital followed 
        by 4 lower case characters (e.g., Apple), X.X. for something like U.K.
    
    """
    doc = nlp(text)
    shape = [token.shape_ for token in doc]
    
    return shape

In [10]:
def part_of_speech_alpha(text):
    """Uses spaCy to return a boolean value indicating whether the token is an alphanumeric character.

    NOTE: "nlp = spacy.load("en_core_web_sm")" needs to be defined outside of the function.

    Parameters
    ----------
    text : str
        A string of text.

    Returns
    -------
    alpha : list
        The dependency tags for each token of the input text.
    
    """
    doc = nlp(text)
    alpha = [token.is_alpha for token in doc]
    
    return alpha

In [11]:
def part_of_speech_is_stop(text):
    """Uses spaCy to return a boolean value indicating if the token is a 'stop word'.

    NOTE: "nlp = spacy.load("en_core_web_sm")" needs to be defined outside of the function.

    Parameters
    ----------
    text : str
        A string of text.

    Returns
    -------
    stop_word : list
        A true or false value dependent on whether the word is or is not a stop word.
    
    """
    doc = nlp(text)
    stop_word = [token.is_stop for token in doc]
    
    return stop_word

In [22]:
def part_of_speech_entity_type(text):
    """Uses spaCy to return the entity type of each word.

    NOTE: "nlp = spacy.load("en_core_web_sm")" needs to be defined outside of the function.

    Parameters
    ----------
    text : str
        A string of text.

    Returns
    -------
    entity_type : list
        The entity type of the word e.g., 'DATE', 
    
    """
    doc = nlp(text)
    entity_type = [token.ent_type_ for token in doc]
    
    return entity_type

# Create dataframe with functions

In [12]:
(df_processed["pos"], 
 df_processed["posTag"],
 df_processed["posDependency"],
 df_processed["posShape"],
 df_processed["posAlpha"],
 df_processed["posStopWord"]) = (df_processed["textStopWordsRemoved"].apply(part_of_speech),
                                 df_processed["textStopWordsRemoved"].apply(part_of_speech_tag),
                                 df_processed["textStopWordsRemoved"].apply(part_of_speech_dependency),
                                 df_processed["textStopWordsRemoved"].apply(part_of_speech_shape),
                                 df_processed["textStopWordsRemoved"].apply(part_of_speech_alpha),
                                 df_processed["textStopWordsRemoved"].apply(part_of_speech_is_stop)
                                )
df_processed.head()

Unnamed: 0,videoId,authorDisplayName,publishedAt,updatedAt,likeCount,totalReplyCount,textDisplay,textStopWordsRemoved,textTokenized,textLemmatized,pos,posTag,posDependency,posShape,posAlpha,posStopWord
0,rDxrpSqYHD8,@silveriver9,2023-11-01 16:09:58+00:00,2023-11-01 16:10:43+00:00,4,4,first now where is lei wulong,lei wulong,"[lei, wulong]","[lei, wulong]","[PROPN, NOUN]","[NNP, NN]","[compound, ROOT]","[xxx, xxxx]","[True, True]","[False, False]"
1,rDxrpSqYHD8,@faizaanjaved7150,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,1,1,already seen it you are getting less views now bamco,seen getting views bamco,"[seen, getting, views, bamco]","[see, get, view, bamco]","[VERB, VERB, NOUN, NOUN]","[VBN, VBG, NNS, NNS]","[ROOT, xcomp, dobj, dobj]","[xxxx, xxxx, xxxx, xxxx]","[True, True, True, True]","[False, False, False, False]"
2,rDxrpSqYHD8,@TS-rw4lk,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,0,0,wow,wow,[wow],[wow],[INTJ],[UH],[ROOT],[xxx],[True],[False]
3,rDxrpSqYHD8,@ALONCAK,2023-11-01 16:10:06+00:00,2023-11-01 16:10:06+00:00,0,0,oww yeaah,oww yeaah,"[oww, yeaah]","[oww, yeaah]","[PROPN, PROPN]","[NNP, NNP]","[compound, ROOT]","[xxx, xxxx]","[True, True]","[False, False]"
4,rDxrpSqYHD8,@Rough_Estimates,2023-11-01 16:10:06+00:00,2023-11-01 16:10:06+00:00,135,14,i hope we get an angel version of jin,hope angel version jin,"[hope, angel, version, jin]","[hope, angel, version, jin]","[PROPN, PROPN, PROPN, PROPN]","[NNP, NNP, NNP, NNP]","[compound, compound, compound, ROOT]","[xxxx, xxxx, xxxx, xxx]","[True, True, True, True]","[False, False, False, False]"


In [30]:
df_processed.to_csv("data/processed/new_character_reveal_processed.csv",  index=False)

# Test functions

In [None]:
df_processed["pos"] = df_processed["textStopWordsRemoved"].apply(part_of_speech)
df_processed.head(3)

In [None]:
print(spacy.explain("PROPN"))
print(spacy.explain("INTJ"))

In [None]:
df_processed["posTag"] = df_processed["textStopWordsRemoved"].apply(part_of_speech_tag)
df_processed.head(3)

In [None]:
print(spacy.explain("NN"))
print(spacy.explain("NNP"))
print(spacy.explain("NNS"))
print(spacy.explain("UH"))
print(spacy.explain("VBG"))

In [None]:
df_processed["posDependency"] = df_processed["textStopWordsRemoved"].apply(part_of_speech_dependency)
df_processed.head(3)

In [None]:
print(spacy.explain("compound"))
print(spacy.explain("ROOT"))
print(spacy.explain("xcomp"))
print(spacy.explain("dobj"))

In [None]:
df_processed["posShape"] = df_processed["textStopWordsRemoved"].apply(part_of_speech_shape)
df_processed.head(3)

In [None]:
df_processed["posAlpha"] = df_processed["textStopWordsRemoved"].apply(part_of_speech_alpha)
df_processed.head(3)

In [None]:
df_processed["posStopWord"] = df_processed["textStopWordsRemoved"].apply(part_of_speech_is_stop)
df_processed.head(3)

In [23]:
df_processed["textEntityType"] = df_processed["textStopWordsRemoved"].apply(part_of_speech_entity_type)
df_processed.head()

Unnamed: 0,videoId,authorDisplayName,publishedAt,updatedAt,likeCount,totalReplyCount,textDisplay,textStopWordsRemoved,textTokenized,textLemmatized,pos,posTag,posDependency,posShape,posAlpha,posStopWord,textEntityType
0,rDxrpSqYHD8,@silveriver9,2023-11-01 16:09:58+00:00,2023-11-01 16:10:43+00:00,4,4,first now where is lei wulong,lei wulong,"[lei, wulong]","[lei, wulong]","[PROPN, NOUN]","[NNP, NN]","[compound, ROOT]","[xxx, xxxx]","[True, True]","[False, False]","[, ]"
1,rDxrpSqYHD8,@faizaanjaved7150,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,1,1,already seen it you are getting less views now bamco,seen getting views bamco,"[seen, getting, views, bamco]","[see, get, view, bamco]","[VERB, VERB, NOUN, NOUN]","[VBN, VBG, NNS, NNS]","[ROOT, xcomp, dobj, dobj]","[xxxx, xxxx, xxxx, xxxx]","[True, True, True, True]","[False, False, False, False]","[, , , ]"
2,rDxrpSqYHD8,@TS-rw4lk,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,0,0,wow,wow,[wow],[wow],[INTJ],[UH],[ROOT],[xxx],[True],[False],[]
3,rDxrpSqYHD8,@ALONCAK,2023-11-01 16:10:06+00:00,2023-11-01 16:10:06+00:00,0,0,oww yeaah,oww yeaah,"[oww, yeaah]","[oww, yeaah]","[PROPN, PROPN]","[NNP, NNP]","[compound, ROOT]","[xxx, xxxx]","[True, True]","[False, False]","[PERSON, PERSON]"
4,rDxrpSqYHD8,@Rough_Estimates,2023-11-01 16:10:06+00:00,2023-11-01 16:10:06+00:00,135,14,i hope we get an angel version of jin,hope angel version jin,"[hope, angel, version, jin]","[hope, angel, version, jin]","[PROPN, PROPN, PROPN, PROPN]","[NNP, NNP, NNP, NNP]","[compound, compound, compound, ROOT]","[xxxx, xxxx, xxxx, xxx]","[True, True, True, True]","[False, False, False, False]","[, , , ]"


Entity type doesn't seem to work very well.