In [2]:
import os
os.chdir('..') # this resolves ImportError: attempted relative import with no known parent package
import pandas as pd
import spacy
from tqdm import tqdm
from src.processing.text_cleaning import normalize_text, process_contractions, remove_all_punctuation, remove_emojis, remove_html_unescape 

pd.set_option('display.max_colwidth', None)

In [2]:
# import data from csv
raw_data = pd.read_csv("data/raw/new_character_reveal_comments.csv")
data = raw_data.copy()
df = pd.DataFrame(data)
df.head()

Unnamed: 0,videoId,authorDisplayName,publishedAt,updatedAt,likeCount,totalReplyCount,textDisplay
0,rDxrpSqYHD8,@silveriver9,2023-11-01 16:09:58+00:00,2023-11-01 16:10:43+00:00,4,4,First. Now where is LEI WULONG?!
1,rDxrpSqYHD8,@faizaanjaved7150,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,1,1,Already seen it. Ur getting less view&#39;s now bamco
2,rDxrpSqYHD8,@TS-rw4lk,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,0,0,wow
3,rDxrpSqYHD8,@ALONCAK,2023-11-01 16:10:06+00:00,2023-11-01 16:10:06+00:00,0,0,Oww yeaah
4,rDxrpSqYHD8,@Rough_Estimates,2023-11-01 16:10:06+00:00,2023-11-01 16:10:06+00:00,135,14,I hope we get an angel version of Jin


In [3]:
df_cleaned = df.copy()

In [4]:
# perform cleaning operations
df_cleaned['textDisplay'] = df_cleaned['textDisplay'].apply(remove_html_unescape)
df_cleaned['textDisplay'] = df_cleaned['textDisplay'].apply(remove_emojis)
df_cleaned['textDisplay'] = df_cleaned['textDisplay'].apply(remove_all_punctuation)
df_cleaned['textDisplay'] = df_cleaned['textDisplay'].apply(process_contractions)
df_cleaned['textDisplay'] = df_cleaned['textDisplay'].apply(normalize_text)

In [5]:
df_cleaned.tail()

Unnamed: 0,videoId,authorDisplayName,publishedAt,updatedAt,likeCount,totalReplyCount,textDisplay
2036,rDxrpSqYHD8,@muhammadrafaythaheem9731,2023-12-24 07:57:43+00:00,2023-12-24 07:57:43+00:00,0,0,maaaarveroooosssse
2037,rDxrpSqYHD8,@helikoptergezgini9728,2023-12-25 21:27:41+00:00,2023-12-25 21:27:41+00:00,0,0,what you call new is in the game for almost all tekken games are you kiddin me nothin new here where is eddy where are all the other great characters i m just watching a sinking game too bad after tekken 7 we got this its must be a joke
2038,rDxrpSqYHD8,@pureOwarrior,2023-12-26 19:34:16+00:00,2023-12-26 19:34:16+00:00,0,0,wished if this was lee actual rage art
2039,rDxrpSqYHD8,@369dakuza,2023-12-27 20:24:12+00:00,2023-12-27 20:24:12+00:00,0,0,marduk armor king common my mains are not in my main game like wtf i am not going to buy until ltheir dlc come out very disappointed
2040,rDxrpSqYHD8,@backtobaking4054,2023-12-28 01:27:40+00:00,2023-12-28 01:27:40+00:00,0,0,legends


# Tokenize

In [None]:
# load trained pipeline
nlp = spacy.load("en_core_web_sm")
doc = [nlp("Let us tokenize a bunch of words this is example 1"), nlp("sentence 2 for tokenizing")]
for value in doc:
    text_tokenized = []
    for token in value:
        tokenized = token.text
        text_tokenized.append(tokenized)
        
    print(text_tokenized)

In [None]:
# I'll just be passing a single string so don't need the double for loop
nlp = spacy.load("en_core_web_sm")
doc = nlp("Let us tokenize a bunch of words this is example 1")

tokenized_text = []

for token in doc:
    tokenized = token.text
    tokenized_text.append(tokenized)
    
print(tokenized_text)

In [37]:
df_tokenized = df_cleaned.copy()

In [None]:
def tokenize_comment(text):
    """Uses spaCy to tokenize the string passed as input.

    NOTE: "nlp = spacy.load("en_core_web_sm")" needs to be defined outside of the function.

    Parameters
    ----------
    text : str
        A string of text.

    Returns
    -------
    text_tokenized : list
        The input text as a list of tokens.
    
    """
    doc = nlp(text)
    tokenized_text = [token.text for token in doc]
    
    return tokenized_text

In [38]:
%%time
nlp = spacy.load("en_core_web_sm")
df_tokenized["textTokenized"] = df_tokenized['textDisplay'].apply(tokenize_comment)
df_tokenized.head()

CPU times: user 5.66 s, sys: 62.2 ms, total: 5.72 s
Wall time: 5.74 s


Unnamed: 0,videoId,authorDisplayName,publishedAt,updatedAt,likeCount,totalReplyCount,textDisplay,textTokenized
0,rDxrpSqYHD8,@silveriver9,2023-11-01 16:09:58+00:00,2023-11-01 16:10:43+00:00,4,4,first now where is lei wulong,"[first, now, where, is, lei, wulong]"
1,rDxrpSqYHD8,@faizaanjaved7150,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,1,1,already seen it you are getting less views now bamco,"[already, seen, it, you, are, getting, less, views, now, bamco]"
2,rDxrpSqYHD8,@TS-rw4lk,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,0,0,wow,[wow]
3,rDxrpSqYHD8,@ALONCAK,2023-11-01 16:10:06+00:00,2023-11-01 16:10:06+00:00,0,0,oww yeaah,"[oww, yeaah]"
4,rDxrpSqYHD8,@Rough_Estimates,2023-11-01 16:10:06+00:00,2023-11-01 16:10:06+00:00,135,14,i hope we get an angel version of jin,"[i, hope, we, get, an, angel, version, of, jin]"


In [36]:
# now superseded using list comprehension
def tokenize_comment_old(text):
    """Uses spaCy to tokenize the string passed as input.

    NOTE: "nlp = spacy.load("en_core_web_sm")" needs to be defined outside of the function.

    Parameters
    ----------
    text : str
        A string of text.

    Returns
    -------
    text_tokenized : list
        The input text as a list of tokens.
    
    """
    doc = nlp(text)
    
    tokenized_text = []
    
    for index, token in enumerate(doc):
        tokenized = token.text
        tokenized_text.append(tokenized)
        
    return tokenized_text
    