In [1]:
import os

# Move to the project root (the directory that contains `src/`) so the `src.*` imports
# below resolve; this resolves
# "ImportError: attempted relative import with no known parent package".
# The guard keeps the cell idempotent: a bare os.chdir('..') climbs one more directory
# every time the cell is re-run, which then breaks the imports and the relative data paths.
# NOTE(review): assumes the notebook's own directory has no `src` entry -- confirm.
if not os.path.isdir('src'):
    os.chdir('..')

import pandas as pd
import spacy
from tqdm import tqdm
from src.processing.text_cleaning import normalize_text, process_contractions, remove_all_punctuation, remove_emojis, remove_html_unescape
from src.processing.text_processing import tokenize_comment

# Show full cell contents instead of truncating long text columns.
pd.set_option('display.max_colwidth', None)

In [2]:
# Load the comments CSV. `raw_data` keeps the frame exactly as read from disk,
# `data` is a copy of it, and `df` wraps that copy for the steps that follow.
RAW_COMMENTS_CSV = "data/raw/new_character_reveal_comments.csv"

raw_data = pd.read_csv(RAW_COMMENTS_CSV)
data = raw_data.copy()
df = pd.DataFrame(data)
df.head()

Unnamed: 0,videoId,authorDisplayName,publishedAt,updatedAt,likeCount,totalReplyCount,textDisplay
0,rDxrpSqYHD8,@silveriver9,2023-11-01 16:09:58+00:00,2023-11-01 16:10:43+00:00,4,4,First. Now where is LEI WULONG?!
1,rDxrpSqYHD8,@faizaanjaved7150,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,1,1,Already seen it. Ur getting less view&#39;s now bamco
2,rDxrpSqYHD8,@TS-rw4lk,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,0,0,wow
3,rDxrpSqYHD8,@ALONCAK,2023-11-01 16:10:06+00:00,2023-11-01 16:10:06+00:00,0,0,Oww yeaah
4,rDxrpSqYHD8,@Rough_Estimates,2023-11-01 16:10:06+00:00,2023-11-01 16:10:06+00:00,135,14,I hope we get an angel version of Jin


In [3]:
# Start the cleaning stage from a copy of `df`, leaving the loaded frame untouched.
df_cleaned = df.copy()

In [4]:
# clean the text
# The cleaned column is derived from the untouched `df` rather than by overwriting
# `df_cleaned` step by step, so re-running this cell always starts from the raw text
# and never re-cleans text that was already cleaned.
# The cleaning helpers run in the same order as before.
text_cleaned = (
    df['textDisplay']
    .apply(remove_html_unescape)
    .apply(remove_emojis)
    .apply(remove_all_punctuation)
    .apply(process_contractions)
    .apply(normalize_text)
)

# `assign` returns a new frame with `textDisplay` replaced in its original column position.
df_cleaned = df.assign(textDisplay=text_cleaned)

In [5]:
# load trained pipeline (disabled here: the spaCy pipeline is loaded in the "Lemmatize" section below)
# nlp = spacy.load("en_core_web_sm")

In [6]:
# Tokenize every cleaned comment and attach the result as a new `textTokenized` column.
# (To time this step, put `%%time` on the very first line of the cell.)
tokenized_comments = df_cleaned['textDisplay'].apply(tokenize_comment)
df_tokenized = df_cleaned.assign(textTokenized=tokenized_comments)
df_tokenized.head()

Unnamed: 0,videoId,authorDisplayName,publishedAt,updatedAt,likeCount,totalReplyCount,textDisplay,textTokenized
0,rDxrpSqYHD8,@silveriver9,2023-11-01 16:09:58+00:00,2023-11-01 16:10:43+00:00,4,4,first now where is lei wulong,"[first, now, where, is, lei, wulong]"
1,rDxrpSqYHD8,@faizaanjaved7150,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,1,1,already seen it you are getting less views now bamco,"[already, seen, it, you, are, getting, less, views, now, bamco]"
2,rDxrpSqYHD8,@TS-rw4lk,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,0,0,wow,[wow]
3,rDxrpSqYHD8,@ALONCAK,2023-11-01 16:10:06+00:00,2023-11-01 16:10:06+00:00,0,0,oww yeaah,"[oww, yeaah]"
4,rDxrpSqYHD8,@Rough_Estimates,2023-11-01 16:10:06+00:00,2023-11-01 16:10:06+00:00,135,14,i hope we get an angel version of jin,"[i, hope, we, get, an, angel, version, of, jin]"


# Lemmatize

In [9]:
# Load spaCy's "en_core_web_sm" pipeline into the global `nlp`, which `lemmatize_comment` uses.
nlp = spacy.load("en_core_web_sm")

In [10]:
# Copy the tokenized frame so the lemmatized column can be added without altering `df_tokenized`.
df_lemmatized = df_tokenized.copy()

In [13]:
def lemmatize_comment(text, nlp_pipeline=None):
    """Uses spaCy to lemmatize (return the base word of) every token in a text.

    Parameters
    ----------
    text : str
        A string of text.
    nlp_pipeline : callable, optional
        A loaded pipeline, e.g. ``spacy.load("en_core_web_sm")``. It is called with
        ``text`` and must return an iterable of tokens exposing ``lemma_``.
        When omitted, the module-level ``nlp`` object is used, so in that case
        ``nlp = spacy.load("en_core_web_sm")`` needs to be defined before the call.

    Returns
    -------
    lemmas : list
        The lemmatized versions (base words) of the input text, one entry per token.

    Raises
    ------
    NameError
        If ``nlp_pipeline`` is omitted and no global ``nlp`` has been defined.

    """
    if nlp_pipeline is None:
        # Fall back to the notebook-level pipeline so existing calls such as
        # ``series.apply(lemmatize_comment)`` keep working unchanged.
        try:
            nlp_pipeline = nlp
        except NameError:
            raise NameError(
                "lemmatize_comment needs a spaCy pipeline: pass nlp_pipeline=... or "
                'define nlp = spacy.load("en_core_web_sm") before calling it'
            ) from None

    doc = nlp_pipeline(text)
    lemmas = [token.lemma_ for token in doc]

    return lemmas

In [15]:
# Lemmatize each cleaned comment and store the resulting lemma lists in `textLemmatized`.
lemmatized_comments = df_lemmatized["textDisplay"].apply(lemmatize_comment)
df_lemmatized["textLemmatized"] = lemmatized_comments
df_lemmatized.head(20)

Unnamed: 0,videoId,authorDisplayName,publishedAt,updatedAt,likeCount,totalReplyCount,textDisplay,textTokenized,textLemmatized
0,rDxrpSqYHD8,@silveriver9,2023-11-01 16:09:58+00:00,2023-11-01 16:10:43+00:00,4,4,first now where is lei wulong,"[first, now, where, is, lei, wulong]","[first, now, where, be, lei, wulong]"
1,rDxrpSqYHD8,@faizaanjaved7150,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,1,1,already seen it you are getting less views now bamco,"[already, seen, it, you, are, getting, less, views, now, bamco]","[already, see, it, you, be, get, less, view, now, bamco]"
2,rDxrpSqYHD8,@TS-rw4lk,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,0,0,wow,[wow],[wow]
3,rDxrpSqYHD8,@ALONCAK,2023-11-01 16:10:06+00:00,2023-11-01 16:10:06+00:00,0,0,oww yeaah,"[oww, yeaah]","[oww, yeaah]"
4,rDxrpSqYHD8,@Rough_Estimates,2023-11-01 16:10:06+00:00,2023-11-01 16:10:06+00:00,135,14,i hope we get an angel version of jin,"[i, hope, we, get, an, angel, version, of, jin]","[I, hope, we, get, an, angel, version, of, jin]"
5,rDxrpSqYHD8,@kazamataurus337,2023-11-01 16:10:08+00:00,2023-11-01 16:10:08+00:00,1,0,so it begins,"[so, it, begins]","[so, it, begin]"
6,rDxrpSqYHD8,@XKAMIKAZEEX,2023-11-01 16:10:10+00:00,2023-11-01 16:10:10+00:00,0,0,let us go,"[let, us, go]","[let, we, go]"
7,rDxrpSqYHD8,@bluefacebaby4167,2023-11-01 16:10:10+00:00,2023-11-01 16:10:10+00:00,1,0,miguel waiting room right here,"[miguel, waiting, room, right, here]","[miguel, waiting, room, right, here]"
8,rDxrpSqYHD8,@subzerodark,2023-11-01 16:10:11+00:00,2023-11-01 16:10:11+00:00,1,0,wow,[wow],[wow]
9,rDxrpSqYHD8,@therealwnd1820,2023-11-01 16:10:13+00:00,2023-11-01 16:10:13+00:00,0,0,yo,[yo],[yo]


# Experiment with lemmatizing

In [16]:
# Load the pipeline again so this experiment section can be run on its own.
nlp = spacy.load("en_core_web_sm")

# Sample sentence with many inflected word forms, including non-standard ones
# such as "bestest" and "runned", to see what lemma spaCy assigns to each.
test_text = (
    "Here he and she are going for a running jumping jumpy jumped jumpety tidied tidiest "
    "tidy time timed timely in the bestest best weather today for a running great run ran runned "
    "outside in the sunny sunniest sunshine you have ever seen sung"
)
doc = nlp(test_text)

In [17]:
# Print the experiment sentence as plain text so it can be compared with its lemmas.
print(test_text)

Here he and she are going for a running jumping jumpy jumped jumpety tidied tidiest tidy time timed timely in the bestest best weather today for a running great run ran runned outside in the sunny sunniest sunshine you have ever seen sung


In [18]:
# Collect the lemma of every token in the experiment document, in order, and print the list.
lemmas = [tok.lemma_ for tok in doc]
print(lemmas)

['here', 'he', 'and', 'she', 'be', 'go', 'for', 'a', 'run', 'jump', 'jumpy', 'jump', 'jumpety', 'tidy', 'tidy', 'tidy', 'time', 'time', 'timely', 'in', 'the', 'best', 'good', 'weather', 'today', 'for', 'a', 'running', 'great', 'run', 'ran', 'runne', 'outside', 'in', 'the', 'sunny', 'sunny', 'sunshine', 'you', 'have', 'ever', 'see', 'sung']
