In [1]:
import os
os.chdir('..') # this resolves ImportError: attempted relative import with no known parent package

# general DS packages
import pandas as pd
import numpy as np

# cleaning and pre-processing
from src.processing.text_cleaning import (normalize_text, process_contractions, remove_all_punctuation, remove_emojis, 
remove_html_unescape, remove_href_pattern, remove_digits, remove_extra_whitespace, remove_website_links)
from src.processing.text_processing import (tokenize_comment, lemmatize_comment, remove_stop_words, remove_tekken_character_names_from_tokens, 
part_of_speech, part_of_speech_tag, part_of_speech_dependency, part_of_speech_shape, part_of_speech_alpha, part_of_speech_is_stop)

In [2]:
# import data from csv
raw_data = pd.read_csv("data/raw/new_character_reveal_comments.csv", )
data = raw_data.copy()
df = pd.DataFrame(data)
df.head()

Unnamed: 0,videoId,authorDisplayName,publishedAt,updatedAt,likeCount,totalReplyCount,textDisplay
0,rDxrpSqYHD8,@silveriver9,2023-11-01 16:09:58+00:00,2023-11-01 16:10:43+00:00,4,4,First. Now where is LEI WULONG?!
1,rDxrpSqYHD8,@faizaanjaved7150,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,1,1,Already seen it. Ur getting less view&#39;s no...
2,rDxrpSqYHD8,@TS-rw4lk,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,0,0,wow
3,rDxrpSqYHD8,@ALONCAK,2023-11-01 16:10:06+00:00,2023-11-01 16:10:06+00:00,0,0,Oww yeaah
4,rDxrpSqYHD8,@Rough_Estimates,2023-11-01 16:10:06+00:00,2023-11-01 16:10:06+00:00,135,14,I hope we get an angel version of Jin


In [3]:
%%time

# clean
df['textDisplay'] = df['textDisplay'].apply(normalize_text)
df['textDisplay'] = df['textDisplay'].apply(process_contractions)
df['textDisplay'] = df['textDisplay'].apply(remove_website_links)
df['textDisplay'] = df['textDisplay'].apply(remove_html_unescape)
df['textDisplay'] = df['textDisplay'].apply(remove_emojis)
df['textDisplay'] = df['textDisplay'].apply(remove_digits)
df['textDisplay'] = df['textDisplay'].apply(remove_all_punctuation)
df['textDisplay'] = df['textDisplay'].apply(remove_href_pattern)
df['textDisplay'] = df['textDisplay'].apply(remove_extra_whitespace)

# process
df["textStopWordsRemoved"] = df["textDisplay"].apply(remove_stop_words)
df["textTokenized"] = df['textStopWordsRemoved'].apply(tokenize_comment)
df["textLemmatized"] = df["textStopWordsRemoved"].apply(lemmatize_comment)
df["textTekkenCharactersRemoved"] = df["textLemmatized"].apply(remove_tekken_character_names_from_tokens)


# part of speech operations
(df["pos"], 
 df["posTag"],
 df["posDependency"],
 df["posShape"],
 df["posAlpha"],
 df["posStopWord"]) = (df["textStopWordsRemoved"].apply(part_of_speech),
                                 df["textStopWordsRemoved"].apply(part_of_speech_tag),
                                 df["textStopWordsRemoved"].apply(part_of_speech_dependency),
                                 df["textStopWordsRemoved"].apply(part_of_speech_shape),
                                 df["textStopWordsRemoved"].apply(part_of_speech_alpha),
                                 df["textStopWordsRemoved"].apply(part_of_speech_is_stop)
                                )
df.head()

CPU times: user 38.5 s, sys: 43.2 ms, total: 38.5 s
Wall time: 38.7 s


Unnamed: 0,videoId,authorDisplayName,publishedAt,updatedAt,likeCount,totalReplyCount,textDisplay,textStopWordsRemoved,textTokenized,textLemmatized,textTekkenCharactersRemoved,pos,posTag,posDependency,posShape,posAlpha,posStopWord
0,rDxrpSqYHD8,@silveriver9,2023-11-01 16:09:58+00:00,2023-11-01 16:10:43+00:00,4,4,first now where is lei wulong,lei wulong,"[lei, wulong]","[lei, wulong]",[],"[PROPN, NOUN]","[NNP, NN]","[compound, ROOT]","[xxx, xxxx]","[True, True]","[False, False]"
1,rDxrpSqYHD8,@faizaanjaved7150,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,1,1,already seen it you are getting less views now...,seen getting views bamco,"[seen, getting, views, bamco]","[see, get, view, bamco]","[see, get, view, bamco]","[VERB, VERB, NOUN, NOUN]","[VBN, VBG, NNS, NNS]","[ROOT, xcomp, dobj, dobj]","[xxxx, xxxx, xxxx, xxxx]","[True, True, True, True]","[False, False, False, False]"
2,rDxrpSqYHD8,@TS-rw4lk,2023-11-01 16:10:05+00:00,2023-11-01 16:10:05+00:00,0,0,wow,wow,[wow],[wow],[wow],[INTJ],[UH],[ROOT],[xxx],[True],[False]
3,rDxrpSqYHD8,@ALONCAK,2023-11-01 16:10:06+00:00,2023-11-01 16:10:06+00:00,0,0,oww yeaah,oww yeaah,"[oww, yeaah]","[oww, yeaah]","[oww, yeaah]","[PROPN, PROPN]","[NNP, NNP]","[compound, ROOT]","[xxx, xxxx]","[True, True]","[False, False]"
4,rDxrpSqYHD8,@Rough_Estimates,2023-11-01 16:10:06+00:00,2023-11-01 16:10:06+00:00,135,14,i hope we get an angel version of jin,hope angel version jin,"[hope, angel, version, jin]","[hope, angel, version, jin]","[hope, version]","[PROPN, PROPN, PROPN, PROPN]","[NNP, NNP, NNP, NNP]","[compound, compound, compound, ROOT]","[xxxx, xxxx, xxxx, xxx]","[True, True, True, True]","[False, False, False, False]"


In [4]:
df.to_csv("data/processed/new_character_comments_processed_2.csv", index=False)

## Check if any of the tokens are single or only two letter words

In [6]:
df_less_than = df.copy()

In [7]:
less_than_3 = []
for i, v in enumerate(df_less_than["textTekkenCharactersRemoved"]):
    for token in v:
        if len(token) <=2:
            less_than_3.append(token)
print(less_than_3)

['yo', 'm', 'ps', 'st', 'go', 'vs', 'go', 'go', 'ok', 'go', 's', 'm', 'm', 'go', 'go', 'st', 'm', 's', 'dr', 'b', 'ak', 'm', 'd', 'os', 'tf', 'oh', 's', 'oh', 's', 's', 'd', 'd', 'go', 'fg', 'si', 'su', 'en', 'va', 'm', 'ii', 's', 'ps', 'b', 'f', 'xd', 'nt', 'dj', 'w', 's', 'oh', 's', 'ah', 'm', 'go', 's', 'iq', 'ak', 'br', 'm', 'nt', 's', 's', 'm', 'b', 'go', 'dr', 's', 'm', 'yo', 'sf', 'mk', 'ed', 's', 'l', 't', 'br', 'la', 'ii', 's', 's', 'va', 'go', 'm', 'ay', 'go', 'oh', 'go', 'go', 'mk', 's', '월에', '않네', 'm', 'go', 's', 've', 'fe', 'bb', 'yo', 'go', 'go', 's', 'tt', 'm', '_', 's', 'go', 'hi', '아', '뭐야', 'go', 'oh', 's', 'bb', 'ub', 's', 'm', 've', 's', 's', 't', 'ps', 'm', 't', 't', 's', 's', 'm', 'go', 'nt', 'm', 's', 't', 't', 'go', 'tl', 't', 'm', 'go', 'm', 'nt', 'nt', 'd', '와아', 'wu', 'm', 's', 'tf', 'из', 'p', 'go', 'qq', 'go', 'а', 'не', 'и', 'с', 'd', 'm', 'tt', 'go', 'go', 't', 'go', 'go', 'go', 'm', 's', 'sf', 'sc', 's', 'nt', 'sf', 's', 'm', 'ps', 'ps', 'og', 'oh', 'm'

There are loads of meaningless tokens that will clog up the model....remove them...

In [8]:
df_less_than["textTekkenCharactersRemoved"].iloc[5]

['begin']

In [9]:
for i, v in enumerate(df_less_than["textTekkenCharactersRemoved"]):
    more_than_2 = [token for token in v if len(token) >2]
    df_less_than["textTekkenCharactersRemoved"].iloc[i] = more_than_2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_less_than["textTekkenCharactersRemoved"].iloc[i] = more_than_2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_less_than["textTekkenCharactersRemoved"].iloc[i] = more_than_2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_less_than["textTekkenCharactersRemoved"].iloc[i] = more_than_2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ret

In [10]:
less_than_3 = []
for i, v in enumerate(df_less_than["textTekkenCharactersRemoved"]):
    for token in v:
        if len(token) <=2:
            less_than_3.append(token)
print(less_than_3)

[]


The above setting with copy warning should be eliminated when the code is functionalised and used with .apply()

In [11]:
def remove_tiny_tokens(tokens):
    """Removes tokens with less than 3 characters e.g., "fd", "a" etc.

    Parameters
    ----------
    tokens : list
        The list of tokens.

    Returns
    -------
    tokens : list
        The input list without tokens containing <=2 characters.
    
    """
    return [token for token in tokens if len(token) >2]
    

# Test function

In [13]:
df_test = df.copy()

In [15]:
less_than_3 = []
for i, v in enumerate(df_test["textTekkenCharactersRemoved"]):
    for token in v:
        if len(token) <=2:
            less_than_3.append(token)
print(less_than_3)

['yo', 'm', 'ps', 'st', 'go', 'vs', 'go', 'go', 'ok', 'go', 's', 'm', 'm', 'go', 'go', 'st', 'm', 's', 'dr', 'b', 'ak', 'm', 'd', 'os', 'tf', 'oh', 's', 'oh', 's', 's', 'd', 'd', 'go', 'fg', 'si', 'su', 'en', 'va', 'm', 'ii', 's', 'ps', 'b', 'f', 'xd', 'nt', 'dj', 'w', 's', 'oh', 's', 'ah', 'm', 'go', 's', 'iq', 'ak', 'br', 'm', 'nt', 's', 's', 'm', 'b', 'go', 'dr', 's', 'm', 'yo', 'sf', 'mk', 'ed', 's', 'l', 't', 'br', 'la', 'ii', 's', 's', 'va', 'go', 'm', 'ay', 'go', 'oh', 'go', 'go', 'mk', 's', '월에', '않네', 'm', 'go', 's', 've', 'fe', 'bb', 'yo', 'go', 'go', 's', 'tt', 'm', '_', 's', 'go', 'hi', '아', '뭐야', 'go', 'oh', 's', 'bb', 'ub', 's', 'm', 've', 's', 's', 't', 'ps', 'm', 't', 't', 's', 's', 'm', 'go', 'nt', 'm', 's', 't', 't', 'go', 'tl', 't', 'm', 'go', 'm', 'nt', 'nt', 'd', '와아', 'wu', 'm', 's', 'tf', 'из', 'p', 'go', 'qq', 'go', 'а', 'не', 'и', 'с', 'd', 'm', 'tt', 'go', 'go', 't', 'go', 'go', 'go', 'm', 's', 'sf', 'sc', 's', 'nt', 'sf', 's', 'm', 'ps', 'ps', 'og', 'oh', 'm'

In [16]:
df_test["textTekkenCharactersRemoved"] = df_test["textTekkenCharactersRemoved"].apply(remove_tiny_tokens)

In [17]:
less_than_3 = []
for i, v in enumerate(df_test["textTekkenCharactersRemoved"]):
    for token in v:
        if len(token) <=2:
            less_than_3.append(token)
print(less_than_3)

[]
