### Depedencies

In [197]:
import nltk
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import re
import contractions
import spacy
import string
from collections import Counter
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from spellchecker import SpellChecker
pd.options.mode.chained_assignment = None

nltk.download('wordnet', quiet=True)
nlp = spacy.load('en_core_web_sm')

# Preprocessing
- Tokenizing
- Lowercasing
- Stop Word Removal
- Frequent Words Removal
- Rare Words Removal
- Stemming
- Lemmatization
- Chat Words Conversion
- Spelling Corrections

References: 
- [Getting started with Text Preprocessing](https://www.kaggle.com/code/sudalairajkumar/getting-started-with-text-preprocessing)
- [NLP - Data Preprocessing and Cleaning](https://www.kaggle.com/code/colearninglounge/nlp-data-preprocessing-and-cleaning)

### Loading Dataset

In [198]:
complete_df = pd.read_csv(r"../../data/raw/top_150_fantasy_reviews.csv")
df = complete_df[["review"]]
df["review"] = df["review"].astype(str)
complete_df.head()

Unnamed: 0,review_id,anime_title,review_url,date,username,user_review_count,is_preliminary,episodes_watched,recommendation,rating,review,total_reactions,nice_count,love_it_count,funny_count,confusing_count,informative_count,well_written_count,creative_count
0,503754,Sousou no Frieren,https://myanimelist.net/reviews.php?id=503754,"Oct 13, 2023 8:38 AM",Czekaj,5,True,5/28,Recommended,10,"With lives so short, why do we even bother? To...",1341,280,829,43,58,5,124,2
1,519189,Sousou no Frieren,https://myanimelist.net/reviews.php?id=519189,"Mar 22, 2024 12:40 PM",chekkit,25,False,,Recommended,10,I feel so catered to.\n\r\nIt feels like an et...,1188,243,774,43,47,8,70,3
2,519472,Sousou no Frieren,https://myanimelist.net/reviews.php?id=519472,"Mar 24, 2024 2:03 AM",Trikkiez,3,False,,Not Recommended,4,Style-\r\nFrieren doesn't have its own unique ...,4111,612,100,1926,1318,28,116,11
3,512466,Sousou no Frieren,https://myanimelist.net/reviews.php?id=512466,"Jan 12, 2024 11:25 AM",ShabbaRico,12,True,18/28,Not Recommended,5,"TL;DR: 5/10, I don't recommend this for anyone...",915,180,27,395,261,9,41,2
4,503760,Sousou no Frieren,https://myanimelist.net/reviews.php?id=503760,"Oct 13, 2023 9:10 AM",TheRealist68,16,True,6/28,Mixed Feelings,9,"Through 3 episodes, Frieren appears to be a un...",949,410,60,31,312,10,122,4


### Lowercasing

In [199]:
df["review"] = df["review"].str.lower()
df.head()

Unnamed: 0,review
0,"with lives so short, why do we even bother? to..."
1,i feel so catered to.\n\r\nit feels like an et...
2,style-\r\nfrieren doesn't have its own unique ...
3,"tl;dr: 5/10, i don't recommend this for anyone..."
4,"through 3 episodes, frieren appears to be a un..."


### URLs and Emails Removal

In [200]:
def remove_urls(text):
    pattern = re.compile(r'http\S+|www\S+|\S+@\S+')
    return pattern.sub(r'', text)
df["review"] = df["review"].apply(remove_urls)
df.head()

Unnamed: 0,review
0,"with lives so short, why do we even bother? to..."
1,i feel so catered to.\n\r\nit feels like an et...
2,style-\r\nfrieren doesn't have its own unique ...
3,"tl;dr: 5/10, i don't recommend this for anyone..."
4,"through 3 episodes, frieren appears to be a un..."


### Mentions Removal

In [201]:
def remove_mentions(text):
    pattern = re.compile(r'@\w+')
    return pattern.sub(r'', text)
df["review"] = df["review"].apply(remove_mentions)
df.head()

Unnamed: 0,review
0,"with lives so short, why do we even bother? to..."
1,i feel so catered to.\n\r\nit feels like an et...
2,style-\r\nfrieren doesn't have its own unique ...
3,"tl;dr: 5/10, i don't recommend this for anyone..."
4,"through 3 episodes, frieren appears to be a un..."


### Non-Alphanumeric Removal

In [202]:
PUNCT_TO_REMOVE = string.punctuation.replace("'", "")  # remove apostrophes from punctuation
def remove_non_alphanumeric(text):
    pattern = re.compile(r"[^a-z'\s]")
    return pattern.sub(r'', text)

df['review'] = df['review'].apply(lambda text: remove_punctuation(text))
df.head()

Unnamed: 0,review
0,with lives so short why do we even bother to s...
1,i feel so catered to\n\r\nit feels like an ete...
2,style\r\nfrieren doesn't have its own unique s...
3,tldr 510 i don't recommend this for anyone tha...
4,through 3 episodes frieren appears to be a uni...


### Chat Word Conversion

In [203]:
chat_words_str = """
AFAIK=As Far As I Know
AFK=Away From Keyboard
ASAP=As Soon As Possible
ATK=At The Keyboard
ATM=At The Moment
A3=Anytime, Anywhere, Anyplace
BAK=Back At Keyboard
BBL=Be Back Later
BBS=Be Back Soon
BFN=Bye For Now
B4N=Bye For Now
BRB=Be Right Back
BRT=Be Right There
BTW=By The Way
B4=Before
B4N=Bye For Now
CU=See You
CUL8R=See You Later
CYA=See You
FAQ=Frequently Asked Questions
FC=Fingers Crossed
FWIW=For What It's Worth
FYI=For Your Information
GAL=Get A Life
GG=Good Game
GN=Good Night
GMTA=Great Minds Think Alike
GR8=Great!
G9=Genius
IC=I See
ICQ=I Seek you (also a chat program)
ILU=ILU: I Love You
IMHO=In My Honest/Humble Opinion
IMO=In My Opinion
IOW=In Other Words
IRL=In Real Life
KISS=Keep It Simple, Stupid
LDR=Long Distance Relationship
LMAO=Laugh My A.. Off
LOL=Laughing Out Loud
LTNS=Long Time No See
L8R=Later
MTE=My Thoughts Exactly
M8=Mate
NRN=No Reply Necessary
OIC=Oh I See
PITA=Pain In The A..
PRT=Party
PRW=Parents Are Watching
ROFL=Rolling On The Floor Laughing
ROFLOL=Rolling On The Floor Laughing Out Loud
ROTFLMAO=Rolling On The Floor Laughing My A.. Off
SK8=Skate
STATS=Your sex and age
ASL=Age, Sex, Location
THX=Thank You
TTFN=Ta-Ta For Now!
TTYL=Talk To You Later
U=You
U2=You Too
U4E=Yours For Ever
WB=Welcome Back
WTF=What The F...
WTG=Way To Go!
WUF=Where Are You From?
W8=Wait...
7K=Sick:-D Laugher
"""

In [204]:
chat_words_map_dict = {}
chat_words_list = []

for line in chat_words_str.split("\n"):
    if line != "":
        cw = line.split("=")[0]
        cw_expanded = line.split("=")[1]
        chat_words_list.append(cw)
        chat_words_map_dict[cw] = cw_expanded
chat_words_list = set(chat_words_list)

def chat_words_conversion(text):
    new_text = []
    for w in text.split():
        if w.upper() in chat_words_list:
            new_text.append(chat_words_map_dict[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)

df['review'] = df['review'].apply(lambda text: chat_words_conversion(text))
df["review"] = df["review"].str.lower()
df.head()

Unnamed: 0,review
0,with lives so short why do we even bother to s...
1,i feel so catered to it feels like an eternity...
2,style frieren doesn't have its own unique styl...
3,tldr 510 i don't recommend this for anyone tha...
4,through 3 episodes frieren appears to be a uni...


### Expand Contractions

In [205]:
df['review'] = df['review'].apply(lambda text: contractions.fix(text))
df.head()

Unnamed: 0,review
0,with lives so short why do we even bother to s...
1,i feel so catered to it feels like an eternity...
2,style frieren does not have its own unique sty...
3,tldr 510 i do not recommend this for anyone th...
4,through 3 episodes frieren appears to be a uni...


### Lemmatization

In [206]:
stop_words = set(stopwords.words("english"))
negation_words = {"no", "not", "nor", "never", "n't", "dont"}
stop_words = stop_words - negation_words
post_lemmatization_corrections = {
    "datum": "data",
    "cannot": "can_not",
    "dont": "do_not",
    "doesnt": "does_not",
    "wont": "will_not",
    "cant": "can_not",
    "isnt": "is_not",
    "wasnt": "was_not",
    "arent": "are_not"
}

def lemmatize_words(text):
    if not isinstance(text, str) or text.strip() == "":
        return ""

    doc = nlp(text)
    tokens = []
    skip_next = False

    for i, token in enumerate(doc):
        if skip_next:
            skip_next = False
            continue

        lemma = token.lemma_.lower()
        lemma = post_lemmatization_corrections.get(lemma, lemma)

        # Preserve negation + meaningful word (negation tagging)
        if lemma in negation_words and i + 1 < len(doc):
            next_token = doc[i + 1]
            if next_token.pos_ in {"ADJ", "VERB", "ADV", "NOUN"}:
                next_lemma = next_token.lemma_.lower()
                next_lemma = post_lemmatization_corrections.get(next_lemma, next_lemma)
                tokens.append(f"{lemma}_{next_lemma}")
                skip_next = True
                continue
            else:
                tokens.append(lemma)
        elif lemma not in stop_words and token.is_alpha and len(lemma) > 1:
            tokens.append(lemma)

    return " ".join(tokens)
    
df["review"] = df["review"].apply(lambda text: lemmatize_words(text))
df.head()

Unnamed: 0,review
0,life short even bother someone live thousand y...
1,feel catered feel like eternity since give phe...
2,style frieren not_have unique style way feel l...
3,tldr not_recommend anyone standard enjoy fanta...
4,episode frieren appear unique masterpiece stor...


### Saving Cleaned Data

In [None]:
final_df = df[['review']].rename(columns={'review': 'processed_review'})
final_df.to_csv(r"..\data\processed\reviews_cleaned.csv", index=False)

# Unused Preprocessing Steps

### Frequent Words Removal
Note: 
- If we use something like tfidf, this is automatically taken care of.
- Shown the process below however, not taken into account for steps to clean the data

In [208]:
cnt = Counter()
for review in df["review"]:
    for word in review.split():
        cnt[word] += 1
cnt.most_common(10)

[('character', 23326),
 ('one', 14208),
 ('anime', 14110),
 ('show', 13788),
 ('like', 13200),
 ('story', 13164),
 ('make', 11710),
 ('well', 10016),
 ('good', 9990),
 ('series', 9690)]

In [209]:
FREQWORDS = set([w for (w, wc) in cnt.most_common(10)])
def remove_freqwords(text):
    """custom function to remove the frequent words"""
    return " ".join([word for word in str(text).split() if word not in FREQWORDS])

df["review"] = df["review"].apply(lambda text: remove_freqwords(text))
df.head()

Unnamed: 0,review
0,life short even bother someone live thousand y...
1,feel catered feel eternity since give phenomen...
2,style frieren not_have unique style way feel e...
3,tldr not_recommend anyone standard enjoy fanta...
4,episode frieren appear unique masterpiece stor...


### Rare Words Removal
Note:
- Shown the process below however, not taken into account for steps to clean the data

In [210]:
n_rare_words = 10
cnt.most_common()[:-n_rare_words-1:-1]

[('not_jack', 2),
 ('reaccure', 2),
 ('not_episodic', 2),
 ('summaryoverall', 2),
 ('easternesque', 2),
 ('backstorydevelopment', 2),
 ('saveprotect', 2),
 ('notre', 2),
 ('shirō', 2),
 ('not_in', 2)]

In [211]:
RAREWORDS = set([w for (w, wc) in cnt.most_common()[:-n_rare_words-1:-1]])
def remove_rarewords(text):
    """custom function to remove the rare words"""
    return " ".join([word for word in str(text).split() if word not in RAREWORDS])

df["review"] = df["review"].apply(lambda text: remove_rarewords(text))
df.head()

Unnamed: 0,review
0,life short even bother someone live thousand y...
1,feel catered feel eternity since give phenomen...
2,style frieren not_have unique style way feel e...
3,tldr not_recommend anyone standard enjoy fanta...
4,episode frieren appear unique masterpiece stor...


### Stemming
Note:
- Shown the process below however, not taken into account for steps to clean the data

In [212]:
stemmer = PorterStemmer()
def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text.split()])

df["review"] = df["review"].apply(lambda text: stem_words(text))
df.head()

Unnamed: 0,review
0,life short even bother someon live thousand ye...
1,feel cater feel etern sinc give phenomen think...
2,style frieren not_hav uniqu style way feel eve...
3,tldr not_recommend anyon standard enjoy fantas...
4,episod frieren appear uniqu masterpiec storyte...


Some words such as `this` became `thi` and `why` became `whi` which is not intended. To get around this, Lemmatization are used in such cases