In [None]:
import nltk
import string
import re
import gensim
import inflect
import pandas as pd
from typing import Optional

from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer # another 2 kinds of Stemmer (Snowball and Lancaster)

from gensim.models import Word2Vec

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt to /home/leonardo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/leonardo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/leonardo/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /home/leonardo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/leonardo/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [13]:
df = pd.read_csv('dataset/imdb_text2vec.csv')
# regex
YEAR_RE = re.compile(r"\b((?:18|19|20)\d{2}|[1-9]0['’]?s)\b")         
RATING_RE = re.compile(r'\b\d+(\.\d+)?\s*/\s*10\b')
NUM_RE = re.compile(r'\b\d+(\.\d+)?\b')
SPACES_RE = re.compile(r'\s*([?!.,]+(?:\s+[?!.,]+)*)\s*') #ciao,bella -> ciao, bella
HTML_RE = re.compile(r"<(?:\"[^\"]*\"['\"]*|'[^']*'['\"]*|[^'\">])+>")
# other
p = inflect.engine()
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

df.columns

Index(['id', 'sentiment', 'review'], dtype='object')

In [None]:

def _convert_nums(text:str,
        mx_splt_num:int = 1) -> str:

    '''
    We can also convert the numbers into words.
    This can be done by using the inflect library.
    
    mx_num_split, we do not want conversions, like 100-> one hundred,
    so we set the maximum number of words
    '''

    # assign <RATING>
    text = re.sub(RATING_RE,'<RATING>',text)
    #assign <YEAR> for 80s,90's, 1990...
    text = re.sub(YEAR_RE,'<YEAR>',text)

    # convert numbers below a threshold to string
    temp_str = text.split()
    new_string = []

    for w in temp_str:

        n2w = p.number_to_words(w) # if not a number(e.g. dog) it returns 'zero'
        cond = (w.isdigit()) and (len(n2w.split())<=mx_splt_num)
        
        to_append = n2w if cond else w
        new_string.append(to_append)

    # Other Numbers become <NUM>
    text = re.sub(NUM_RE,'<NUM>',' '.join(new_string))

    return text


def _remove_simple(text:str) -> str:

    """Does simple cleaning of HTML, and no spaces after commas"""

    # remove html patterns
  
    text = re.sub(HTML_RE, '', text)

    # remove issues like: ciao,bella
    text = SPACES_RE.sub(lambda x: "{} ".format(x.group(1).replace(" ", "")), text)

    return text

def _remove_punct(text:str,
                  to_keep:Optional[str] = "") -> str:
    '''
    We remove punctuations so that we don't have different forms of the same word.

    '''
    punctuation = ''.join(ch for ch in string.punctuation if ch not in to_keep)
    translator = str.maketrans('', '', punctuation)

    return text.translate(translator)


def _remove_stopwords(text:str,
                      other_stopwords:list[str]|str = None,
                      to_keep:list[str]|str = None,
                      lst_output:bool=False) -> list[str]:

    '''
    Stopwords are words that do not contribute much to the meaning of a sentence
    hence they can be removed. The NLTK library has a set of 
    stopwords and we can use these to remove stopwords from our text.     
    '''
    
    stop_words = set(stopwords.words("english"))

    if other_stopwords: stop_words.update(set(other_stopwords))
    if to_keep: stop_words.discard(set(to_keep))

    word_tokens = word_tokenize(text)
    clean = [w for w in word_tokens if not w.lower() in stop_words]
    return clean if lst_output else ' '.join(clean)

    
def _stem_words(text:list[str]|str,
                lst_output:bool=True) -> list[str]:
    '''
    Stemming is the process of getting the root form of a word.
    Stem or root is the part to which affixes like -ed, -ize, -de, -s, etc are added.
    The stem of a word is created by removing the prefix or suffix of a word.
    
    '''
    word_tokens = text if isinstance(text,list[str]) else word_tokenize(text)
    stems = [stemmer.stem(word) for word in word_tokens]
    return stems if lst_output else ' '.join(stems)


def _lemma_words(text:str,
                 lst_output:bool=True) -> list[str]:
    '''
    Lemmatization is an NLP technique that reduces a word to its root form.
    This can be helpful for tasks such as text analysis and search as it
    allows us to compare words that are related but have different forms.
    
    '''
    lemmas = [lemmatizer.lemmatize(word) for word in word_tokenize(text)]
    return lemmas if lst_output else ' '.join(lemmas)

def _pos_tagging(text):
    '''
    POS tagging is the process of assigning each word in a sentence its grammatical category,
    such as noun, verb, adjective or adverb. It helps machines understand the structure 
    and meaning of text, enabling tasks like parsing, information extraction and text analysis.

    NNP: Proper noun
    NN: Noun (singular)
    VBZ: Verb (3rd person singular)
    CC: Conjunction


    '''
    word_tokens = word_tokenize(text)
    return pos_tag(word_tokens)

def preprocess_df(df:pd.DataFrame
                  ) -> pd.DataFrame:
    '''
    Perform cleaning on reviews:
    https://www.geeksforgeeks.org/machine-learning/text-preprocessing-in-python-set-1/

    '''
    # 0.  x.lower().split() -> lower case and remove extra spaces, 
    # remove html and ciao,bella->ciao, bella
    df['review'] = df['review'].apply(lambda x: _remove_simple(" ".join(x.lower().split())))

    # 1 remove the punctuation but / to identify ratings
    first_layer_cleaner = lambda x: _remove_punct(x,'/')
    df['first_l_cl'] = df['review'].apply(first_layer_cleaner)

    # 2 convert nums below 100 to string, or ratings like 7.5/10 -> <RATING>, other NUM
    second_layer_cleaner = lambda x: _convert_nums(x)
    df['second_l_cl'] = df['first_l_cl'].apply(second_layer_cleaner)

    # 3 remove the rest
    third_layer_cleaner = lambda x: _remove_punct(x)
    df['third_l_cl'] = df['second_l_cl'].apply(third_layer_cleaner)

    #4 Lemmas 
    df['lemmas'] = df['third_l_cl'].apply(
        lambda x: _lemma_words(x, lst_output=True)  
    )
    return df

    

In [None]:
# it takes a bit to process, but i m too lazy to optimize 

cleaned = preprocess_df(df)

cleaned[]

Unnamed: 0,id,sentiment,review,first_l_cl,second_l_cl,third_l_cl
0,5814_8,1,with all this stuff going down at the moment w...,with all this stuff going down at the moment w...,with all this stuff going down at the moment w...,with all this stuff going down at the moment w...
1,2381_9,1,"\""the classic war of the worlds\"" by timothy h...",the classic war of the worlds by timothy hines...,the classic war of the worlds by timothy hines...,the classic war of the worlds by timothy hines...
2,7759_3,0,the film starts with a manager (nicholas bell)...,the film starts with a manager nicholas bell g...,the film starts with a manager nicholas bell g...,the film starts with a manager nicholas bell g...
3,3630_4,0,it must be assumed that those who praised this...,it must be assumed that those who praised this...,it must be assumed that those who praised this...,it must be assumed that those who praised this...
4,9495_8,1,superbly trashy and wondrously unpretentious 8...,superbly trashy and wondrously unpretentious 8...,superbly trashy and wondrously unpretentious <...,superbly trashy and wondrously unpretentious Y...
5,8196_8,1,i dont know why people think this is such a ba...,i dont know why people think this is such a ba...,i dont know why people think this is such a ba...,i dont know why people think this is such a ba...
6,7166_2,0,"this movie could have been very good, but come...",this movie could have been very good but comes...,this movie could have been very good but comes...,this movie could have been very good but comes...
7,10633_1,0,i watched this video at a friend's house. i'm ...,i watched this video at a friends house im gla...,i watched this video at a friends house im gla...,i watched this video at a friends house im gla...
8,319_1,0,"a friend of mine bought this film for 1, and e...",a friend of mine bought this film for 1 and ev...,a friend of mine bought this film for one and ...,a friend of mine bought this film for one and ...
9,8713_10,1,"this movie is full of references. like \""mad m...",this movie is full of references like mad max ...,this movie is full of references like mad max ...,this movie is full of references like mad max ...
