In [12]:
import nltk
import string
import re
import inflect
import pandas as pd
from typing import Optional

from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer # another 2 kinds of Stemmer (Snowball and Lancaster)

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt to /home/leonardo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/leonardo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/leonardo/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /home/leonardo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/leonardo/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [13]:
df = pd.read_csv('dataset/imdb_text2vec.csv')
p = inflect.engine()
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

df.columns

Index(['id', 'sentiment', 'review'], dtype='object')

In [14]:
def preprocess_df(df:pd.DataFrame
                  ) -> pd.DataFrame:
    '''
    Perform cleaning on reviews:
    https://www.geeksforgeeks.org/machine-learning/text-preprocessing-in-python-set-1/
    '''
    # stremline lower and removing blank spaces
    cleaner = lambda x: _remove_punct(_convert_nums(" ".join(x.lower().split())))

    df['clean'] = df['review'].apply(cleaner)

    return df


def _convert_nums(text:str) -> str:

    '''We can also convert the numbers into words.
    This can be done by using the inflect library.'''

    temp_str = text.split()
    new_string = []

    for w in temp_str:
        to_append = p.number_to_words(w) if w.isdigit() else w
        new_string.append(to_append)

    return ' '.join(new_string)


def _remove_punct(text:str
                  ) -> str:
    '''
    We remove punctuations so that we don't have different forms of the same word.
    For example if we don't remove the punctuation then been. been, been! will be treated separately.

    ADDED: fix spaces and html patterns
    '''
    # remove html patterns
    html_pattern = "<(?:\"[^\"]*\"['\"]*|'[^']*'['\"]*|[^'\">])+>"
    text = re.sub(html_pattern, '', text)

    # remove issues like: ciao,bella
    reg_rule = re.compile(r'\s*([?!.,]+(?:\s+[?!.,]+)*)\s*')
    text = reg_rule.sub(lambda x: "{} ".format(x.group(1).replace(" ", "")), text)

    translator = str.maketrans('', '', string.punctuation)

    return text.translate(translator)

def _remove_stopwords(text:str,
                      other_stopwords:list[str]|str = None,
                      to_keep:list[str]|str = None,
                      lst_output:bool=False) -> list[str]:

    '''
    Stopwords are words that do not contribute much to the meaning of a sentence
    hence they can be removed. The NLTK library has a set of 
    stopwords and we can use these to remove stopwords from our text.     
    '''
    
    stop_words = set(stopwords.words("english"))

    if other_stopwords: stop_words.update(set(other_stopwords))
    if to_keep: stop_words.discard(set(to_keep))

    word_tokens = word_tokenize(text)
    clean = [w for w in word_tokens if not w.lower() in stop_words]
    return clean if lst_output else ' '.join(clean)

    
def _stem_words(text:list[str]|str,
                lst_output:bool=True) -> list[str]:
    '''
    Stemming is the process of getting the root form of a word.
    Stem or root is the part to which affixes like -ed, -ize, -de, -s, etc are added.
    The stem of a word is created by removing the prefix or suffix of a word.
    
    '''
    word_tokens = text if isinstance(text,list[str]) else word_tokenize(text)
    stems = [stemmer.stem(word) for word in word_tokens]
    return stems if lst_output else ' '.join(stems)


def _lemma_words(text:str,
                 lst_output:bool=True) -> list[str]:
    '''
    Lemmatization is an NLP technique that reduces a word to its root form.
    This can be helpful for tasks such as text analysis and search as it
    allows us to compare words that are related but have different forms.
    
    '''
    lemmas = [lemmatizer.lemmatize(word) for word in word_tokenize(text)]
    return lemmas if lst_output else ' '.join(lemmas)

def _pos_tagging(text):
    '''
    POS tagging is the process of assigning each word in a sentence its grammatical category,
    such as noun, verb, adjective or adverb. It helps machines understand the structure 
    and meaning of text, enabling tasks like parsing, information extraction and text analysis.

    NNP: Proper noun
    NN: Noun (singular)
    VBZ: Verb (3rd person singular)
    CC: Conjunction


    '''
    word_tokens = word_tokenize(text)
    return pos_tag(word_tokens)
    

In [None]:
cleaned = preprocess_df(df)
cleaned.head(10) #to change roman numbers?

Unnamed: 0,id,sentiment,review,clean
0,5814_8,1,With all this stuff going down at the moment w...,with all this stuff going down at the moment w...
1,2381_9,1,"\""The Classic War of the Worlds\"" by Timothy H...",the classic war of the worlds by timothy hines...
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,the film starts with a manager nicholas bell g...
3,3630_4,0,It must be assumed that those who praised this...,it must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,superbly trashy and wondrously unpretentious 8...
5,8196_8,1,I dont know why people think this is such a ba...,i dont know why people think this is such a ba...
6,7166_2,0,"This movie could have been very good, but come...",this movie could have been very good but comes...
7,10633_1,0,I watched this video at a friend's house. I'm ...,i watched this video at a friends house im gla...
8,319_1,0,"A friend of mine bought this film for 1, and e...",a friend of mine bought this film for 1 and ev...
9,8713_10,1,<br /><br />This movie is full of references. ...,this movie is full of references like mad max ...
