In [2]:
import pandas as pd
import os
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Directory that holds Hotel_review.csv. Override it with the HOTEL_REVIEW_DIR
# environment variable; the original author's local path is kept as the default
# so existing behaviour on that machine is unchanged.
DATA_DIR = os.environ.get("HOTEL_REVIEW_DIR", "/Users/kausshik/HON322M/Misc_LLM")

# Only change directory when the target exists, so the notebook still runs on
# machines where the CSV sits next to the notebook (the relative read_csv call
# then resolves against the current working directory).
if os.path.isdir(DATA_DIR):
    os.chdir(DATA_DIR)

In [None]:
# Load the hotel reviews from the current working directory into a DataFrame;
# index_col=0 makes the CSV's first column the DataFrame index.
data = pd.read_csv('Hotel_review.csv', index_col=0)

In [None]:
# Peek at the first rows to confirm the file loaded as expected.
data.head()

## Pre-processing
1. ***Tokenization*** - The process of breaking down the text into individual words or "tokens".

2. ***Stopwords Removal*** - Eliminating frequently occurring but less meaningful words (e.g., "the," "and," "is") from text data

3. ***Stemming and Lemmatization*** - reduce words to their base or root form. 

    - ***Stemming*** - Simplifies words by stripping suffixes (e.g., "running" and "runs" -> "run"), but it might result in non-words (e.g., "leaves" -> "leav"). 
    
    - ***Lemmatization*** - In contrast, maps each word to its dictionary form (lemma) while retaining its meaning (e.g., "leaves" -> "leaf").

4. ***Vectorization*** - The conversion of text into a numerical format, such as vectors. A common approach is the bag-of-words model, where a matrix is created to store word frequencies (word counts) for each document or text in the corpus. This process is often referred to as the vectorization of the raw text.


### Tokenization


In [None]:
import nltk 

In [None]:
# Tokenizer models needed by nltk.word_tokenize().
# 'punkt' serves older NLTK releases; newer releases (3.8.2 and later) load the
# 'punkt_tab' resource instead, so fetch both to work with either.
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
# Naive tokenization: what happens if we only split the text on spaces?
# Use .iloc[0] (positional) rather than [0] (label-based): the index comes from
# the CSV's first column, so a row labelled 0 is not guaranteed to exist.
first_text = data['text'].iloc[0]

print(first_text)
print("="*90)
print(first_text.split(" "))

With a plain `split(" ")`, punctuation such as commas and periods is not separated from the neighbouring word; e.g., "proposal." stays a single term.

The nltk library's "word_tokenize()" splits a text into individual words and punctuation marks, each as a separate element.

In [None]:
# Tokenize the first review with nltk.word_tokenize() instead of str.split().
first_text_list = nltk.word_tokenize(first_text)

print(first_text_list)
# Now the tokens are well separated.

Another popular pre-processing library for text data is `spacy`.

### Stopword Removal
Stop words include terms such as "to" or "the" and therefore, it would be to our benefit to remove them during the pre-processing phase.

NLTK comes with a built-in list of English stopwords (179 at the time of writing; the exact count depends on the installed NLTK version).

In [None]:
# Fetch the NLTK stop-word corpus read by nltk.corpus.stopwords in the next cell.
nltk.download('stopwords')

In [None]:
# Load NLTK's English stop-word list; it is used below to filter the tokens.
stopwords = nltk.corpus.stopwords.words('english')
# Number of stop words in the list (shown as the cell output).
len(stopwords)

In [None]:
# Show the full stop-word list.
print(stopwords)

In [None]:
# Keep only the tokens whose lower-cased form is not an NLTK stop word.
first_text_list_cleaned = list(
    filter(lambda token: token.lower() not in stopwords, first_text_list)
)

print(first_text_list_cleaned)
print("="*90)
# Compare the token counts before and after stop-word removal.
print(
    "Length of original list: {0} words\n"
    "Length of list after stopwords removal: {1} words".format(
        len(first_text_list), len(first_text_list_cleaned)
    )
)

In [None]:
# Explicit-loop equivalent of the list comprehension in the previous cell.
first_text_list_cleaned = []

for word in first_text_list:
    if word.lower() in stopwords:
        continue  # stop word: leave it out
    first_text_list_cleaned.append(word)

print(first_text_list_cleaned)

### Stemming and Lemmatization
After removal of stopwords, the next stage of NLP that I would like to introduce is the process of Stemming. The work at this stage attempts to reduce as many different variations of similar words as possible into a single term (different branches all reduced to a single word stem). Therefore, if we have "running", "runs" and "run", we would want these three distinct words to collapse into just the word "run". (Of course, in doing so we lose the tense information: past, present or future.)

In [None]:
# Porter stemmer instance used by the stemming examples in the following cells.
stemmer = nltk.stem.PorterStemmer()

In [None]:
# Stem three inflections of the same verb and print each result.
# Each message template is paired with the word passed to the stemmer.
for _template, _term in (
    ("The stemmed form of running is: {}", "running"),
    ("The stemmed form of runs is : {}", "runs"),
    ("The stemmed form of run is : {}", "run"),
):
    print(_template.format(stemmer.stem(_term)))

In [None]:
# Stem "leaves" to compare against the lemmatizer's result for the same word further down.
print("The stemmed form of leaves is: {}".format(stemmer.stem("leaves")))

In [None]:
# Fetch the WordNet corpus for the WordNetLemmatizer used in the next cell.
# NOTE(review): some NLTK versions may also need the 'omw-1.4' resource — confirm against the installed version.
nltk.download('wordnet')

In [None]:
from nltk.stem import WordNetLemmatizer

# Create a WordNet lemmatizer and call lemm.lemmatize() on a single word.
lemm = WordNetLemmatizer()
# Same word as in the stemming example above, for a side-by-side comparison.
print("The lemmatized form of leaves is: {}".format(lemm.lemmatize("leaves")))

The lemmatizer is working: unlike the stemmer's "leav", its output "leaf" is a real word that makes lexical sense.

### Vectorization

A machine can read in bits and numbers and therefore we will first need to convert our text into numbers (Machine learning algorithms operate on a numeric feature space) for which we utilise a very common approach known as the Bag-of-Words.

***The Bag of Words approach*** -
This approach uses the counts of words and records the occurrence of each word. For example given these two sentences "I love to eat Burgers", "I love to eat Fries", we first tokenize to obtain our vocabulary of 6 words from which we can get the word counts for - [I, love, to, eat, Burgers, Fries].

Each word is a feature and each row is a sentence.

In [None]:
# CountVectorizer turns a collection of text documents into a matrix of token counts.

# Toy corpus: two short documents that differ only in their last word.
sentence = [
    "I love to eat Burgers",
    "I love to eat Fries",
]

# Bag-of-words vectorizer configured with the built-in English stop-word list.
vectorizer = CountVectorizer(stop_words="english")

# Learn the vocabulary and build the document-term count matrix in one call.
sentence_transform = vectorizer.fit_transform(sentence)

In [None]:
# Show the learned vocabulary (one entry per matrix column); "\n" starts a new line in the output.
print("The features are: \n {}".format(vectorizer.get_feature_names_out())) # \n: add new line
# Convert the result to a dense array for printing: one row per sentence, one column per feature.
print("The vectorized array looks like: \n {}".format(sentence_transform.toarray()))

First row: 1 burgers, 1 eat, 0 fries, 1 love

Second row: 0 burgers, 1 eat, 1 fries, 1 love

### Putting all the preprocessing steps together
We do not need to carry out tokenization, stopword removal, stemming/lemmatizing, and vectorization as separate manual steps.

Sklearn's tokenizer discards all single character terms like ('a', 'w' etc) and also lower cases all terms by default. Filtering out stopwords in Sklearn is as convenient as passing the value 'english' into the argument "stop_words" where a built-in English stopword list is automatically used.

Unfortunately, the vectorizer has no built-in lemmatizer. However, we can extend the CountVectorizer class by overriding its "build_analyzer" method as follows:

In [None]:
from nltk.stem import WordNetLemmatizer

# Module-level WordNet lemmatizer used by every analyzer built below.
lemm = WordNetLemmatizer()


class LemmaCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer lemmatizes every token it produces.

    Only build_analyzer() is overridden; all other behaviour is inherited
    from CountVectorizer.
    """

    def build_analyzer(self):
        """Return a callable that maps a document to lemmatized tokens.

        The parent's analyzer handles preprocessing, tokenization and n-gram
        generation; each item it yields is then passed through the module-level
        ``lemm`` lemmatizer.
        """
        # super() gives access to the parent class's methods, here
        # CountVectorizer.build_analyzer.
        base_analyzer = super().build_analyzer()

        def lemmatizing_analyzer(doc):
            # Generator of lemmatized tokens for one document.
            return (lemm.lemmatize(token) for token in base_analyzer(doc))

        return lemmatizing_analyzer

In [None]:
# Toy corpus for checking the lemmatizing vectorizer end to end.
text = [
    "I love to eat Burgers",
    "I love to eat Fries",
]

# Fit the vectorizer, then collect the vocabulary and the dense count matrix.
tf_vectorizer = LemmaCountVectorizer(stop_words="english")
tf = tf_vectorizer.fit_transform(text)
feature_names = tf_vectorizer.get_feature_names_out()
tf_dense = tf.toarray()

# Report: input documents, learned features, count vectors, each followed by a separator.
print(text, "="*90, sep="\n")
print(feature_names, "="*90, sep="\n")
print("Vectors:\n", tf_dense)
print("="*90)



In [None]:
# Gather every entry of the 'text' column into a plain Python list.
text = [review for review in data['text']]

# Build the lemmatizing vectorizer and fit it on the whole corpus,
# producing the document-term count matrix.
tf_vectorizer = LemmaCountVectorizer(stop_words="english")
tf = tf_vectorizer.fit_transform(text)

In [None]:
# Preview only the first few reviews: printing the whole corpus would flood
# the notebook output and bury the results that follow.
n_preview = 3
print(text[:n_preview])
print("="*90)

# Vocabulary learned from the full corpus (one entry per matrix column).
feature_names = tf_vectorizer.get_feature_names_out()
print(feature_names)
print("="*90)

# Report the matrix size before densifying it.
print("Document-term matrix shape:", tf.shape)
# NOTE(review): toarray() materializes the full dense matrix, which can be very
# large for a big corpus; kept so `tf_dense` stays available to later cells.
tf_dense = tf.toarray()
print("Vectors:\n", tf_dense)
print("="*90)

