# Part 1: Text Collection and Loading

In [1]:
# I have collected and load a dataset from a kaggle containing ecommerce-reviews for processing.
import pandas as pd

df = pd.read_csv("TestReviews.csv")
df.head()

Unnamed: 0,review,class
0,Fantastic spot for an even or a quite cocktail...,1
1,"Love, love, love the calamari. It's so good an...",1
2,"Love this place. Stiff martinis and cocktails,...",1
3,It's everything a great cocktail bar should be...,1
4,"I came here before a pirates game, so it was a...",1


# Part 2: Text Preprocessing

In [3]:
# import natual language toolkit
import nltk

# The Gutenberg Corpus is a collection of texts compiled by Project Gutenberg1. Project Gutenberg is an organization that aims to digitize and make available public domain books. The corpus contains over 53,000 books in English, German, French, Spanish, and other languages

# Load the Gutenberg corpus
corpus = nltk.corpus.gutenberg.raw()

### Tokenization

In [4]:
# Tokenization: Split the text into words and sentences. I have splited Gutenberg Corpus into both words and sentences for better understanding 
words = nltk.tokenize.word_tokenize(corpus)
sentences = nltk.tokenize.sent_tokenize(corpus)

In [5]:
print(words[:10])
print(sentences[:2])

['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', 'VOLUME', 'I', 'CHAPTER']
['[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse, handsome, clever, and rich, with a comfortable home\nand happy disposition, seemed to unite some of the best blessings\nof existence; and had lived nearly twenty-one years in the world\nwith very little to distress or vex her.', "She was the youngest of the two daughters of a most affectionate,\nindulgent father; and had, in consequence of her sister's marriage,\nbeen mistress of his house from a very early period."]


### Stemming

In [9]:
# Stemming: Reduce words to their root form using Porter Stemmer. Basically, Stemming is a hard-cut. It cut the words from the end and does not look at the stemed words wheither they have context or not.
# In other words, Stemming is crude heuristic that chop off the words from the end in the hope of achieving the goal correctly.
#  The resulting stem may not always be a valid word. It can produce invalid words.

# i have used a porter's algorithm for stemming 
porter = nltk.stem.PorterStemmer()
stemmed_words = [porter.stem(word) for word in words]

In [10]:
print(stemmed_words[:40])

['[', 'emma', 'by', 'jane', 'austen', '1816', ']', 'volum', 'i', 'chapter', 'i', 'emma', 'woodhous', ',', 'handsom', ',', 'clever', ',', 'and', 'rich', ',', 'with', 'a', 'comfort', 'home', 'and', 'happi', 'disposit', ',', 'seem', 'to', 'unit', 'some', 'of', 'the', 'best', 'bless', 'of', 'exist', ';']


### Lemmatization

In [12]:
# Lemmatization: Reduce the words by considering their context
# In other words, Lemmatization uses a vocabulary and morphological analysis of words to return the base or dictionary form of a word which is known as the lemma.

lemmatizer = nltk.stem.WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

In [13]:
print(lemmatized_words[:40])

['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', 'VOLUME', 'I', 'CHAPTER', 'I', 'Emma', 'Woodhouse', ',', 'handsome', ',', 'clever', ',', 'and', 'rich', ',', 'with', 'a', 'comfortable', 'home', 'and', 'happy', 'disposition', ',', 'seemed', 'to', 'unite', 'some', 'of', 'the', 'best', 'blessing', 'of', 'existence', ';']


### Stop Word Removal

In [14]:
# Stop Word Removal: Eliminate common words that may not be useful for analysis
stop_words = set(nltk.corpus.stopwords.words('english'))
filtered_words = [word for word in lemmatized_words if word.lower() not in stop_words]

In [15]:
print(filtered_words[:40])

['[', 'Emma', 'Jane', 'Austen', '1816', ']', 'VOLUME', 'CHAPTER', 'Emma', 'Woodhouse', ',', 'handsome', ',', 'clever', ',', 'rich', ',', 'comfortable', 'home', 'happy', 'disposition', ',', 'seemed', 'unite', 'best', 'blessing', 'existence', ';', 'lived', 'nearly', 'twenty-one', 'year', 'world', 'little', 'distress', 'vex', '.', 'wa', 'youngest', 'two']


# Part 3: Feature Extraction Techniques

### Bag Of Words(BoW)

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

# In bag of words, we convert the text to numeric data. Here is the process for text to bag of words conversion
# Step 1: First we make list of unique words from all documents
# Step 2: Create a vectors for the document

# Given corpus
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Initialize CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the corpus
bow_matrix = vectorizer.fit_transform(corpus)

# Get the feature names (words)
feature_names = vectorizer.get_feature_names_out()

# Convert bow_matrix to a DataFrame
bow_df = pd.DataFrame.sparse.from_spmatrix(bow_matrix, columns=feature_names)

# Display the DataFrame
print(bow_df)


   and  document  first  is  one  second  the  third  this
0    0         1      1   1    0       0    1      0     1
1    0         2      0   1    0       1    1      0     1
2    1         0      0   1    1       0    1      1     1
3    0         1      1   1    0       0    1      0     1


### TF-IDF

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF stands for term frequency - inverse document frequency. 
# TF-IDF is used to evaluate how important a word is to a document in corpus.
# Term Frequency is calculated as No. of term appears in a document/Total no. of terms in the document
# Inverse Document Frequency IDF is calculated as log(total no. of documents/ no. of documents with term in it.)
# Finally, TF-IDF will be TF * IDF

# Initialize TfidfVectorizer
vectorizer_tfidf = TfidfVectorizer()

# Fit and transform the corpus
tfidf_matrix = vectorizer_tfidf.fit_transform(corpus)

# Get the feature names (words)
feature_names_tfidf = vectorizer_tfidf.get_feature_names_out()

# Convert tfidf_matrix to a DataFrame
tfidf_df = pd.DataFrame.sparse.from_spmatrix(tfidf_matrix, columns=feature_names_tfidf)

# Display the DataFrame
print(tfidf_df)

        and  document     first        is       one    second       the  \
0         0  0.469791  0.580286  0.384085         0         0  0.384085   
1         0  0.687624         0  0.281089         0  0.538648  0.281089   
2  0.511849         0         0  0.267104  0.511849         0  0.267104   
3         0  0.469791  0.580286  0.384085         0         0  0.384085   

      third      this  
0         0  0.384085  
1         0  0.281089  
2  0.511849  0.267104  
3         0  0.384085  
