In [1]:
# Part 1: Text Collection and Loading
import pandas as pd

df = pd.read_csv("TestReviews.csv")
df.head()

Unnamed: 0,review,class
0,Fantastic spot for an even or a quite cocktail...,1
1,"Love, love, love the calamari. It's so good an...",1
2,"Love this place. Stiff martinis and cocktails,...",1
3,It's everything a great cocktail bar should be...,1
4,"I came here before a pirates game, so it was a...",1


In [2]:
# Part 2: Text Preprocessing
import nltk

# Load the Gutenberg corpus
corpus = nltk.corpus.gutenberg.raw()

In [3]:
# Tokenization: Split the text into words and sentences
words = nltk.tokenize.word_tokenize(corpus)
sentences = nltk.tokenize.sent_tokenize(corpus)

In [4]:
print(words[:10])
print(sentences[:2])

['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', 'VOLUME', 'I', 'CHAPTER']
['[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse, handsome, clever, and rich, with a comfortable home\nand happy disposition, seemed to unite some of the best blessings\nof existence; and had lived nearly twenty-one years in the world\nwith very little to distress or vex her.', "She was the youngest of the two daughters of a most affectionate,\nindulgent father; and had, in consequence of her sister's marriage,\nbeen mistress of his house from a very early period."]


In [16]:
# tokeniztion on dataset
reviews_words = df['review'].apply(lambda x: nltk.tokenize.word_tokenize(x))

In [17]:
print(reviews_words[:30])

0     [Fantastic, spot, for, an, even, or, a, quite,...
1     [Love, ,, love, ,, love, the, calamari, ., It,...
2     [Love, this, place, ., Stiff, martinis, and, c...
3     [It, 's, everything, a, great, cocktail, bar, ...
4     [I, came, here, before, a, pirates, game, ,, s...
5     [Olive, or, Twist, is, the, historic, site, of...
6     [A, beautiful, little, bar, with, an, exciting...
7     [My, favorite, bar, in, town, love, the, live,...
8     [The, location, is, in, a, strip, mall, ,, but...
9     [THIS, PLACE, IS, OPEN, !, The, best, food, an...
10    [We, ca, n't, say, enough, good, things, about...
11    [My, husband, and, I, want, to, give, six, sta...
12    [I, 'm, still, laughing, reading, poor, Maribe...
13    [My, husband, and, I, have, been, here, severa...
14    [Very, ,, very, good, ., Top, notch, calamari,...
15    [Saturday, night, at, Salutes, is, a, real, tr...
16    [Went, with, family, last, night, ..., loved, ...
17    [I, 've, been, waiting, to, change, from, 

In [5]:
# Stemming: Reduce words to their root form using Porter Stemmer
porter = nltk.stem.PorterStemmer()
stemmed_words = [porter.stem(word) for word in words]

In [6]:
print(stemmed_words[:40])

['[', 'emma', 'by', 'jane', 'austen', '1816', ']', 'volum', 'i', 'chapter', 'i', 'emma', 'woodhous', ',', 'handsom', ',', 'clever', ',', 'and', 'rich', ',', 'with', 'a', 'comfort', 'home', 'and', 'happi', 'disposit', ',', 'seem', 'to', 'unit', 'some', 'of', 'the', 'best', 'bless', 'of', 'exist', ';']


In [7]:
# Lemmatization: Further reduce the stemmed words by considering their context
lemmatizer = nltk.stem.WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in stemmed_words]

In [8]:
print(lemmatized_words[:40])

['[', 'emma', 'by', 'jane', 'austen', '1816', ']', 'volum', 'i', 'chapter', 'i', 'emma', 'woodhous', ',', 'handsom', ',', 'clever', ',', 'and', 'rich', ',', 'with', 'a', 'comfort', 'home', 'and', 'happi', 'disposit', ',', 'seem', 'to', 'unit', 'some', 'of', 'the', 'best', 'bless', 'of', 'exist', ';']


In [9]:
# Stop Word Removal: Eliminate common words that may not be useful for analysis
stop_words = set(nltk.corpus.stopwords.words('english'))
filtered_words = [word for word in lemmatized_words if word.lower() not in stop_words]

In [10]:
print(filtered_words[:40])

['[', 'emma', 'jane', 'austen', '1816', ']', 'volum', 'chapter', 'emma', 'woodhous', ',', 'handsom', ',', 'clever', ',', 'rich', ',', 'comfort', 'home', 'happi', 'disposit', ',', 'seem', 'unit', 'best', 'bless', 'exist', ';', 'live', 'nearli', 'twenty-on', 'year', 'world', 'veri', 'littl', 'distress', 'vex', '.', 'wa', 'youngest']


In [11]:
# Part 3: Feature Extraction Techniques
from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the preprocessed text
bow_matrix = vectorizer.fit_transform(sentences)

In [12]:
print(bow_matrix[:30])

  (0, 12467)	2
  (0, 5692)	1
  (0, 20392)	1
  (0, 3057)	1
  (0, 142)	1
  (0, 40378)	1
  (0, 6548)	1
  (0, 41522)	1
  (0, 17060)	1
  (0, 7225)	1
  (0, 1938)	3
  (0, 31001)	1
  (0, 41395)	2
  (0, 7620)	1
  (0, 18068)	1
  (0, 17114)	1
  (0, 11060)	1
  (0, 32391)	1
  (0, 37449)	2
  (0, 39016)	1
  (0, 34271)	1
  (0, 25584)	2
  (0, 36954)	2
  (0, 4210)	1
  (0, 4627)	1
  :	:
  (28, 17684)	1
  (28, 40697)	1
  (28, 13997)	1
  (28, 14871)	1
  (28, 5641)	1
  (28, 20241)	3
  (28, 25255)	1
  (28, 26111)	1
  (28, 3734)	1
  (28, 8716)	1
  (28, 6509)	1
  (28, 23438)	1
  (28, 33427)	1
  (28, 41374)	1
  (28, 18973)	1
  (28, 37057)	1
  (28, 37315)	1
  (28, 3183)	1
  (28, 22668)	1
  (28, 24860)	1
  (28, 6689)	1
  (29, 17936)	1
  (29, 34668)	1
  (29, 30574)	1
  (29, 35980)	1


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TfidfVectorizer
vectorizer_tfidf = TfidfVectorizer()

# Fit and transform the preprocessed text
tfidf_matrix = vectorizer_tfidf.fit_transform(sentences)


In [14]:
print(tfidf_matrix[:30])

  (0, 12467)	0.2633509409553818
  (0, 5692)	0.08253777661692062
  (0, 20392)	0.157123172592659
  (0, 3057)	0.25445404658144466
  (0, 142)	0.2703876611028345
  (0, 40378)	0.20738302135122805
  (0, 6548)	0.15212734216584745
  (0, 41522)	0.15590662700469524
  (0, 17060)	0.17514798691534425
  (0, 7225)	0.18646871459695763
  (0, 1938)	0.11627893229136303
  (0, 31001)	0.16101636507084807
  (0, 41395)	0.13338724591922949
  (0, 7620)	0.17869150202939774
  (0, 18068)	0.13715167236260126
  (0, 17114)	0.1413987255440509
  (0, 11060)	0.18738223930774586
  (0, 32391)	0.12638687171944016
  (0, 37449)	0.09693120686172989
  (0, 39016)	0.22119321834123656
  (0, 34271)	0.10646721774016332
  (0, 25584)	0.08762210878340138
  (0, 36954)	0.07228283336204903
  (0, 4210)	0.14037141667846958
  (0, 4627)	0.21424458315278766
  :	:
  (28, 17684)	0.12637657873555036
  (28, 40697)	0.10116224796417972
  (28, 13997)	0.16972240666997865
  (28, 14871)	0.09874726052166502
  (28, 5641)	0.10544400994338628
  (28, 20241)	0