In [13]:
import nltk # Natural Language Toolkit
import re # Regular expressions
import numpy as np
import pandas as pd
import warnings
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk import word_tokenize

# Blanket filter: hides every warning raised for the rest of the session.
warnings.filterwarnings("ignore")

In [2]:
# Sample text (two short paragraphs containing bracketed citation markers such
# as [1][2]) that is fed to the sentence tokenizer and bag-of-words steps below.
paragraph = """
Samurai (侍) or bushi (武士, [bɯ.ɕi]) were members of the warrior class in Japan. They were originally provincial warriors who served the Kuge and imperial court in the late 12th century. Samurai eventually came to play a major political role until their abolition in the late 1870s during the Meiji era.[1][2]

In the Heian period, powerful regional clans were relied on to put down rebellions. After power struggles, the Taira clan defeated the Minamoto clan in 1160.[3] After the Minamoto defeated the Taira in 1185, Minamoto no Yoritomo established the Kamakura shogunate, a parallel government that did not supplant the imperial court.[4][5] The warriors who served the Shogunate were called gokenin, landholding warriors whose retainers were called samurai.[6][7] Gokenin were regulated by the Samurai-dokoro.
"""

In [3]:
# Text normalisers; `ps` performs the stemming in the preprocessing cell

lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()

In [4]:
# Split the sample paragraph into a list of sentence strings
sentences = sent_tokenize(paragraph)

In [5]:
# Inspect the sentence split produced by the tokenizer
sentences

['\nSamurai (侍) or bushi (武士, [bɯ.ɕi]) were members of the warrior class in Japan.',
 'They were originally provincial warriors who served the Kuge and imperial court in the late 12th century.',
 'Samurai eventually came to play a major political role until their abolition in the late 1870s during the Meiji era.',
 '[1][2]\n\nIn the Heian period, powerful regional clans were relied on to put down rebellions.',
 'After power struggles, the Taira clan defeated the Minamoto clan in 1160.',
 '[3] After the Minamoto defeated the Taira in 1185, Minamoto no Yoritomo established the Kamakura shogunate, a parallel government that did not supplant the imperial court.',
 '[4][5] The warriors who served the Shogunate were called gokenin, landholding warriors whose retainers were called samurai.',
 '[6][7] Gokenin were regulated by the Samurai-dokoro.']

In [6]:
# Stopwords handling
#
# Load the English stopword list as a set (fast membership tests in the
# preprocessing loop). If the corpus has not been downloaded yet, NLTK raises
# LookupError; only that case triggers a download and a retry. Any other
# exception (including KeyboardInterrupt) is allowed to propagate instead of
# being swallowed by a bare `except:`.

try:
    stop_words = set(stopwords.words("english"))
except LookupError:
    nltk.download("stopwords")
    stop_words = set(stopwords.words("english"))

In [10]:
# Build one cleaned, stemmed document string per sentence

corpus = []

for sentence in sentences:
    # Replace everything except ASCII letters with a space, then lowercase
    letters_only = re.sub(r'[^a-zA-Z]', ' ', sentence).lower()

    # Whitespace-split into words, drop stopwords, stem whatever is left
    stems = [ps.stem(word) for word in letters_only.split() if word not in stop_words]

    # Re-assemble the stems into a single space-separated string
    review = ' '.join(stems)
    corpus.append(review)

In [11]:
# Fit a bag-of-words vocabulary on the corpus (at most 1500 features) and
# convert the resulting sparse count matrix to a dense array

cv = CountVectorizer(max_features=1500)
bow_sparse = cv.fit_transform(corpus)
X = bow_sparse.toarray()

In [12]:
# Show the count vectors of the first three sentences

first_rows = X[:3]
print("Output from bag of words model is:\n ", first_rows)

Output from bag of words model is:
  [[0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 1 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0
  0 0 0 0 0 1 0 0 0 0 1 1 0 0]
 [1 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 1 1 0 0 0 0 0
  0 0 0 1 1 0 0 0 0 0 0 0 0 0]]


## Bag of words using movie reviews

In [1]:
# Three toy movie reviews used to build a bag-of-words model by hand
review_1, review_2, review_3 = (
    "The movie was good and we really liked it",
    "The movie was good but the ending was boring",
    "We did not like the movie as it was too lengthy",
)

In [4]:
# Tokenize each review into its list of word tokens
review_1_tokens, review_2_tokens, review_3_tokens = (
    word_tokenize(text) for text in (review_1, review_2, review_3)
)

In [5]:
# Inspect the tokens of the first review
review_1_tokens

['The', 'movie', 'was', 'good', 'and', 'we', 'really', 'liked', 'it']

In [6]:
# Vocabulary: every distinct token that occurs in any of the three reviews
# (tokens are compared as-is, so differently-cased forms stay separate entries)
review_tokens = set(review_1_tokens) | set(review_2_tokens) | set(review_3_tokens)
len(review_tokens)

20

In [7]:
# Inspect the combined vocabulary
review_tokens

{'The',
 'We',
 'and',
 'as',
 'boring',
 'but',
 'did',
 'ending',
 'good',
 'it',
 'lengthy',
 'like',
 'liked',
 'movie',
 'not',
 'really',
 'the',
 'too',
 'was',
 'we'}

In [8]:
# Start review 1's count vector: one zero entry per vocabulary token

review_1_dict = {token: 0 for token in review_tokens}
review_1_dict

{'lengthy': 0,
 'ending': 0,
 'we': 0,
 'We': 0,
 'good': 0,
 'too': 0,
 'as': 0,
 'but': 0,
 'boring': 0,
 'like': 0,
 'The': 0,
 'it': 0,
 'really': 0,
 'liked': 0,
 'did': 0,
 'and': 0,
 'the': 0,
 'was': 0,
 'movie': 0,
 'not': 0}

In [9]:
# Count how often each vocabulary token occurs in review 1.
# The counter is rebuilt from zeros inside this cell so that re-running the cell
# cannot keep adding to counts left over from an earlier run; this matches the
# review 2 and review 3 cells, which also initialise and count in one cell.
review_1_dict = dict.fromkeys(review_tokens, 0)
for token in review_1_tokens:
    review_1_dict[token] += 1
review_1_dict

{'lengthy': 0,
 'ending': 0,
 'we': 1,
 'We': 0,
 'good': 1,
 'too': 0,
 'as': 0,
 'but': 0,
 'boring': 0,
 'like': 0,
 'The': 1,
 'it': 1,
 'really': 1,
 'liked': 1,
 'did': 0,
 'and': 1,
 'the': 0,
 'was': 1,
 'movie': 1,
 'not': 0}

In [10]:
# Bag-of-words counts for review 2 over the shared vocabulary
review_2_dict = {token: 0 for token in review_tokens}
for token in review_2_tokens:
    review_2_dict[token] = review_2_dict[token] + 1
review_2_dict

{'lengthy': 0,
 'ending': 1,
 'we': 0,
 'We': 0,
 'good': 1,
 'too': 0,
 'as': 0,
 'but': 1,
 'boring': 1,
 'like': 0,
 'The': 1,
 'it': 0,
 'really': 0,
 'liked': 0,
 'did': 0,
 'and': 0,
 'the': 1,
 'was': 2,
 'movie': 1,
 'not': 0}

In [11]:
# Bag-of-words counts for review 3 over the shared vocabulary
review_3_dict = {token: 0 for token in review_tokens}
for token in review_3_tokens:
    review_3_dict[token] = review_3_dict[token] + 1
review_3_dict

{'lengthy': 1,
 'ending': 0,
 'we': 0,
 'We': 1,
 'good': 0,
 'too': 1,
 'as': 1,
 'but': 0,
 'boring': 0,
 'like': 1,
 'The': 0,
 'it': 1,
 'really': 0,
 'liked': 0,
 'did': 1,
 'and': 0,
 'the': 1,
 'was': 1,
 'movie': 1,
 'not': 1}

In [14]:
# One row per review, one column per vocabulary token
review_dicts = [review_1_dict, review_2_dict, review_3_dict]
reviews_dictionary_df = pd.DataFrame(review_dicts)
reviews_dictionary_df

Unnamed: 0,lengthy,ending,we,We,good,too,as,but,boring,like,The,it,really,liked,did,and,the,was,movie,not
0,0,0,1,0,1,0,0,0,0,0,1,1,1,1,0,1,0,1,1,0
1,0,1,0,0,1,0,0,1,1,0,1,0,0,0,0,0,1,2,1,0
2,1,0,0,1,0,1,1,0,0,1,0,1,0,0,1,0,1,1,1,1


# Count vectorizer

In [15]:
# Build the same bag-of-words representation with scikit-learn.
# Uses the CountVectorizer class imported in the notebook's import cell.
review_list = [review_1, review_2, review_3]

# Default settings; fit on the raw review strings and keep the sparse
# document-term count matrix.
count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(review_list)

In [16]:
# Inspect the sparse count matrix returned by fit_transform
X_counts

<3x18 sparse matrix of type '<class 'numpy.int64'>'
	with 27 stored elements in Compressed Sparse Row format>

In [17]:
# Feature names learned by the fitted vectorizer; used as column labels below
X_names = count_vect.get_feature_names_out()
X_names

array(['and', 'as', 'boring', 'but', 'did', 'ending', 'good', 'it',
       'lengthy', 'like', 'liked', 'movie', 'not', 'really', 'the', 'too',
       'was', 'we'], dtype=object)

In [18]:
# Dense view of the counts as a table: one row per review, one column per term
dense_counts = X_counts.toarray()
a = pd.DataFrame(dense_counts, columns=X_names)
a

Unnamed: 0,and,as,boring,but,did,ending,good,it,lengthy,like,liked,movie,not,really,the,too,was,we
0,1,0,0,0,0,0,1,1,0,0,1,1,0,1,1,0,1,1
1,0,0,1,1,0,1,1,0,0,0,0,1,0,0,2,0,2,0
2,0,1,0,0,1,0,0,1,1,1,0,1,1,0,1,1,1,1
