# Bag of Words

In [1]:
# import required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import re
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Read data
df = pd.read_csv("/content/amazon.txt", delimiter='\t', header=None)
df.columns = ["Review_text", "Review_class"]

In [3]:
# Print a head of data
df.head(10)

Unnamed: 0,Review_text,Review_class
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1
5,I have to jiggle the plug to get it to line up...,0
6,If you have several dozen or several hundred c...,0
7,If you are Razr owner...you must have this!,1
8,"Needless to say, I wasted my money.",0
9,What a waste of money and time!.,0


In [4]:
# Download data from NLTK Downloader
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Clean Review_text column
def clean_text_alternative(df):
    def process_text(text):
        # Convert text into lower case
        text = text.lower()
        # Remove links
        text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
        # Remove punctuation
        text = re.sub(r'[,.\"!@#$%^&*(){}?/;`~:<>+=-]', '', text)
        # Make tokenaization
        tokens = word_tokenize(text)
        table = str.maketrans('', '', string.punctuation)
        stripped = [w.translate(table) for w in tokens]
        # Ignore anything not alphabetic
        words = [word for word in stripped if word.isalpha()]
        # define stop words
        stop_words = set(stopwords.words('english'))
        # Discard "not" from stop words
        stop_words.discard('not')
        # Apply stemming to return the word into base word
        ps = PorterStemmer()
        words = [ps.stem(w) for w in words if w not in stop_words]
        return ' '.join(words)

    return [process_text(text) for text in df['Review_text'].values.tolist()]


In [6]:
# Call the function
cleaned_reviews = clean_text_alternative(df)
print(cleaned_reviews[0:5])

['way plug us unless go convert', 'good case excel valu', 'great jawbon', 'tie charger convers last minutesmajor problem', 'mic great']


In [7]:
# Initialize CountVectorizer with min_df=3
CV = TfidfVectorizer(min_df=3)
# Fit and transform the documents
X = CV.fit_transform(cleaned_reviews).toarray()
# Get target
y = df["Review_class"].values
# Get feature names (terms)
feature_names = CV.get_feature_names_out()
# Display the transformed data and feature names
print("Transformed Data:")
print(X)
print("\nFeature Names:")
print(feature_names)

Transformed Data:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

Feature Names:
['abl' 'absolut' 'accept' 'access' 'actual' 'addit' 'ago' 'allow' 'almost'
 'also' 'alway' 'amaz' 'amazon' 'anoth' 'anyon' 'anyth' 'appear' 'area'
 'around' 'arriv' 'audio' 'avoid' 'aw' 'away' 'awesom' 'back' 'bad' 'bar'
 'bargain' 'basic' 'batteri' 'beauti' 'belt' 'best' 'better' 'big' 'black'
 'blue' 'bluetooth' 'book' 'bought' 'break' 'broke' 'bt' 'button' 'buy'
 'ca' 'cabl' 'call' 'came' 'camera' 'car' 'care' 'carri' 'case' 'caus'
 'cell' 'charg' 'charger' 'charm' 'cheap' 'choic' 'cingular' 'clariti'
 'clear' 'clip' 'color' 'come' 'comfort' 'compani' 'complaint' 'complet'
 'comput' 'connect' 'construct' 'contact' 'convers' 'cool' 'cost' 'could'
 'coupl' 'cover' 'crap' 'current' 'custom' 'cut' 'data' 'day' 'dead'
 'deal' 'decent' 'decis' 'defect' 'definit' 'describ' 'descript' 'design'
 'despit' 

In [8]:
# Print shape of "X" and "Y"
print(np.shape(X))
print(np.shape(y))

(1000, 428)
(1000,)


In [9]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [10]:
# Building the model
model = DecisionTreeClassifier(criterion="entropy", random_state=41)

In [11]:
# Train the model
model.fit(X_train,y_train)
# Make prediction
y_pred = model.predict(X_test)
# Print Accuracy
print(f"Accuracy : {accuracy_score(y_test, y_pred)}")
# Print F1-score
print(f"F1-score :{f1_score(y_test, y_pred)}")
# Print Precision
print(f"Precision : {precision_score(y_test, y_pred)}")

Accuracy : 0.8
F1-score :0.7849462365591396
Precision : 0.8902439024390244
