In [1]:
#!pip install nltk
import nltk
#nltk.download("all")

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
import string
import re

In [4]:
df = pd.read_csv("C:/Users/harik/python pratice/00 - datasets-20251216T065029Z-1-001/00 - datasets/amazonreviews.tsv", sep='\t')
df.head()

Unnamed: 0,label,review
0,pos,Stuning even for the non-gamer: This sound tra...
1,pos,The best soundtrack ever to anything.: I'm rea...
2,pos,Amazing!: This soundtrack is my favorite music...
3,pos,Excellent Soundtrack: I truly like this soundt...
4,pos,"Remember, Pull Your Jaw Off The Floor After He..."


In [5]:
df.loc[0]['review']

'Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^'

In [6]:
df['label'].value_counts()

label
neg    5097
pos    4903
Name: count, dtype: int64

#### a. Convert text to to lower case

In [7]:
df['review'] = df['review'].str.lower()

#### b. Remove Stop Words

In [8]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))


def remove_stopwords(x):
    lst = nltk.word_tokenize(x)
    new_lst = [x for x in lst if x not in stop_words]
    return ' '.join(new_lst)

In [9]:
df['review'] = df['review'].apply(remove_stopwords)
df['review']

0       stuning even non-gamer : sound track beautiful...
1       best soundtrack ever anything . : 'm reading l...
2       amazing ! : soundtrack favorite music time , h...
3       excellent soundtrack : truly like soundtrack e...
4       remember , pull jaw floor hearing : 've played...
                              ...                        
9995    revelation life small town america early 1900s...
9996    great biography interesting journalist : biogr...
9997    interesting subject ; poor presentation : 'd h...
9998    n't buy : box looked used obviously new . trie...
9999    beautiful pen fast delivery . : pen shipped pr...
Name: review, Length: 10000, dtype: object

#### c. Remove all punctuation characters defined in puncts from every text entry in df['review'].

In [10]:
puncts = string.punctuation
puncts

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

+ format(puncts) - inserts the contents of puncts into the square brackets
+ Square brackets [] match any single character inside them.
+ So [abc] matches ‘a’ or ‘b’ or ‘c’.

In [11]:
df['review'] = df['review'].str.replace('[{}]'.format(puncts),'', regex=True)
df['review']

0       stuning even nongamer  sound track beautiful  ...
1       best soundtrack ever anything   m reading lot ...
2       amazing   soundtrack favorite music time  hand...
3       excellent soundtrack  truly like soundtrack en...
4       remember  pull jaw floor hearing  ve played ga...
                              ...                        
9995    revelation life small town america early 1900s...
9996    great biography interesting journalist  biogra...
9997    interesting subject  poor presentation  d hard...
9998    nt buy  box looked used obviously new  tried c...
9999    beautiful pen fast delivery   pen shipped prom...
Name: review, Length: 10000, dtype: object

#### Example of removing urls

In [12]:
text = "Visit https://example.com or http://abc.com for more info. Also check www.site.org"
re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE) 

'Visit  or  for more info. Also check '

In [13]:
def clean_text(text):
    # Special characters and emojis
    text = re.sub(r'[^\w\s]', '', text) 
    # Any substring starting with http, https, or www followed by characters until a space is removed.
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE) 
    return text

In [14]:
df['review'] = df['review'].apply(clean_text)
df['review']

0       stuning even nongamer  sound track beautiful  ...
1       best soundtrack ever anything   m reading lot ...
2       amazing   soundtrack favorite music time  hand...
3       excellent soundtrack  truly like soundtrack en...
4       remember  pull jaw floor hearing  ve played ga...
                              ...                        
9995    revelation life small town america early 1900s...
9996    great biography interesting journalist  biogra...
9997    interesting subject  poor presentation  d hard...
9998    nt buy  box looked used obviously new  tried c...
9999    beautiful pen fast delivery   pen shipped prom...
Name: review, Length: 10000, dtype: object

#### c. Lemmatize the text

In [15]:
from nltk.stem import WordNetLemmatizer

In [16]:
lem  = WordNetLemmatizer()

In [17]:
def lemmatize(x):
    lem_lst = []
    lst = x.split()
    for word in lst:
        lem_lst.append(lem.lemmatize(word))
    return ' '.join(lem_lst)          

In [18]:
df['review'] = df['review'].apply(lemmatize)
df['review']

0       stuning even nongamer sound track beautiful pa...
1       best soundtrack ever anything m reading lot re...
2       amazing soundtrack favorite music time hand in...
3       excellent soundtrack truly like soundtrack enj...
4       remember pull jaw floor hearing ve played game...
                              ...                        
9995    revelation life small town america early 1900s...
9996    great biography interesting journalist biograp...
9997    interesting subject poor presentation d hardpr...
9998    nt buy box looked used obviously new tried con...
9999    beautiful pen fast delivery pen shipped prompt...
Name: review, Length: 10000, dtype: object

#### e. Encode categorical target

In [19]:
from sklearn.preprocessing import LabelEncoder

In [20]:
le = LabelEncoder()
df['label']  = le.fit_transform(df['label'])
df

Unnamed: 0,label,review
0,1,stuning even nongamer sound track beautiful pa...
1,1,best soundtrack ever anything m reading lot re...
2,1,amazing soundtrack favorite music time hand in...
3,1,excellent soundtrack truly like soundtrack enj...
4,1,remember pull jaw floor hearing ve played game...
...,...,...
9995,1,revelation life small town america early 1900s...
9996,1,great biography interesting journalist biograp...
9997,0,interesting subject poor presentation d hardpr...
9998,0,nt buy box looked used obviously new tried con...


In [21]:
df.shape

(10000, 2)

In [22]:
x = df['review']
y = df['label']

In [23]:
x.head()

0    stuning even nongamer sound track beautiful pa...
1    best soundtrack ever anything m reading lot re...
2    amazing soundtrack favorite music time hand in...
3    excellent soundtrack truly like soundtrack enj...
4    remember pull jaw floor hearing ve played game...
Name: review, dtype: object

In [24]:
x.info()

<class 'pandas.core.series.Series'>
RangeIndex: 10000 entries, 0 to 9999
Series name: review
Non-Null Count  Dtype 
--------------  ----- 
10000 non-null  object
dtypes: object(1)
memory usage: 78.3+ KB


### Using Classification

#### 1. Extract Features from text 

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [26]:
X_train,X_test,y_train,y_test = train_test_split(x,y,test_size=0.2, random_state=42)
X_train.shape,X_test.shape,y_train.shape,y_test.shape 

((8000,), (2000,), (8000,), (2000,))

In [27]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X_train)
X

<8000x29467 sparse matrix of type '<class 'numpy.float64'>'
	with 277742 stored elements in Compressed Sparse Row format>

In [28]:
xtrain_new = X.toarray()
xtrain_new.shape

(8000, 29467)

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

In [30]:
model = LogisticRegression()
model.fit(xtrain_new,y_train)

In [31]:
yhat_train = model.predict(xtrain_new)
yhat_train

array([1, 0, 1, ..., 0, 0, 0])

In [32]:
cm = confusion_matrix(y_train,yhat_train)
cm

array([[3770,  290],
       [ 259, 3681]], dtype=int64)

In [33]:
print(classification_report(y_train,yhat_train))

              precision    recall  f1-score   support

           0       0.94      0.93      0.93      4060
           1       0.93      0.93      0.93      3940

    accuracy                           0.93      8000
   macro avg       0.93      0.93      0.93      8000
weighted avg       0.93      0.93      0.93      8000



In [34]:
xtest_new = vectorizer.transform(X_test).toarray()
xtest_new.shape

(2000, 29467)

In [35]:
yhat_test = model.predict(xtest_new)
yhat_test

array([0, 0, 1, ..., 1, 1, 1])

In [36]:
cm = confusion_matrix(y_test,yhat_test)
cm

array([[880, 157],
       [157, 806]], dtype=int64)

In [37]:
print(classification_report(y_test,yhat_test))

              precision    recall  f1-score   support

           0       0.85      0.85      0.85      1037
           1       0.84      0.84      0.84       963

    accuracy                           0.84      2000
   macro avg       0.84      0.84      0.84      2000
weighted avg       0.84      0.84      0.84      2000



#### 2. Creating a pipeline

In [38]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

In [39]:
sent_ppl = Pipeline([
    ('tfidf',TfidfVectorizer()),
    ('lreg',LogisticRegression(max_iter=400))
])

In [40]:
sent_ppl.fit(x,y)

In [41]:
sent_ppl.predict(X_train)

array([1, 0, 1, ..., 0, 0, 0])

#### 3. Make Predictions

In [42]:
text = 'The product is amazing.Worth every penny'
sent_ppl.predict([text])

array([1])

In [43]:
text = 'Please dont buy this product. Utter waste of money'
sent_ppl.predict([text])

array([0])

In [44]:
sent_ppl.score(x,y)

0.9306

In [45]:
rf_ppl = Pipeline([
    ('tfidf',TfidfVectorizer()),
    ('rfc',RandomForestClassifier(n_estimators=100, max_depth=8))
])

In [46]:
rf_ppl.fit(x,y)

In [47]:
rf_ppl.predict(x)

array([1, 1, 1, ..., 0, 0, 1])

In [48]:
rf_ppl.score(x,y)

0.855

#### Using NLTK

In [49]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Instantiate the sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

In [50]:
df.head()

Unnamed: 0,label,review
0,1,stuning even nongamer sound track beautiful pa...
1,1,best soundtrack ever anything m reading lot re...
2,1,amazing soundtrack favorite music time hand in...
3,1,excellent soundtrack truly like soundtrack enj...
4,1,remember pull jaw floor hearing ve played game...


In [51]:
analyzer.polarity_scores('This is a great movie')

{'neg': 0.0, 'neu': 0.423, 'pos': 0.577, 'compound': 0.6249}

In [52]:
analyzer.polarity_scores('This is a great movie')['compound']

0.6249

In [53]:
# Create a new column for sentiment
df['Score'] = df['review'].apply(lambda x: 1 if analyzer.polarity_scores(x)['compound'] > 0 else 0)
df.head()

Unnamed: 0,label,review,Score
0,1,stuning even nongamer sound track beautiful pa...,1
1,1,best soundtrack ever anything m reading lot re...,1
2,1,amazing soundtrack favorite music time hand in...,1
3,1,excellent soundtrack truly like soundtrack enj...,1
4,1,remember pull jaw floor hearing ve played game...,1


In [54]:
confusion_matrix(df['label'],df['Score'])

array([[2212, 2885],
       [ 407, 4496]], dtype=int64)

In [55]:
print(classification_report(df['label'],df['Score']))

              precision    recall  f1-score   support

           0       0.84      0.43      0.57      5097
           1       0.61      0.92      0.73      4903

    accuracy                           0.67     10000
   macro avg       0.73      0.68      0.65     10000
weighted avg       0.73      0.67      0.65     10000

