In [None]:
!pip install spacy==3.0
!pip install nltk

In [12]:
import pandas as pd
from sklearn import preprocessing
import nltk

In [9]:
import pandas as pd


## Dataset is about amazon review
url = 'https://github.com/Git-PratikVyas/NLP-TextAnalysis/blob/main/data/amazon_review.json.gz?raw=true'
df = pd.read_json(url, compression='gzip',encoding='utf-8',lines=True)
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewText,summary,unixReviewTime
0,1,True,"03 12, 2018",A3QY3THQ42WSCQ,B000YFSR5G,Waaaay too BIG,One Star,1520812800
1,1,True,"03 12, 2018",A3QY3THQ42WSCQ,B000YFSR4W,Waaaay too BIG,One Star,1520812800
2,1,True,"02 8, 2017",A21HH0VIBKK80J,B000YFSR5G,"Was terribly disappointed, the pants were way ...","Was terribly disappointed, the pants were way ...",1486512000
3,1,True,"02 8, 2017",A21HH0VIBKK80J,B000YFSR4W,"Was terribly disappointed, the pants were way ...","Was terribly disappointed, the pants were way ...",1486512000
4,1,True,"02 19, 2018",A276HQXYS553QW,B0014F8TIU,Constantly rolls down,One Star,1518998400


In [10]:
df = df.drop(columns=['reviewTime','unixReviewTime']) ###
df = df.rename(columns={'reviewText': 'text'}) ###
df.sample(5, random_state=12)

Unnamed: 0,overall,verified,reviewerID,asin,text,summary
163807,5,False,A2A8GHFXUG1B28,B0045Z4JAI,Good Decaf... it has a good flavour for a deca...,Nice!
195640,5,True,A1VU337W6PKAR3,B00K0TIC56,I could not ask for a better system for my sma...,I could not ask for a better system for my sma...
167820,4,True,A1Z5TT1BBSDLRM,B0012ORBT6,good product at a good price and saves a trip ...,Four Stars
104268,1,False,A4PRXX2G8900X,B005SPI45U,I like the principle of a raw chip - something...,No better alternatives but still tastes bad.
51961,1,True,AYETYLNYDIS2S,B00D1HLUP8,"Fake China knockoff, you get what you pay for.",Definitely not OEM


# **Sentiment Analysis Using Lexicon**

In [13]:
##The bing lexicon categorizes words in a binary fashion into positive and negative categories
## It is like a dictionary that contains a collection of words with +ve and -ve sentiment score

nltk.download('opinion_lexicon')

[nltk_data] Downloading package opinion_lexicon to /root/nltk_data...
[nltk_data]   Unzipping corpora/opinion_lexicon.zip.


True

In [14]:
from nltk.corpus import opinion_lexicon
from nltk.tokenize import word_tokenize

## lexicon is a dictionary that contains a collection of words with +ve and -ve sentiment score
print('Total number of words in opinion lexicon', len(opinion_lexicon.words()))
print('Examples of positive words in opinion lexicon',
      opinion_lexicon.positive()[:5])
print('Examples of negative words in opinion lexicon',
      opinion_lexicon.negative()[:5])

Total number of words in opinion lexicon 6789
Examples of positive words in opinion lexicon ['a+', 'abound', 'abounds', 'abundance', 'abundant']
Examples of negative words in opinion lexicon ['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable']


In [15]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [16]:
# create a +ve,-ve word dictionary.
##check dataset text in  dictionary and calculate snetimental score

df.rename(columns={"reviewText": "text"}, inplace=True)
pos_score = 1
neg_score = -1
word_dict = {}

# Adding the positive words to the dictionary
for word in opinion_lexicon.positive():
        word_dict[word] = pos_score
        
# Adding the negative words to the dictionary
for word in opinion_lexicon.negative():
        word_dict[word] = neg_score
        
def bing_liu_score(text):
    sentiment_score = 0
    bag_of_words = word_tokenize(text.lower())
    for word in bag_of_words:
        if word in word_dict:
            sentiment_score += word_dict[word]
    return sentiment_score / len(bag_of_words)

In [17]:
df['Bing_Liu_Score'] = df['text'].apply(bing_liu_score)
df[['asin','text','Bing_Liu_Score']].sample(2, random_state=0)

Unnamed: 0,asin,text,Bing_Liu_Score
188097,B00099QWOU,As expected,0.0
184654,B000RW1XO8,Works as designed...,0.25


In [18]:
## scale the score for each review between 1 and -1 and compute the average sentiment scores across all reviews for each type of star rating
## we would like to check whether the calculated score matches the expectation based on the rating provided by the customer.
## it is expected five-star rating would have a higher sentiment score than a review with a one-star rating

df['Bing_Liu_Score'] = preprocessing.scale(df['Bing_Liu_Score'])
df.groupby('overall').agg({'Bing_Liu_Score':'mean'})

Unnamed: 0_level_0,Bing_Liu_Score
overall,Unnamed: 1_level_1
1,-0.58706
2,-0.426526
4,0.344637
5,0.529064


#**Sentimental analysis using Supervised ML LinearSVC**

## Load and Clean Data

In [23]:
## Dataset is about amazon review
url = 'https://github.com/Git-PratikVyas/NLP-TextAnalysis/blob/main/data/amazon_review.json.gz?raw=true'
df = pd.read_json(url, compression='gzip',encoding='utf-8',lines=True)
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewText,summary,unixReviewTime
0,1,True,"03 12, 2018",A3QY3THQ42WSCQ,B000YFSR5G,Waaaay too BIG,One Star,1520812800
1,1,True,"03 12, 2018",A3QY3THQ42WSCQ,B000YFSR4W,Waaaay too BIG,One Star,1520812800
2,1,True,"02 8, 2017",A21HH0VIBKK80J,B000YFSR5G,"Was terribly disappointed, the pants were way ...","Was terribly disappointed, the pants were way ...",1486512000
3,1,True,"02 8, 2017",A21HH0VIBKK80J,B000YFSR4W,"Was terribly disappointed, the pants were way ...","Was terribly disappointed, the pants were way ...",1486512000
4,1,True,"02 19, 2018",A276HQXYS553QW,B0014F8TIU,Constantly rolls down,One Star,1518998400


In [24]:
# Assigning a new [1,0] target class label based on the product rating
# if rating > 3 means +ve else -ve
df['sentiment'] = 0
df.loc[df['overall'] > 3, 'sentiment'] = 1
df.loc[df['overall'] < 3, 'sentiment'] = 0

# Removing unnecessary columns to keep a simple DataFrame
df.drop(columns=[
    'reviewTime', 'unixReviewTime', 'overall', 'reviewerID', 'summary'],
        inplace=True)
df = df.rename(columns={'reviewText': 'text'}) 
df.sample(3)

Unnamed: 0,verified,asin,text,sentiment
54600,True,B00G7UY3EG,didn't last long.,0
51091,False,B00C6CWXAA,"Too flimsy, didn't last very long and broke.",0
208525,True,B00H7RF1KS,"Nice product, easy to install, heavy duty brac...",1


In [27]:
import html
import re

def clean(text):
    # convert html escapes like &amp; to characters.
    text = html.unescape(text) 
    # tags like <tab>
    text = re.sub(r'<[^<>]*>', ' ', text)
    # markdown URLs like [Some text](https://....)
    text = re.sub(r'\[([^\[\]]*)\]\([^\(\)]*\)', r'\1', text)
    # text or code in brackets like [0]
    text = re.sub(r'\[[^\[\]]*\]', ' ', text)
    # standalone sequences of specials, matches &# but not #cool
    text = re.sub(r'(?:^|\s)[&#<>{}\[\]+|\\:-]{1,}(?:\s|$)', ' ', text)
    # standalone sequences of hyphens like --- or ==
    text = re.sub(r'(?:^|\s)[\-=\+]{2,}(?:\s|$)', ' ', text)
    # sequences of white spaces
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [28]:
df['text_orig'] = df['text'].copy()
df['text'] = df['text'].apply(clean)

In [38]:
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [36]:
# Alternate method that uses Wordnet POS tags instead of spaCy - can run faster with similar accuracy
# Tokenization and Lemmatization using wordnet. Re-uses parts of blueprint from Chapter 4
# Uses wordnet POS tags instead of spaCy
# return the wordnet object value corresponding to the POS tag
from nltk.corpus import wordnet

def get_wordnet_pos(pos_tag):
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    elif pos_tag.startswith('N'):
        return wordnet.NOUN
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer


def clean_text(text):
    # lower text
    text = text.lower()
    # tokenize text and remove puncutation
    text = [word.strip(string.punctuation) for word in text.split(" ")]
    # remove words that contain numbers
    text = [word for word in text if not any(c.isdigit() for c in word)]
    # remove stop words
    stop = stopwords.words('english')
    text = [x for x in text if x not in stop]
    # remove empty tokens
    text = [t for t in text if len(t) > 0]
    # pos tag text
    pos_tags = pos_tag(text)
    # lemmatize text
    text = [WordNetLemmatizer().lemmatize(t[0], get_wordnet_pos(t[1])) for t in pos_tags]
    # remove words with only one letter
    text = [t for t in text if len(t) > 1]
    # join all
    text = " ".join(text)
    return(text)

In [39]:
df["text"] = df["text"].apply(clean_text)

## Remove observations that are empty after the cleaning step
df = df[df['text'].str.len() != 0]

df.head()

Unnamed: 0,verified,asin,text,sentiment,text_orig
0,True,B000YFSR5G,waaaay big,0,Waaaay too BIG
1,True,B000YFSR4W,waaaay big,0,Waaaay too BIG
2,True,B000YFSR5G,terribly disappointed pant way large legs husb...,0,"Was terribly disappointed, the pants were way ..."
3,True,B000YFSR4W,terribly disappointed pant way large legs husb...,0,"Was terribly disappointed, the pants were way ..."
4,True,B0014F8TIU,constantly roll,0,Constantly rolls down


##**Split Dataset in Train and Test**

In [43]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(df['text'],
                                                    df['sentiment'],
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify=df['sentiment'])

print ('Size of Training Data ', X_train.shape[0])
print ('Size of Test Data ', X_test.shape[0])
print("")

print ('Distribution of classes in Training Data :')
print ('Positive Sentiment %', str(sum(Y_train == 1)/ len(Y_train) * 100.0))
print ('Negative Sentiment %', str(sum(Y_train == 0)/ len(Y_train) * 100.0))
print("")

print ('Distribution of classes in Testing Data :')
print ('Positive Sentiment %', str(sum(Y_test == 1)/ len(Y_test) * 100.0))
print ('Negative Sentiment %', str(sum(Y_test == 0)/ len(Y_test) * 100.0))

Size of Training Data  234108
Size of Test Data  58527

Distribution of classes in Training Data :
Positive Sentiment % 50.90770071932612
Negative Sentiment % 49.09229928067388

Distribution of classes in Testing Data :
Positive Sentiment % 50.9081278726058
Negative Sentiment % 49.09187212739419


**Vectorize train and test dataset using tf/idf**

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(min_df = 10, ngram_range=(1,1))
X_train_tf = tfidf.fit_transform(X_train)
X_test_tf = tfidf.transform(X_test)

##**Train model**

In [45]:
from sklearn.svm import LinearSVC

model1 = LinearSVC(random_state=42, tol=1e-5)
model1.fit(X_train_tf, Y_train)

LinearSVC(random_state=42, tol=1e-05)

In [93]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

sample_reviews = df.sample(1, random_state=22)

sample_reviews_tf = tfidf.transform(sample_reviews['text'])

sentiment_predictions = model1.predict(sample_reviews_tf)

In [94]:

corpus=df.sample(1, random_state=22)["text"]
tokens=list(map(tfidf.build_tokenizer(),corpus))


print(tokens)
print(sample_reviews_tf)
print(sentiment_predictions)

[['nice', 'night', 'light', 'much', 'else', 'apparently']]
  (0, 3081)	0.46570298463097465
  (0, 3076)	0.2690088863337611
  (0, 3004)	0.30853807089064794
  (0, 2649)	0.3200269178715212
  (0, 1503)	0.4282572990071292
  (0, 185)	0.5742278865355827
[1]


In [96]:
sample_reviews = df.sample(5, random_state=22)
sample_reviews_tf = tfidf.transform(sample_reviews['text'])
sentiment_predictions = model1.predict(sample_reviews_tf)

In [97]:
sentiment_predictions = pd.DataFrame(data = sentiment_predictions,
                                     index=sample_reviews.index,
                                     columns=['sentiment_prediction'])

sample_reviews = pd.concat([sample_reviews, sentiment_predictions], axis=1)
print ('Some sample reviews with their sentiment - ')
sample_reviews[['text_orig','sentiment_prediction']]

Y_pred = model1.predict(X_test_tf)

print ('Accuracy Score - ', accuracy_score(Y_test, Y_pred))
print ('ROC-AUC Score - ', roc_auc_score(Y_test, Y_pred))

Some sample reviews with their sentiment - 
Accuracy Score -  0.8658396979172006
ROC-AUC Score -  0.8660667427476778


In [98]:
print ('Some sample reviews with their sentiment - ')
sample_reviews[['text_orig','sentiment_prediction']]

Some sample reviews with their sentiment - 


Unnamed: 0,text_orig,sentiment_prediction
29500,"Its a nice night light, but not much else appa...",1
98387,"Way to small, do not know what to do with them...",0
113648,"Didn't make the room ""blue"" enough - returned ...",0
281527,Excellent,1
233713,fit like oem and looks good,1


In [100]:
def baseline_scorer(text):
    score = bing_liu_score(text)
    if score > 0:
        return 1
    else:
        return 0
    
Y_pred_baseline = X_test.apply(baseline_scorer)
acc_score = accuracy_score(Y_pred_baseline, Y_test)
print (acc_score)

0.7521998393903668
