# <font color=blue> EXPLORATORY DATA ANALYSIS & SENTIMENT ANALYSIS </font>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from wordcloud import WordCloud

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm
import torch
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [None]:
# Load the cleaned books dataset and preview its first rows
books = pd.read_csv('cleaned_books.csv')
books.head()

In [None]:
# Print a summary of the books DataFrame
books.info()

## 1.0 Exploratory analyses on Data Science books 📚

### 1.1 Highest & Lowest Prices🤑

In [None]:
# Row with the largest value in the 'book_cost($)' column
books.nlargest(1, ['book_cost($)'])

In [None]:
# Row with the smallest value in the 'book_cost($)' column (may be a book costing 0)
books.nsmallest(1, ['book_cost($)'])

In [None]:
# Cheapest book among those whose cost is strictly greater than 0

books[books['book_cost($)']>0].nsmallest(1, ['book_cost($)'])

> Good to see that the book with the lowest cost from the dataset has a very good rating!

### 1.2 💰 Price vs. reviews

In [None]:
# Scatter plot of cost (x) against rating (y); marker size follows rating_count
px.scatter(books, x="book_cost($)", y="rating",size="rating_count")

> Book rating is not related to price: Some costly books have poor rating and some highly rated books are relatively cheap too.

In [None]:
# Books whose title contains the (case-sensitive) substring "Python"
python_title_mask = books['title'].str.contains("Python")
python_books = books.loc[python_title_mask]

# Five Python books ranked by number of ratings first, then by rating
best_python_books = python_books.nlargest(n=5, columns=['rating_count', 'rating'])
best_python_books

In [None]:
# Books whose title contains the (case-sensitive) substring "Machine Learning"
ml_title_mask = books['title'].str.contains("Machine Learning")
ml_books = books.loc[ml_title_mask]

# Five ML books ranked by number of ratings first, then by rating
best_ml_books = ml_books.nlargest(n=5, columns=['rating_count', 'rating'])
best_ml_books

### 1.3 Distribution of Ratings Score per Book

___
Books without any rating/review have an overall default rating of zero.

It is important to exclude these books when previewing the overall rating distribution, so that the results are not skewed: there are more books without ratings than books with ratings.
___

In [None]:
# Keep only rated books: rows whose rating is above 0.99
# (books without any rating carry a default rating of zero)
df_rating = books[books['rating']>0.99]

In [None]:
# Histogram (5 bins) of the ratings of the rated books
ax = df_rating['rating'].sort_index().plot.hist(figsize=(10, 5), bins=5)

ax.set_xlabel('Review Stars')
plt.show()


* Most of the rated books were rated 4-out-of-5 🌟

### 1.4 Most Popular Genres

In [None]:
# Build the list of genre strings: skip missing values, keep only entries that
# contain 'Genres', and remove every 'Genres, ' occurrence from each of them
genres = [
    entry.replace('Genres, ', '')
    for entry in books['genre(s)'].dropna()
    if 'Genres' in entry
]

In [None]:
# Join all genre strings into one text and render it as a word cloud
unique_string=(" ").join(genres)
wordcloud = WordCloud(max_font_size=60, max_words=30, background_color="black").generate(unique_string)
plt.figure(figsize=(10,5))
plt.imshow(wordcloud)
plt.axis("off")
# Uncomment to save the figure to disk:
#plt.savefig("your_file_name"+".png", bbox_inches='tight')
plt.show()
plt.close()

### 1.5 🧐 Clustering Book by Titles 

> 💡 What are the main types of Data Science books?
____

A simple way of grouping similar books based on their titles, as a proxy for their likely content.

KMeans Clustering method will be used to achieve this clustering.
____

#### 1.5.1 Applying the Tfidf Vectorizer

In [None]:
# Initialise a TF-IDF vectorizer that ignores English stop words and
# builds features from unigrams and bigrams (ngram_range=(1, 2))

vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2))

# Learn the vocabulary from the book titles and compute the TF-IDF matrix
X = vectorizer.fit_transform(books["title"])

In [None]:
# Show the TF-IDF matrix as a table with one column per extracted term.
# Note: toarray() materialises the full dense matrix.
pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

#### 1.5.2 Finding the Best Number of Clusters 'K'

In [None]:
# Candidate numbers of clusters to evaluate (2 through 9)
K = range(2,10)

# For every candidate k, fit a fresh KMeans model and record its inertia
# (sum of squared distances of samples to their closest cluster centre).
# The model must be created inside the loop: creating it once beforehand
# references `k` before it exists and would refit the same cluster count
# on every iteration.
sum_of_squared_distances = []
for k in K:
    km = KMeans(n_clusters=k, max_iter=600, n_init=10)
    km.fit(X)
    sum_of_squared_distances.append(km.inertia_)

In [None]:
# Elbow plot: recorded sum of squared distances against the number of clusters
plt.plot(K, sum_of_squared_distances, 'bx-')
plt.xlabel('Number of clusters')
plt.ylabel('Sum of squared distances')
plt.title('Elbow Method For Optimal k')
plt.show()

> Visually, the optimal number of clusters appears to be around 3 or 4. However, visual inspection of the elbow plot alone cannot always give the right answer.

#### 1.5.3 Finding the Best Number of Clusters (Silhouette Coefficient)

In [None]:
# Mean silhouette score for every candidate number of clusters in K
silhouette_avg = []
for k in K:
    # fit a fresh KMeans model and take the cluster label of every title
    kmeans = KMeans(n_clusters=k, max_iter=600, n_init=10)
    cluster_labels = kmeans.fit_predict(X)

    # silhouette score for this clustering
    silhouette_avg.append(silhouette_score(X, cluster_labels))

# Plot the score against the number of clusters
plt.plot(K, silhouette_avg)
plt.title('Silhouette analysis For Optimal k')
plt.xlabel('Values of K')
plt.ylabel('Silhouette score')
plt.show()

> The silhouette coefficient keeps rising, with some oscillation, as the number of clusters grows, yet it remains very low. This indicates that the clusters overlap heavily and are poorly separated.

> For our data, this poor separation implies that the titles of the various genres use very similar word choices.

> For this project, a classification of 4 should work!

#### 1.5.4 Create the Clusters

In [None]:
# Fit the final KMeans model with the chosen number of clusters
true_k = 4
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=600, n_init=10)
model.fit(X)

# Pair every book title with the cluster label assigned to it
labels = model.labels_
book_cl = pd.DataFrame(list(zip(books["title"],labels)),columns=['title','cluster'])
# Print the title/cluster table ordered by cluster
print(book_cl.sort_values(by=['cluster']))


###introduce silhoutte score-----


In [None]:
# Peek at the cluster labels of the first 10 titles
labels = model.labels_
labels[:10]

In [None]:
# One word cloud per cluster, built from the titles assigned to that cluster
for k in range(true_k):
    in_cluster = book_cl['cluster'] == k
    text = book_cl.loc[in_cluster, 'title'].str.cat(sep=' ')
    wordcloud = WordCloud(
        background_color="white",
        max_words=100,
        max_font_size=50,
    ).generate(text)

    # Draw the cloud in the next cell of a 2x3 subplot grid
    panel = plt.subplot(2, 3, k + 1)
    panel.set_title("Cluster " + str(k))
    plt.plot()
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
plt.show()

In [None]:
# Vectorize a new title with the already-fitted vectorizer and
# return the cluster the KMeans model assigns to it
test = vectorizer.transform(['How to Become a Top Programmer'])
model.predict(test)[0]

In [None]:

# Cluster to inspect, given as a string and converted to int for the comparison below
cluster_num = '2'

# First rows of the books assigned to that cluster
book_cl[book_cl.cluster == int(cluster_num)].head()

## 2.0 Book Reviews Summarization

In [None]:
# # Summarizing book reviews
# from summarizer import Summarizer

# bert_model = Summarizer()
# bert_summary = ''.join(bert_model(books.foreward[2], ratio = 0.2))
# print(bert_summary)

In [None]:
###!pip install sacremoses

## 3.0 Sentiment Analysis of Reviews

In this section we will be doing some sentiment analysis in Python using two different techniques:
1. VADER (Valence Aware Dictionary and sEntiment Reasoner) - Bag of words approach
2. Roberta Pretrained Model from _HuggingFace_ 🤗

In [None]:
# Load the book reviews and give two columns clearer names:
# 'Unnamed: 0' becomes 'Id' and 'rating' becomes 'stars'
reviews = pd.read_csv('book_reviews.csv')
reviews = reviews.rename(columns={'Unnamed: 0':'Id', 'rating':'stars'})
reviews.head()

In [None]:
# Print a summary of the reviews DataFrame
reviews.info()

In [None]:
# Count how often each raw 'stars' value occurs
reviews.stars.value_counts()

In [None]:
# Flag rows whose 'stars' value contains the stray 'el="Review ' fragment.
# na=True also flags rows with a missing 'stars' value, so they are dropped
# together with the malformed ones (same rows as the former `== False` test kept).
malformed_stars = reviews['stars'].str.contains('el="Review ', na=True)

# .copy() makes the filtered frame independent of the original, so later
# column assignments on `reviews` do not raise SettingWithCopyWarning.
reviews = reviews[~malformed_stars].copy()
reviews.stars.value_counts()

### 3.1 Quick EDA

___
A quick clean of the rating column is required!

To process, the rating column should contain single digits for number of stars given by each book reviewer. Column dtype also formatted.
___

In [None]:
# Replace each 'stars' value by its character at index 1
# (presumably the star digit — TODO confirm against the raw value format).
# NOTE: this cell is not re-runnable as-is: once the column holds integers,
# indexing s[1] fails.
reviews['stars'] = [s[1] for s in reviews.stars]

# Convert the extracted characters to integers
reviews['stars'] = reviews['stars'].astype('int')

# Drop rows with any missing value: the NLTK analyzer cannot process NaN as text
# and would raise an error
reviews.dropna(inplace=True)

# Summary of the cleaned reviews
reviews.info()

In [None]:
# Bar chart of the number of reviews per star value, ordered by star value
ax = reviews['stars'].value_counts().sort_index() \
    .plot(kind='bar',
          title='Count of Reviews by Stars',
          figsize=(10, 5))
ax.set_xlabel('Review Stars')
plt.show()

### 3.2 Natural Language Toolkit

In [None]:
# Pick one sample review to experiment with.
# NOTE(review): [266] presumably looks the row up by index label, not by position,
# so that label must still exist after the filtering and dropna above — confirm.
example = reviews['review'][266]
print(example)

In [None]:
# Split the sample review into tokens and show the first 10.
# NOTE(review): no nltk.download(...) call is visible in this notebook; the NLTK
# data used by the tokenizer is presumably installed already — confirm.
tokens = nltk.word_tokenize(example)
tokens[:10]

In [None]:
# Attach a part-of-speech tag to every token and show the first 10 pairs
tagged = nltk.pos_tag(tokens)
tagged[:10]


In [None]:
# Group the tagged tokens into named-entity chunks and pretty-print the resulting tree
entities = nltk.chunk.ne_chunk(tagged)
entities.pprint()

#### 3.2.1 VADER Sentiment Scoring

We will use NLTK's `SentimentIntensityAnalyzer` to get the neg/neu/pos scores of the text.

- This uses a "bag of words" approach:
    1. Stop words are removed
    2. each word is scored and combined to a total score.

In [None]:
# Instantiate the VADER sentiment analyzer (reused by all scoring cells below)

sia = SentimentIntensityAnalyzer()

In [None]:
# Polarity scores of the sample review
sia.polarity_scores(example)

In [None]:
# Score every review with VADER, keyed by the review's Id
# (tqdm shows a progress bar over the rows)
res = {
    row['Id']: sia.polarity_scores(row['review'])
    for _, row in tqdm(reviews.iterrows(), total=len(reviews))
}

In [None]:
# Turn the per-review score dicts into a table (one row per Id) and attach the
# review data; the merge has no `on=`, so it joins on the shared column names
vaders = (
    pd.DataFrame(res)
    .T
    .reset_index()
    .rename(columns={'index': 'Id'})
    .merge(reviews, how='left')
)

In [None]:
# Preview the combined table: sentiment scores plus the review columns
vaders.head()

In [None]:
# Bar plot of the VADER compound score for each star rating
# (seaborn is imported as `sns` in the imports cell at the top of the notebook)
ax = sns.barplot(data=vaders, x='stars', y='compound')
ax.set_title('Compound Score by Amazon Star Review')
plt.show()

In [None]:
# Three side-by-side bar plots: one VADER score (pos / neu / neg) per panel,
# each plotted against the star rating
fig, axs = plt.subplots(1, 3, figsize=(12, 3))
panels = [('pos', 'Positive'), ('neu', 'Neutral'), ('neg', 'Negative')]
for panel_ax, (score_col, panel_title) in zip(axs, panels):
    sns.barplot(data=vaders, x='stars', y=score_col, ax=panel_ax)
    panel_ax.set_title(panel_title)
plt.tight_layout()
plt.show()

#### 3.2.2 Roberta Pretrained Model

- Use a model trained on a large corpus of data.
- Transformer model accounts for the words but also the context related to other words.

In [None]:
# Instantiate the pretrained transformer: tokenizer and sequence-classification model.
# NOTE: this rebinds `model`, which earlier in the notebook referred to the fitted
# KMeans object; re-running the clustering prediction cell after this one will
# therefore use the transformer instead.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [None]:
# Show the sample review and its VADER scores again, for comparison with RoBERTa below
print(example)
sia.polarity_scores(example)

In [None]:
# Tokenize the sample review into PyTorch tensors ('pt') and run it through the model
encoded_text = tokenizer(example, return_tensors='pt')
output = model(**encoded_text)
output

In [None]:
# Take the first element of the model output for the first (only) input,
# detach it and convert it to a NumPy array
scores = output[0][0].detach().numpy()
scores

In [None]:
# Apply softmax (an exponential normalisation) to the raw scores
scores = softmax(scores)
scores

In [None]:
# Label the three softmax scores by position: 0 -> negative, 1 -> neutral, 2 -> positive
scores_dict = {
    'roberta_neg' : scores[0],
    'roberta_neu' : scores[1],
    'roberta_pos' : scores[2]
}
print(scores_dict)

> The Roberta model is able to capture nuances in expressions, and in this circumstance able to capture the negative sentiment in the sample expression, as against the VADER result that was unable to determine the negativity or positivity, hence giving more weight to 'neutral' position.

In [None]:
#wrap the flow in a function
def polarity_scores_roberta(example):
    """Score one text with the pretrained RoBERTa sentiment model.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects.

    Parameters
    ----------
    example : str
        Text to score.

    Returns
    -------
    dict
        ``'roberta_neg'``, ``'roberta_neu'`` and ``'roberta_pos'`` mapped to the
        softmax-normalised scores at output positions 0, 1 and 2.

    Notes
    -----
    The text is passed to the model as-is; the loop that calls this function
    catches ``RuntimeError`` for reviews the model cannot handle.
    """
    encoded_text = tokenizer(example, return_tensors='pt')
    # Inference only: disable gradient tracking so no autograd graph is built
    with torch.no_grad():
        output = model(**encoded_text)
    scores = softmax(output[0][0].detach().numpy())
    return {
        'roberta_neg' : scores[0],
        'roberta_neu' : scores[1],
        'roberta_pos' : scores[2]
    }

#### 3.2.3 Combine and Compare Both Models

In [None]:
# Score every review with both models and store the merged scores per review Id
res = {}
for i, row in tqdm(reviews.iterrows(), total=len(reviews)):
    try:  # the RoBERTa model raises RuntimeError for texts beyond a certain length
        text = row['review']
        myid = row['Id']

        # VADER scores, with every key prefixed by 'vader_'
        vader_result = sia.polarity_scores(text)
        vader_result_rename = {
            f"vader_{key}": value for key, value in vader_result.items()
        }

        # RoBERTa scores; merged after VADER, so they win on any key clash
        roberta_result = polarity_scores_roberta(text)
        both = dict(vader_result_rename)
        both.update(roberta_result)
        res[myid] = both
    except RuntimeError:
        print(f'Broke for id {myid}')

In [None]:
# Turn the per-review score dicts into a table (one row per Id) and attach the
# review data; the merge has no `on=`, so it joins on the shared column names
results_df = (
    pd.DataFrame(res)
    .T
    .reset_index()
    .rename(columns={'index': 'Id'})
    .merge(reviews, how='left')
)
results_df.head()

#### 3.2.3.1 Compare Scores between models

In [None]:
# List the column names of the combined results table
results_df.columns

In [None]:
# Pair plot comparing the VADER and RoBERTa scores against each other,
# with points coloured by star rating
sns.pairplot(data=results_df,
             vars=['vader_neg', 'vader_neu', 'vader_pos',
                  'roberta_neg', 'roberta_neu', 'roberta_pos'],
            hue='stars',
            palette='tab10')
plt.show()

### 3.3 Review Differences in Performance of Both Models on Actual Ratings

Let's look at some examples where the model scoring and review score differ the most.

#### 3.3.1 Positive 1-Star

In [None]:
# Text of the 1-star review with the highest RoBERTa positive score
results_df.query('stars == 1') \
    .sort_values('roberta_pos', ascending=False)['review'].values[0]

In [None]:
# Text of the 1-star review with the highest VADER positive score
results_df.query('stars == 1') \
    .sort_values('vader_pos', ascending=False)['review'].values[0]

 #### 3.3.2 Negative 5-Star Reviews

In [None]:
# Text of the 5-star review with the highest RoBERTa negative score
results_df.query('stars == 5') \
    .sort_values('roberta_neg', ascending=False)['review'].values[0]

In [None]:
# Text of the 5-star review with the highest VADER negative score
results_df.query('stars == 5') \
    .sort_values('vader_neg', ascending=False)['review'].values[0]

## 4.0 Conclusion

1. The ```VADER``` model is powerful and useful in quickly drawing sentiments from straight expressions.
2. The ```Roberta``` model, beyond the capabilities of the _VADER_ model is able to discern nuances such as when sarcasm is meant, and evaluate such expression appropriately by considering the context around words.
3. Human expression through language is diverse and as can be seen in the last section, there may be instances where model results will vary from true sentiments. Sometimes, the error may come from human input or the use of wrong words in the expression of thought.

## 5.0 References

* [Python Sentiment Analysis Project with NLTK - Rob Mulla](https://www.youtube.com/watch?v=QpzMWQvxXWk&t=1s)
* [I Analyzed 1000 Data Science Books on Amazon: Here's What I Found 🤓 - Thu Vu](https://www.youtube.com/watch?v=N0o-Bjiwt0M)