In [1]:
import os
import sys
import inspect
import random
import pandas as pd
import numpy as np
from tqdm import trange
from collections import OrderedDict, Counter
from wordcloud import ImageColorGenerator, WordCloud, STOPWORDS
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import seaborn as sns
sns.set_theme()
import matplotlib.pyplot as plt

import ipywidgets as widgets
from IPython.display import display, Markdown, HTML, clear_output, display_html

In [2]:
from src.preprocess import Analysis

In [3]:
display(Markdown("<h2>Scraped Book Reviews Understanding</h2>"))

# Titles of the Output panes that live inside each tab's accordion.
loading_section       = ["Load Data"]
sections              = ["Data Overview", "EDA"]
conclusion_section    = ["Summary"]

summary_sub_section   = ["Project Summary", "Data Summary"]
features_sub_section  = ["Features Analysis", "Word Cloud"]


def _build_accordion(titles):
    """Return an Accordion holding one empty Output pane per title, in order.

    Args:
        titles: sequence of pane titles; pane ``i`` is labelled ``titles[i]``.

    Returns:
        widgets.Accordion whose children are fresh ``widgets.Output`` instances.
    """
    accordion = widgets.Accordion(children=[widgets.Output() for _ in titles])
    # Plain loop: set_title is called for its side effect only, so building a
    # throwaway list of None (and echoing it as cell output) is avoided.
    for index, title in enumerate(titles):
        accordion.set_title(index, title)
    return accordion


# Tab label -> accordion. Insertion order is the left-to-right tab order.
accordions = OrderedDict()
accordions["** Loading **"] = _build_accordion(loading_section)

for section in sections:
    # "Data Overview" gets the summary panes; every other section gets the feature panes.
    sub_sections = summary_sub_section if section == "Data Overview" else features_sub_section
    accordions[section] = _build_accordion(sub_sections)

accordions["** Conclusion **"] = _build_accordion(conclusion_section)

widget_fields = widgets.Tab(children=list(accordions.values()))
for index, tab_title in enumerate(accordions):
    widget_fields.set_title(index, tab_title)

<h2>Scraped Book Reviews Understanding</h2>

[None, None, None, None]

In [4]:
# Show the tabbed dashboard; the cells below write into its Output panes.
widget_fields

Tab(children=(Accordion(children=(Output(),), titles=('Load Data',)), Accordion(children=(Output(), Output()),…

In [5]:
# Helper object from src.preprocess; the cells below call it for data loading, tables and plots.
analysis = Analysis()

In [6]:
%matplotlib agg

with accordions["** Loading **"].children[0]:
    clear_output()
    display(Markdown("<h2> Initiating data loading ... </h2>"))
    analysis.get_reviews_data()

In [7]:
section = "Data Overview"

# Pane 0 ("Project Summary"): banner image followed by the project write-up.
with accordions[section].children[0]:
    clear_output()
    display(Markdown("<h2> Book Reviews Sentiment Analysis </h2>"))
    display(Markdown(r"""<p align="center">
                         <img width="1000" height="400" src="src/static/img/sentiments.jpg"></p>"""))
    sentiment_analysis_markdown = "src/templates/sentiment_analysis.md"
    # Pass the path via `filename=` so the file's contents are rendered;
    # a positional argument is treated as the markdown text itself, which
    # would just print the path.
    display(Markdown(filename=sentiment_analysis_markdown))


# Pane 1 ("Data Summary"): descriptive tables and the rating distribution.
with accordions[section].children[1]:
    clear_output()
    reviews_abt = analysis.data["reviews_abt"]

    display(Markdown(r"<h2> Reviews Data Extracted </h2>"))
    display(Markdown(r"<h4> Overview of Data Quantity </h4>"))
    display(analysis.grid_df_display([analysis.descriptive_data(reviews_abt),
                                      analysis.data_type_analysis(reviews_abt)]))

    display(Markdown(r"<h4> Understand Distribution Between Sentiments of Reviews </h4>"))
    # Name both columns explicitly: the default column names produced by
    # value_counts().reset_index() differ between pandas 1.x and 2.x, and the
    # code below relies on "rating_encode" and "count".
    rating_count_df = (reviews_abt["rating_encode"]
                       .value_counts()
                       .rename_axis("rating_encode")
                       .reset_index(name="count")
                       )
    # Share of each rating class, as a percentage of all counted reviews.
    rating_count_df["count_pct"] = (rating_count_df["count"] / rating_count_df["count"].sum())*100
    display(analysis.vertical_bar_plot(df=rating_count_df, xvar="rating_encode", yvar="count"))
    display(rating_count_df)

In [8]:
section = "EDA"

TOP_N = 20                  # rows shown in each frequency chart
NGRAM_SAMPLE_SIZE = 5000    # upper bound on reviews fed to the n-gram extraction
NGRAM_RANDOM_STATE = 42     # fixed seed so the n-gram charts are reproducible

reviews_abt = analysis.data["reviews_abt"]
# Only reviews that carry a rating are used for the n-gram and word-cloud views.
rated_reviews = reviews_abt[reviews_abt["rating"].notnull()]
# Cap the sample at the number of available rows (sample() raises if n > len)
# and seed it so a Restart & Run All yields the same charts. The same sample
# feeds both the bi-gram and tri-gram charts.
ngram_sample = rated_reviews.sample(n=min(NGRAM_SAMPLE_SIZE, len(rated_reviews)),
                                    random_state=NGRAM_RANDOM_STATE)

# Pane 0 ("Features Analysis"): length distribution, term and n-gram frequencies.
with accordions[section].children[0]:
    clear_output()
    display(Markdown("<h2> Exploratory Data Analysis on Scraped Reviews Data </h2>"))
    display(Markdown(r"<h4> 1. Distribution of Reviews Length </h4>"))
    display(analysis.distribution_plot(df=reviews_abt, xvar="reviews_length"))

    display(Markdown(r"<h4> 2. Term Frequency Analysis </h4>"))
    most_common_df = analysis.term_frequency_analysis(df=reviews_abt, reviews_col="reviews", top_sample_size=TOP_N)
    display(analysis.horizontal_bar_plot(df=most_common_df, xvar="frequency", yvar="words"))

    display(Markdown(r"<h4> 3. Most Occurring Bi-Gram Words </h4>"))
    bi_ngram_freq = analysis.create_n_grams(df=ngram_sample,
                                            reviews_col="reviews",
                                            ngram_range=(2,2))
    display(analysis.horizontal_bar_plot(df=bi_ngram_freq[:TOP_N], xvar="frequency", yvar="ngram"))

    display(Markdown(r"<h4> 4. Most Occurring Tri-Gram Words </h4>"))
    tri_ngram_freq = analysis.create_n_grams(df=ngram_sample,
                                             reviews_col="reviews",
                                             ngram_range=(3,3))
    display(analysis.horizontal_bar_plot(df=tri_ngram_freq[:TOP_N], xvar="frequency", yvar="ngram"))


# Pane 1 ("Word Cloud"): one cloud built from every rated review.
with accordions[section].children[1]:
    clear_output()
    display(Markdown(r"<h2> Words Cloud </h2>"))
    display(Markdown(r"<h4> Seeing the Most Common Words </h4>"))
    # astype(str) guards the join against non-string entries (e.g. NaN).
    text = " ".join(rated_reviews["reviews"].astype(str))
    display(analysis.wordcloud_plot(text))