# n-gram visualizations

This notebook is to make plots of frequent n-grams in the data

In [2]:
# note: conda environment data_review is set up for this notebook
import os

import IPython

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# these are scripts with functions made
from basic_functions import*
from text_process import*

import nltk

import plotly
from plotly import tools
import plotly.graph_objs as go

import string

In [3]:
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\melis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# load the data
bc_merged = load_BCmerged()

In [5]:
# Set stopwords
# currently is using the nltk english stopwords list
STOPWORDS = stopwords.words('english')
# removing not, no becuase those seem would be important
STOPWORDS.remove('no')
STOPWORDS.remove('not')

# Analyze top words and n-grams

This can give us a sense of what frequently comes up in the reviews. Also helpful for after the text is processed to see if we see any changes to gain insights.

### 1-grams 

i.e., top words, 
1. whole set
2. useful vs. not useful reviews based on given threshold
3. positive vs. negative reviews based on given threshold

In [7]:
# whole set
fd1 = freq_ngrams(bc_merged, STOPWORDS, 1)
df = fd1.head(50)
trace1 = set_bar_chart(df)
fig = go.Figure(data=trace1)
fig['layout'].update(height=1200, width=900, title = 'Top 50 words')
fig.show()

## Useful review words over given threshold

In [13]:
# useful reviews over a threshold
threshold = 5
bc_below_thresh = bc_merged[bc_merged['usefulCount']<threshold]
bc_above_thresh = bc_merged[bc_merged['usefulCount']>=threshold]

total_bc = bc_merged['uniqueID'].nunique()
count_below = bc_below_thresh['uniqueID'].nunique()
print('Percent of birth control reviews below threshold: %.2f percent' % (100 * count_below/total_bc))

Percent of birth control reviews below threshold: 49.56 percent


In [61]:
# useful reviews as determined by threshold
fd1 = freq_ngrams(bc_above_thresh, STOPWORDS, 1)
df1 = fd1.head(50)
trace1 = set_bar_chart(df1)

# not useful reviews as determined by threshold
fd2 = freq_ngrams(bc_below_thresh, STOPWORDS, 1)
df2 = fd2.head(50)
trace2 = set_bar_chart(df2)

# Creating two subplots
fig = tools.make_subplots(rows=1, cols=2, vertical_spacing=0.04,
                          subplot_titles=["Top 50 words in useful reviews", 
                                          "Top 50 words in not useful reviews"])
fig.append_trace(trace1, 1, 1)
fig.append_trace(trace2, 1, 2)
fig['layout'].update(height=1200, width=900, paper_bgcolor='rgb(233,233,233)', title="Word Count Plots")
fig.show()

## Positive reviews over given positive threshold

In [51]:
thresh_pos = 7
bc_positive = bc_merged[bc_merged['usefulCount']>=thresh_pos]
bc_negative = bc_merged[bc_merged['usefulCount']<thresh_pos]

total_bc = bc_merged['uniqueID'].nunique()
count_positive = bc_positive['uniqueID'].nunique()
print('Percent of positive birth control reviews: %.2f percent' % (100 * count_positive/total_bc))

Percent of positive birth control reviews: 36.17 percent


In [None]:
# positive reviews as determined by thresh_pos
fd_pos = freq_ngrams(bc_positive, STOPWORDS, 1)
df_pos= fd_pos.head(50)
trace_pos = set_bar_chart(df_pos)

# negative reviews as determined by thresh_pos
fd_neg = freq_ngrams(bc_negative, STOPWORDS, 1)
df_neg = fd_neg.head(50)
trace_neg = set_bar_chart(df_neg)

In [54]:
# Creating two subplots
fig = tools.make_subplots(rows=1, cols=2, vertical_spacing=0.04,
                          subplot_titles=["Top 50 words in positive reviews", 
                                          "Top 50 words in negative reviews"])
fig.append_trace(trace_pos, 1, 1)
fig.append_trace(trace_neg, 1, 2)
fig['layout'].update(height=1200, width=900, paper_bgcolor='rgb(233,233,233)', title="Word Count Plots")
fig.show()

### Bigrams
bigrams of
1. whole set
2. useful vs. not useful reviews
3. positive vs. negative reviews

In [8]:
# whole set
fd2 = freq_ngrams(bc_merged, STOPWORDS, 2)
df = fd2.head(50)
trace1 = set_bar_chart(df)
fig = go.Figure(data=trace1)
fig['layout'].update(height=1200, width=900, title = 'Top 50 2-grams')
fig.show()

In [38]:
# useful reviews as determined by threshold
fd1_bi = freq_ngrams(bc_above_thresh, STOPWORDS, 2)
df1_bi = fd1_bi.head(50)
trace1_bi = set_bar_chart(df1_bi)

# not useful reviews as determined by threshold
fd2_bi = freq_ngrams(bc_below_thresh, STOPWORDS, 2)
df2_bi = fd2_bi.head(50)
trace2_bi = set_bar_chart(df2_bi)

In [41]:
# Creating two subplots
fig = tools.make_subplots(rows=1, cols=2, vertical_spacing=0.04, horizontal_spacing=0.2,
                          subplot_titles=["Top 50 bigrams in useful reviews", 
                                          "Top 50 bigrams in not useful reviews"])
fig.append_trace(trace1_bi, 1, 1)
fig.append_trace(trace2_bi, 1, 2)
fig['layout'].update(height=1200, width=900, paper_bgcolor='rgb(233,233,233)', title="Bigram Plots")
fig.show()

In [58]:
# positive reviews as determined by thresh_pos
fd_pos2 = freq_ngrams(bc_positive, STOPWORDS, 2)
df_pos2= fd_pos2.head(50)
trace_pos2 = set_bar_chart(df_pos2)

# negative reviews as determined by thresh_pos
fd_neg2 = freq_ngrams(bc_negative, STOPWORDS, 2)
df_neg2 = fd_neg2.head(50)
trace_neg2 = set_bar_chart(df_neg2)

# Creating two subplots
fig = tools.make_subplots(rows=1, cols=2, vertical_spacing=0.04, horizontal_spacing=0.2,
                          subplot_titles=["Top 50 bigrams in positive reviews", 
                                          "Top 50 bigrams in negative reviews"])
fig.append_trace(trace_pos2, 1, 1)
fig.append_trace(trace_neg2, 1, 2)
fig['layout'].update(height=1200, width=900, paper_bgcolor='rgb(233,233,233)', title="Bigram Plots")
fig.show()

### Tri-grams

3-grams of the whole set

In [42]:
# whole set
fd3 = freq_ngrams(bc_merged, STOPWORDS, 3)
df_tri = fd3.head(50)
trace1_tri = set_bar_chart(df_tri)
fig = go.Figure(data=trace1_tri)
fig['layout'].update(height=1200, width=900, title = 'Top 50 3-grams')
fig.show()

In [43]:
# useful reviews as determined by threshold
fd3 = freq_ngrams(bc_above_thresh, STOPWORDS, 3)
df3 = fd3.head(50)
trace1_tri = set_bar_chart(df3)

# not useful reviews as determined by threshold
fd3_1 = freq_ngrams(bc_below_thresh, STOPWORDS, 3)
df3_1 = fd3_1.head(50)
trace2_tri = set_bar_chart(df3_1)

In [44]:
# Creating two subplots
fig = tools.make_subplots(rows=1, cols=2, vertical_spacing=0.04, horizontal_spacing=0.3,
                          subplot_titles=["Top 50 trigrams in useful reviews", 
                                          "Top 50 trigrams in not useful reviews"])
fig.append_trace(trace1_tri, 1, 1)
fig.append_trace(trace2_tri, 1, 2)
fig['layout'].update(height=1200, width=900, paper_bgcolor='rgb(233,233,233)', title="Trigram Plots")
fig.show()

In [None]:
# positive reviews as determined by thresh_pos
fd_pos3 = freq_ngrams(bc_positive, STOPWORDS, 3)
df_pos3= fd_pos3.head(50)
trace_pos3 = set_bar_chart(df_pos3)

# negative reviews as determined by thresh_pos
fd_neg3 = freq_ngrams(bc_negative, STOPWORDS, 3)
df_neg3 = fd_neg3.head(50)
trace_neg3 = set_bar_chart(df_neg3)

In [60]:
# Creating two subplots
fig = tools.make_subplots(rows=1, cols=2, vertical_spacing=0.04, horizontal_spacing=0.3,
                          subplot_titles=["Top 50 trigrams in positive reviews", 
                                          "Top 50 trigrams in negative reviews"])
fig.append_trace(trace_pos3, 1, 1)
fig.append_trace(trace_neg3, 1, 2)
fig['layout'].update(height=1200, width=900, paper_bgcolor='rgb(233,233,233)', title="Trigram Plots")
fig.show()

## 4-grams
4-grams of the whole set

In [10]:
# whole set
fd4 = freq_ngrams(bc_merged, STOPWORDS, 4)
df = fd4.head(50)
trace1 = set_bar_chart(df)
fig = go.Figure(data=trace1)
fig['layout'].update(height=1200, width=900, title = 'Top 50 4-grams')
fig.show()

In [48]:
# useful reviews as determined by threshold
fd4 = freq_ngrams(bc_above_thresh, STOPWORDS, 4)
df4 = fd4.head(50)
trace1_4 = set_bar_chart(df4)

# not useful reviews as determined by threshold
fd4_1 = freq_ngrams(bc_below_thresh, STOPWORDS, 4)
df4_1 = fd4_1.head(50)
trace2_4 = set_bar_chart(df4_1)

# Creating two subplots
fig = tools.make_subplots(rows=1, cols=2, vertical_spacing=0.04, horizontal_spacing=0.35,
                          subplot_titles=["Top 50 4-grams in useful reviews", 
                                          "Top 50 4-grams in not useful reviews"])
fig.append_trace(trace1_4, 1, 1)
fig.append_trace(trace2_4, 1, 2)
fig['layout'].update(height=1200, width=900, paper_bgcolor='rgb(233,233,233)', title="4-gram Plots")
fig.show()