# <u>***EDA and engineering of Quality***</u>

---
---

### ***1. EDA -- What makes a review a good review?***

#### <u>Used dataset: review_1819.csv</u>

---
---

#### ***1.1 Import necessary modules***

In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import string
import re

import nltk
from nltk.corpus import stopwords

# -------------------------------- 

import spacy
import contextualSpellCheck
import en_core_web_sm
from spacy.lang.en.examples import sentences

#!ln -s /Users/felixbecker/neuefische/Yelp-Capstone/modeling/Language.py Language.py
#from Language import language_processing

from wordcloud import WordCloud

import unicodedata

---

#### ***1.2 Set global parameters***

In [None]:
import warnings

# Suppress every warning for the rest of the notebook session.
warnings.filterwarnings("ignore")

# Default size (in inches) and resolution for all matplotlib figures.
plt.rcParams.update({
    'figure.figsize': (6, 4),
    'figure.dpi': 150,
})

---

#### ***1.3 Import dataset***

In [None]:
# Load the review CSV into the DataFrame used throughout this notebook.
df_rev = pd.read_csv('../data/yelp_dataset/review_1819.csv')

In [None]:
# Per-column count of non-null values among reviews with a non-zero `useful` score.
df_rev[df_rev['useful'] != 0].count()

---

#### ***1.4 Overview of the dataset***

In [None]:
# (rows, columns) of the loaded review table.
df_rev.shape

In [None]:
# Column names, dtypes and non-null counts.
df_rev.info()

In [None]:
# Ten randomly drawn rows (no seed is set, so the sample differs between runs).
df_rev.sample(10)

In [None]:
# Number of missing values in each column.
df_rev.isna().sum()

In [None]:
# Store the star rating as an integer column.
df_rev['stars'] = df_rev['stars'].astype(int)

---

#### ***1.5 Cleaning the dataset***

In [None]:
# Remove the columns that are not used in the analysis below:
# 'Unnamed: 0', 'date' and 'year'.

df_rev.drop(columns=['Unnamed: 0', 'date', 'year'], inplace=True)

---

#### ***1.6 Gain information on rating***

In [None]:
# Five hex colours, passed to sns.color_palette for the star-rating count plot.
palette = ['#43948c', '#36a097', '#28aea2', '#1bbbad', '#0dc9b8']

In [None]:
# Bar chart: number of reviews for each star rating.
star_colors = sns.color_palette(palette, 5)
ax = sns.countplot(data=df_rev, x='stars', palette=star_colors, zorder=2)
ax.set_title('Distribution of star ratings')
ax.set_ylim(0, 1000000)
ax.set_xlabel('Stars')
ax.set_ylabel('Count')
# Show plain numbers on the y axis instead of scientific notation.
ax.ticklabel_format(style='plain', axis='y')
ax.grid(which='major', axis='both', color='#C9C9C9', linestyle=':', zorder=0)
# plt.bar_label(ax.containers[0], padding=-15);

In [None]:
# Bar chart: reviews with a non-zero `useful` score, counted per star rating.
ax = sns.countplot(
    data=df_rev.query('useful != 0'),
    x='stars',
    palette='viridis_r',
    zorder=2,
)
ax.set_ylim(0, 350000)
ax.set_ylabel('# of reviews rated useful')
ax.set_title('Number of reviews ranked "useful" per star rating')
ax.grid(which='major', axis='both', color='#C9C9C9', linestyle=':', zorder=0)
# Print each bar's value just inside its top edge.
ax.bar_label(ax.containers[0], padding=-15);

In [None]:
# Bar chart: share (in %) of reviews per star rating that received at least
# one `useful` vote.
# The estimator counts NON-zero entries: the title and axis label describe the
# reviews that *were* rated useful, not those with zero votes.
ax = sns.barplot(
    data=df_rev,
    x='stars',
    y='useful',
    estimator=lambda x: round((x != 0).mean() * 100.0, 2),
    palette='viridis_r',
    zorder=2,
)
plt.ylim(0, 100)
plt.ylabel('% of reviews rated useful')
plt.title('Percentage of reviews ranked "useful" per star rating')
plt.grid(which='major', axis='both', color='#C9C9C9', linestyle=':', zorder=0)
plt.bar_label(ax.containers[0], padding=-15);

In [None]:
# Bar chart: reviews with a non-zero `funny` score, counted per star rating.
ax = sns.countplot(
    data=df_rev.query('funny != 0'),
    x='stars',
    palette='viridis_r',
    zorder=2,
)
ax.set_ylim(0, 80000)
ax.set_ylabel('# of reviews rated funny')
ax.set_title('Number of reviews ranked "funny" per star rating')
ax.grid(which='major', axis='both', color='#C9C9C9', linestyle=':', zorder=0)
# Print each bar's value just inside its top edge.
ax.bar_label(ax.containers[0], padding=-15);

In [None]:
# Bar chart: share (in %) of reviews per star rating that received at least
# one `funny` vote.
# The estimator counts NON-zero entries so the bars match the title and axis
# label (reviews that were rated funny, not those with zero votes).
ax = sns.barplot(
    data=df_rev,
    x='stars',
    y='funny',
    estimator=lambda x: round((x != 0).mean() * 100.0, 2),
    palette='viridis_r',
    zorder=2,
)
plt.ylim(0, 100)
plt.ylabel('% of reviews rated funny')
plt.title('Percentage of reviews ranked "funny" per star rating')
plt.grid(which='major', axis='both', color='#C9C9C9', linestyle=':', zorder=0)
plt.bar_label(ax.containers[0], padding=-15);

In [None]:
# Bar chart: reviews with a non-zero `cool` score, counted per star rating.
ax = sns.countplot(
    data=df_rev.query('cool != 0'),
    x='stars',
    palette='viridis_r',
    zorder=2,
)
ax.set_ylim(0, 250000)
ax.set_ylabel('# of reviews rated cool')
ax.set_title('Number of reviews ranked "cool" per star rating')
ax.grid(which='major', axis='both', color='#C9C9C9', linestyle=':', zorder=0)
# Print each bar's value just inside its top edge.
ax.bar_label(ax.containers[0], padding=-12);

In [None]:
# Bar chart: share (in %) of reviews per star rating that received at least
# one `cool` vote.
# The estimator counts NON-zero entries so the bars match the title and axis
# label (reviews that were rated cool, not those with zero votes).
ax = sns.barplot(
    data=df_rev,
    x='stars',
    y='cool',
    estimator=lambda x: round((x != 0).mean() * 100.0, 2),
    palette='viridis_r',
    zorder=2,
)
plt.ylim(0, 100)
plt.ylabel('% of reviews rated cool')
plt.title('Percentage of reviews ranked "cool" per star rating')
plt.grid(which='major', axis='both', color='#C9C9C9', linestyle=':', zorder=0)
plt.bar_label(ax.containers[0], padding=-15);

In [None]:
# Copy of the reviews without the three identifier columns, used for the correlation heatmap.
df_corr = df_rev.drop(columns=['review_id', 'user_id', 'business_id'])

In [None]:
# Annotated heatmap of the pairwise correlations.
# df_corr still contains the string `text` column; restrict to numeric columns
# explicitly so that .corr() does not fail on non-numeric data.
sns.heatmap(df_corr.select_dtypes(include='number').corr(), annot=True, cmap='viridis_r', linewidth=0.01, linecolor='k', vmin=-1, vmax=1)

---

#### ***1.7 Gain information on texts***

In [None]:
# Number of distinct whitespace-separated tokens per review
# (str() guards against non-string entries such as NaN).

df_rev['unique_words'] = df_rev['text'].apply(lambda x: len(set(str(x).split())))

# Number of exclamation marks per review

df_rev['count_excl'] = df_rev['text'].str.count('!')

# Remove punctuation: drop every character that is neither a word character
# nor whitespace. regex=True is required -- without it the pattern may be
# treated as a literal string and nothing would be removed.

df_rev["no_punct"] = df_rev['text'].str.replace(r'[^\w\s]', '', regex=True)

# Length (in characters) of each review after punctuation removal

df_rev['text_length'] = df_rev['no_punct'].str.len()

In [None]:
# Mean punctuation-free text length for each star rating (1..5), truncated to
# an integer and collected in rating order.

mean_stars = [1.0, 2.0, 3.0, 4.0, 5.0]

mean_lengths = [
    df_rev.loc[df_rev['stars'] == star, 'text_length'].mean().astype(int)
    for star in (1, 2, 3, 4, 5)
]

# Keep the individual per-rating names available as well.
(mean_text_one,
 mean_text_two,
 mean_text_three,
 mean_text_four,
 mean_text_five) = mean_lengths

In [None]:
# Bar chart: mean review length for each star rating.
ax = sns.barplot(x=mean_stars, y=mean_lengths, palette='viridis_r', zorder=2)
ax.set_title('Mean text length per star rating')
ax.set_ylim(0, 800)
ax.set_ylabel('mean text length')
ax.grid(which='major', axis='both', color='#C9C9C9', linestyle=':', zorder=0)
# Print each bar's value just inside its top edge.
ax.bar_label(ax.containers[0], padding=-15);

In [None]:
# Distribution of the punctuation-free text length, using 100 bins.
# NOTE(review): sns.distplot is reported as deprecated in newer seaborn
# releases -- confirm against the installed version.
sns.distplot(df_rev['text_length'], bins=100)
plt.title('Distribution of text length')
# Fixed axis limits crop the plot to the chosen range.
plt.xlim(0, 6000)
plt.ylim(0, 0.00225)
plt.xlabel('text length')
plt.grid(which='major', axis='both', color='#C9C9C9', linestyle=':', zorder=0)

In [None]:
# Distribution of the number of unique words per review, using 100 bins.
# NOTE(review): sns.distplot is reported as deprecated in newer seaborn
# releases -- confirm against the installed version.
sns.distplot(df_rev['unique_words'], bins=100)
plt.title('Distribution of unique words')
# Fixed axis limits crop the plot to the chosen range.
plt.xlim(0, 600)
plt.ylim(0, 0.014)
plt.xlabel('# of unique words')
plt.grid(which='major', axis='both', color='#C9C9C9', linestyle=':', zorder=0)

In [None]:
# Bar chart: share (in %) of reviews per star rating that contain at least one
# exclamation mark.
# The estimator counts reviews whose `count_excl` is NON-zero; counting zeros
# would plot the share of reviews *without* exclamation marks instead.
ax = sns.barplot(
    data=df_rev,
    x='stars',
    y='count_excl',
    estimator=lambda x: round((x != 0).mean() * 100.0, 2),
    palette='viridis_r',
    zorder=2,
)
plt.title('Percentage of reviews with exclamation marks per star rating')
plt.ylim(0, 100)
plt.ylabel('% of reviews with exclamation marks')
plt.grid(which='major', axis='both', color='#C9C9C9', linestyle=':', zorder=0)
plt.bar_label(ax.containers[0], padding=-15);

In [None]:
# Copy of the reviews (now including the derived text features) without the identifier columns.
df_corr_excl = df_rev.drop(columns=['review_id', 'user_id', 'business_id'])

In [None]:
# Annotated heatmap of the pairwise correlations, including the text features.
# df_corr_excl still contains string columns (`text`, `no_punct`); restrict to
# numeric columns explicitly so that .corr() does not fail on non-numeric data.
sns.heatmap(df_corr_excl.select_dtypes(include='number').corr(), annot=True, cmap='viridis_r', linewidth=0.01, linecolor='k', vmin=-1, vmax=1)

---
---

### ***2. Language and spelling***

---
---

#### ***2.1 Language processing***

In [None]:
# Predict the language per review with a certainty of at least 95%
# Drop all other languages than english
# Return the corresponding dataframe
# NOTE(review): the `from Language import language_processing` line in the
# import cell is commented out, so `language_processing` must be brought into
# scope before this cell can run.
# The return value is discarded here; presumably the call adds a `language`
# column to df_rev, which the following cell filters on -- TODO confirm.

language_processing(df_rev);

In [None]:
# language_processing does not drop rows in place, so keep only the
# reviews labelled 'English' here.

is_english = df_rev['language'] == 'English'
df_rev = df_rev[is_english]

---

#### ***2.2 Create word clouds for useful, funny and cool***

In [None]:
# One subset per vote type: reviews whose respective score is non-zero.
df_use = df_rev.loc[df_rev['useful'].ne(0)]
df_fun = df_rev.loc[df_rev['funny'].ne(0)]
df_cool = df_rev.loc[df_rev['cool'].ne(0)]

In [None]:
# Initialize the stop-word list.
# Note: this rebinds the name `stopwords` (imported from nltk.corpus at the
# top of the notebook) to a plain list.
stopwords = nltk.corpus.stopwords.words('english')

# Extend the stop words with frequent but non-decisive words that showed up
# in the first few generated clouds.
additional_stopwords = ['one', 'go', 'also', 'would', 'get', 'got']
stopwords.extend(additional_stopwords)

# Concatenate all review texts of each subset into one string.
text_use = " ".join(df_use['text'])
text_fun = " ".join(df_fun['text'])
text_cool = " ".join(df_cool['text'])

# Build one word cloud per subset with the stop words removed. Each cloud is
# generated from its OWN text (previously all three were built from text_use).
wordcloud_use = WordCloud(stopwords=stopwords).generate(text_use)
wordcloud_fun = WordCloud(stopwords=stopwords).generate(text_fun)
wordcloud_cool = WordCloud(stopwords=stopwords).generate(text_cool)

#### ***Word Cloud for reviews rated as useful***

In [None]:
# Render the "useful" word cloud as an image and hide the axes.
plt.imshow(wordcloud_use, interpolation='bilinear')
plt.axis("off");

#### ***Word Cloud for reviews rated as funny***

In [None]:
# Render the "funny" word cloud as an image and hide the axes.
plt.imshow(wordcloud_fun, interpolation='bilinear')
plt.axis("off");

#### ***Word Cloud for reviews rated as cool***

In [None]:
# Render the "cool" word cloud as an image and hide the axes.
plt.imshow(wordcloud_cool, interpolation='bilinear')
plt.axis("off");
#print(wordcloud_cool.words_.keys())

---

#### ***2.3 Text cleaning and building N-grams for useful, funny and cool rated reviews***

In [None]:
# Basic text cleaning and Lemmatization

def text_cleaning(txt):
    """Return the lemmatized, stop-word-free tokens of *txt*.

    The text is NFKD-normalized, stripped of non-ASCII characters, lowercased
    and stripped of every character that is neither a word character nor
    whitespace. It is then split on whitespace; tokens found in the stop-word
    collection are dropped and the remaining ones are lemmatized with NLTK's
    WordNetLemmatizer.

    Args:
        txt: The raw text to clean (a single string).

    Returns:
        A list of lemmatized tokens in their original order.
    """
    wnl = nltk.stem.WordNetLemmatizer()
    # A set gives O(1) membership tests; the input can be a very long string,
    # so a list lookup per token would dominate the runtime.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    stop_words.update(['one', 'go', 'also', 'would', 'get', 'got'])
    # Decompose accented characters, then drop whatever is not plain ASCII.
    txt = (unicodedata.normalize('NFKD', txt)).encode('ascii', 'ignore').decode('utf-8', 'ignore').lower()
    words = re.sub(r'[^\w\s]', '', txt).split()
    return [wnl.lemmatize(word) for word in words if word not in stop_words]

In [None]:
# Apply the basic text cleaning and lemmatization to the texts of each subset.
# The reviews are joined with a space into one string. Passing str(list)
# instead would feed the Python list repr into the cleaning step, where escaped
# characters such as "\n" lose their backslash and leave a stray "n" glued to
# the neighbouring words.

words_use = text_cleaning(' '.join(df_use['text'].astype(str)))
words_fun = text_cleaning(' '.join(df_fun['text'].astype(str)))
words_cool = text_cleaning(' '.join(df_cool['text'].astype(str)))

In [None]:
# Build all N-grams of size 3 (trigrams) for each word list and keep the ten
# most frequent ones.
# CAREFUL THIS TAKES MORE THAN 4 HOURS

def _top_trigrams(words):
    """Return the ten most frequent trigrams of *words* with their counts."""
    counts = pd.Series(nltk.ngrams(words, 3)).value_counts()
    return counts.iloc[:10]


trigrams_use = _top_trigrams(words_use)
trigrams_fun = _top_trigrams(words_fun)
trigrams_cool = _top_trigrams(words_cool)

In [None]:
# Horizontal bar chart of the top trigrams in reviews rated useful.
ax = sns.barplot(
    x=trigrams_use.values,
    y=trigrams_use.index,
    palette='viridis_r',
    zorder=2,
)
ax.set_title('Most common trigrams in "useful"')
ax.grid(which='major', axis='both', color='#C9C9C9', linestyle=':', zorder=0)
ax.set_xlim(0, 4500)
ax.set_xlabel('# of occurrences')
ax.set_ylabel('Trigrams')
# Print each bar's count just inside its right edge.
ax.bar_label(ax.containers[0], padding=-30);

In [None]:
# Horizontal bar chart of the top trigrams in reviews rated funny.
ax = sns.barplot(
    x=trigrams_fun.values,
    y=trigrams_fun.index,
    palette='viridis_r',
    zorder=2,
)
ax.set_title('Most common trigrams in "funny"')
ax.grid(which='major', axis='both', color='#C9C9C9', linestyle=':', zorder=0)
ax.set_xlim(0, 1200)
ax.set_xlabel('# of occurrences')
ax.set_ylabel('Trigrams')
# Print each bar's count just inside its right edge.
ax.bar_label(ax.containers[0], padding=-30);

In [None]:
# Horizontal bar chart of the top trigrams in reviews rated cool.
ax = sns.barplot(
    x=trigrams_cool.values,
    y=trigrams_cool.index,
    palette='viridis_r',
    zorder=2,
)
ax.set_title('Most common trigrams in "cool"')
ax.grid(which='major', axis='both', color='#C9C9C9', linestyle=':', zorder=0)
ax.set_xlim(0, 3000)
ax.set_xlabel('# of occurrences')
ax.set_ylabel('Trigrams')
# Print each bar's count just inside its right edge.
ax.bar_label(ax.containers[0], padding=-30);

---

#### ***2.4 Create word clouds for reviews with star ratings ≤ 2***

In [None]:
# "Bad" reviews: star rating of 2 or lower.
df_bad = df_rev.loc[df_rev['stars'].le(2)]

In [None]:
# Start from NLTK's English stop words ...
stopwords = nltk.corpus.stopwords.words('english')

# ... and add the frequent but non-decisive words found in earlier clouds.
additional_stopwords = ['one', 'go', 'also', 'would', 'get', 'got']
stopwords += additional_stopwords

# One long string of all "bad" review texts, turned into a word cloud.
text_bad = " ".join(df_bad['text'])
wordcloud_bad = WordCloud(stopwords=stopwords).generate(text_bad)

#### ***Word Cloud for "bad" reviews***

In [None]:
# Render the "bad reviews" word cloud as an image and hide the axes.
plt.imshow(wordcloud_bad, interpolation='bilinear')
plt.axis("off");

In [None]:
# Clean and lemmatize the "bad" review texts. The reviews are joined with a
# space; passing str(list) instead would feed the Python list repr into the
# cleaning step, where escaped characters such as "\n" leave a stray "n"
# glued to the neighbouring words.
words_bad = text_cleaning(' '.join(df_bad['text'].astype(str)))

In [None]:
# Ten most frequent trigrams in the "bad" reviews, with their counts.
trigram_counts_bad = pd.Series(nltk.ngrams(words_bad, 3)).value_counts()
trigrams_bad = trigram_counts_bad.iloc[:10]

In [None]:
# Horizontal bar chart of the top trigrams in "bad" reviews.
ax = sns.barplot(
    x=trigrams_bad.values,
    y=trigrams_bad.index,
    palette='viridis_r',
    zorder=2,
)
ax.set_title('Most common trigrams in "bad" reviews (star rating ≤ 2)')
ax.grid(which='major', axis='both', color='#C9C9C9', linestyle=':', zorder=0)
ax.set_xlim(0, 3500)
ax.set_xlabel('# of occurrences')
ax.set_ylabel('Trigrams')
# Print each bar's count just inside its right edge.
ax.bar_label(ax.containers[0], padding=-30);

---

#### ***2.5 Create word clouds for reviews with star ratings ≥ 4***

In [None]:
# "Good" reviews: star rating of 4 or higher.
df_good = df_rev.loc[df_rev['stars'].ge(4)]

In [None]:
# Start from NLTK's English stop words ...
stopwords = nltk.corpus.stopwords.words('english')

# ... and add the frequent but non-decisive words found in earlier clouds.
additional_stopwords = ['one', 'go', 'also', 'would', 'get', 'got']
stopwords += additional_stopwords

# One long string of all "good" review texts, turned into a word cloud.
text_good = " ".join(df_good['text'])
wordcloud_good = WordCloud(stopwords=stopwords).generate(text_good)

In [None]:
# Render the "good reviews" word cloud as an image and hide the axes.
plt.imshow(wordcloud_good, interpolation='bilinear')
plt.axis("off");

In [None]:
# Clean and lemmatize the "good" review texts. The reviews are joined with a
# space; passing str(list) instead would feed the Python list repr into the
# cleaning step, where escaped characters such as "\n" leave a stray "n"
# glued to the neighbouring words.
words_good = text_cleaning(' '.join(df_good['text'].astype(str)))

In [None]:
# Ten most frequent trigrams in the "good" reviews, with their counts.
trigram_counts_good = pd.Series(nltk.ngrams(words_good, 3)).value_counts()
trigrams_good = trigram_counts_good.iloc[:10]

In [None]:
# Horizontal bar chart of the top trigrams in "good" reviews.
ax = sns.barplot(
    x=trigrams_good.values,
    y=trigrams_good.index,
    palette='viridis_r',
    zorder=2,
)
ax.set_title('Most common trigrams in "good" reviews (star rating ≥ 4)')
ax.grid(which='major', axis='both', color='#C9C9C9', linestyle=':', zorder=0)
ax.set_xlim(0, 10000)
ax.set_xlabel('# of occurrences')
ax.set_ylabel('Trigrams')
# Print each bar's count just inside its right edge.
ax.bar_label(ax.containers[0], padding=-30);

---
---

### ***3. Rating and Rating***

---
---

#### ***3.1 Useful 1-5 Stars***