In [3]:
%git clone https://github.com/Hotsnown/seminaire-bordeaux-2022.git seminaire &> /dev/null
%pip install nbautoeval &> /dev/null
from evaluation.jour2.listes.listes import exo_create_list, exo_add_list, exo_lenght, exo_get_item, exo_is_empty, exo_less_than_5, exo_first_last

UsageError: Line magic function `%git` not found.


Introduction

This is the first in a series of tutorials about natural language processing (NLP). In other words, we're going to teach the machine how to read! First, we'll see how to do simple text mining on the Yelp dataset with pandas.

The Yelp dataset contains over 6 million text reviews that users wrote about businesses, together with the star rating given in each review. This dataset is interesting because it is large enough to train advanced machine learning models like LSTMs (Long Short-Term Memory networks). It is also large enough to be fairly challenging to process.

In this first post, you will learn how to:

- access and understand the Yelp dataset
- convert it to a pandas DataFrame for simple text mining.

The yelp dataset

Reading JSON and loading the pandas DataFrame


In [4]:
import json
import pandas as pd
import matplotlib.pyplot as plt

# Load reviews from a JSON-lines file (one JSON object per line) into a
# two-column DataFrame: the 'stars' rating and the review 'text'.

# read the first 100k entries
# set to -1 to process everything
stop = 100000

all_data = list()
# open input file:
# `with` closes the file even if a line fails to parse or lacks a key, and the
# encoding is pinned so decoding does not depend on the platform default.
with open('review.json', encoding='utf-8') as ifile:
    for i, line in enumerate(ifile):
        # progress marker every 10k lines
        if i%10000==0:
            print(i)
        if i==stop:
            break
        # convert the json on this line to a dict
        data = json.loads(line)
        # extract what we want
        text = data['text']
        stars = data['stars']
        # add to the data collected so far
        all_data.append([stars, text])
# create the DataFrame
df = pd.DataFrame(all_data, columns=['stars','text'])
print(df)
# df.to_hdf('revie20ws.h5','reviews')

ModuleNotFoundError: No module named 'pandas'

# Blueprint: Getting an Overview of the Data with Pandas


## Calculating Summary Statistics for Columns


My goal is not to teach you pandas here, as there are excellent tutorials around. Instead, I would like to show you how powerful and fast it is.

In IPython (or Jupyter), after running the cell above, we have interactive access to our DataFrame object, called df.

Let's start by looking at the possible values for stars:

In [None]:
# Select the rating column, then list its distinct values in ascending order.
stars = df.loc[:, 'stars']
sorted(pd.unique(stars))
# expected: [1.0, 2.0, 3.0, 4.0, 5.0]

In [None]:
# Add a 'length' column holding the number of characters of each text,
# then show the summary statistics with one row per column.
df['length'] = df.text.str.len()

df.describe().transpose()

In [None]:
# Summary statistics of the two object-typed columns, one row per column.
# NOTE(review): needs 'country' and 'speaker' columns; the frame built from
# review.json in this notebook only has 'stars' and 'text' - confirm which
# dataset this cell is meant to run on.
df.loc[:, ['country', 'speaker']].describe(include='O').transpose()

## Checking for Missing Data

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Replace missing speakers with a placeholder value.
# Assign the result back instead of calling fillna(..., inplace=True) on the
# column selection: that chained in-place form is deprecated in pandas 2.x and
# leaves `df` unchanged once copy-on-write is active.
# NOTE(review): 'unkown' looks like a typo for 'unknown'; left unchanged here
# because other code may match on the exact value.
df['speaker'] = df['speaker'].fillna('unkown')

In [None]:
# Count how often each speaker name containing 'Bush' occurs.
df.loc[df['speaker'].str.contains('Bush'), 'speaker'].value_counts()

## Plotting Value Distributions

In [None]:
# Horizontal box plot of the text lengths.
df['length'].plot.box(vert=False, figsize=(8, 1))

In [None]:
# Histogram of the text lengths using 30 bins.
df['length'].plot.hist(bins=30, figsize=(8, 2))

In [None]:
# Not in book: seaborn plot with gaussian kernel density estimate
import seaborn as sns

plt.figure(figsize=(8, 2))
# `sns.distplot` is deprecated; `histplot` is its replacement.
# stat='density' keeps the histogram normalised, as distplot did when a KDE
# curve was drawn on top of it.
sns.histplot(df['length'], bins=30, kde=True, stat='density');

## Comparing Value Distributions across Categories


In [None]:
# Compare the 'length' distribution across five selected countries, drawn
# once as a box plot and once as a violin plot with identical figure settings.
# NOTE(review): needs a 'country' column; the frame built from review.json in
# this notebook only has 'stars' and 'text' - confirm the intended dataset.
where = df['country'].isin(['USA', 'FRA', 'GBR', 'CHN', 'RUS'])
for plot_kind in ('box', 'violin'):
    g = sns.catplot(data=df[where], x="country", y="length", kind=plot_kind)
    g.fig.set_size_inches(4, 3)
    g.fig.set_dpi(100)

## Visualizing Developments over Time


In [None]:
# Line plot of the number of rows per year.
# NOTE(review): needs a 'year' column, which the visible loading cell does
# not create - confirm the intended dataset.
rows_per_year = df.groupby('year').size()
rows_per_year.plot(title="Number of Countries", figsize=(6,2))

In [None]:
# Line plot of the mean 'length' per year, with the y-axis fixed to 0..30000.
(
    df.groupby('year')[['length']]
      .mean()
      .plot(title="Avg. Speech Length", ylim=(0,30000), figsize=(6,2))
)