 # Preparation
 ## Install & import modules

In [None]:
! pip install seaborn


In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns


 ## Read remote dataset

 The data is in this git repository: [ML-boot-camp/ratebeer.git](https://github.com/ML-boot-camp/ratebeer.git).

 The data is located in the `ratebeer/data/` folder.


In [None]:
# Raw download URL of the cleaned sample dataset (parquet) in the ratebeer repository.
file_url = "https://github.com/ML-boot-camp/ratebeer/raw/master/data/ratebeer_sample_clean.parquet"


In [None]:
# Load the remote parquet file designated by `file_url` into a DataFrame.
df_ratebeer = pd.read_parquet(file_url)


In [None]:
# One row per reviewer: number of reviews written and mean overall rating
# (values rounded to one decimal place).
df_reviewers = (
    df_ratebeer
    .groupby(by="review_profileName")
    .agg(
        number_of_reviews=pd.NamedAgg(column="review_profileName", aggfunc="count"),
        average_rating=pd.NamedAgg(column="review_overall", aggfunc="mean"),
    )
    .round(decimals=1)
    .reset_index()
)


In [None]:
# Build the master table: every review enriched with its reviewer's aggregate
# statistics, a datetime `review_time` and the binary target `positive_review`.
df_master = (
    df_ratebeer
    .merge(
        df_reviewers,
        on="review_profileName",
        how="inner",
        validate="m:1",  # raise if df_reviewers holds duplicated profile names
    )
    .assign(
        # Vectorised conversion of POSIX seconds to datetimes. Unlike a row-wise
        # `pd.Timestamp.fromtimestamp`, the result does not depend on the local
        # timezone of the machine running the notebook (tz-naive UTC values).
        review_time=lambda df: pd.to_datetime(df.review_time.astype("int64"), unit="s"),
        # 1 when the overall rating is at or above the dataset median, else 0.
        positive_review=lambda df: (
            df.review_overall >= df.review_overall.median()
        ).astype(int),
    )
)


 # `df_master` DataFrame

 ## General information
 Have a first overview of the dataframe size, i.e. number of rows & columns.

 Methods you'll need:
 - [`pd.DataFrame.shape`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.shape.html)

In [None]:
# (number of rows, number of columns)
df_master.shape


 Get some information about the content of the dataframe:
 - number of null values per column
 - data type of each column
 - memory usage

 Methods you'll need:
 - [`pd.DataFrame.isnull`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.isnull.html)
 - [`pd.DataFrame.sum`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sum.html)
 - [`pd.DataFrame.dtypes`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.dtypes.html)
 - [`pd.DataFrame.info`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.info.html)

In [None]:
# Number of null values in each column.
df_master.isnull().sum()


In [None]:
# Data type of each column.
df_master.dtypes


In [None]:
# Column summary including memory usage; "deep" also measures the content of object columns.
df_master.info(memory_usage="deep")


 Show a sample of the data

 Methods you'll need:
 - [`pd.DataFrame.head`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.head.html)
 - [`pd.DataFrame.sample`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sample.html)

 Bonus: display the transpose of the dataframe for better readability when having lots of columns using:
 - [`pd.DataFrame.T`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.T.html)

In [None]:
# First five rows.
df_master.head(5)


In [None]:
# Five random rows, transposed so that each column of the frame is shown as a row.
# NOTE(review): no random_state is set, so the rows differ on every run.
df_master.sample(5).T


 Compute statistics to understand the content of each column.

 Methods you'll need:
 - [`pd.DataFrame.describe`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.describe.html)
 - [`pd.Series.hist`](https://pandas.pydata.org/docs/reference/api/pandas.Series.hist.html)
 - [`pd.DataFrame.fillna`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.fillna.html)

 Bonus: fill NaN values with an empty string `""` for better readability, using `pd.DataFrame.fillna` (listed above).

In [None]:
# Summary statistics for every column (numeric or not); statistics that do not
# apply to a column are NaN and are blanked out for readability.
df_master.describe(include="all").fillna("").T


 ## Quantitative variables

 `review_overall`

 all numeric columns containing reviews: review_* & average_rating

In [None]:
# Numeric rating columns, plus the per-reviewer average rating.
review_columns = [
    "review_appearance",
    "review_aroma",
    "review_palate",
    "review_taste",
    "review_overall",
    "average_rating",
]
# (width, height) of a single histogram; the figure stacks one per column.
HISTOGRAM_SIZE = (6, 3)
df_master[review_columns].plot.hist(
    bins=range(21),
    subplots=True,
    figsize=(HISTOGRAM_SIZE[0], HISTOGRAM_SIZE[1] * len(review_columns)),
)


 `number_of_reviews`

In [None]:
# Histogram of the per-reviewer review count attached to each review.
df_master["number_of_reviews"].plot.hist(bins=100)


In [None]:
# Same histogram with both axes on a log scale.
df_master["number_of_reviews"].plot.hist(bins=100, loglog=True)


 If interested, you can read: [Zipf's Law on Wikipedia](https://en.wikipedia.org/wiki/Zipf's_law)

 `beer_ABV`

In [None]:
# Histogram of the beer_ABV column.
df_master["beer_ABV"].plot.hist(bins=100)


In [None]:
# Same histogram with a log-scaled count axis.
df_master["beer_ABV"].plot.hist(bins=100, logy=True)


 `review_time` column

In [None]:
# Summary statistics of the review timestamps.
df_master["review_time"].describe()


In [None]:
# Number of reviews over time.
df_master["review_time"].hist(bins=100)


In [None]:
# Number of reviews over time, split by the positive_review target.
sns.histplot(data=df_master, x="review_time", bins=100, hue="positive_review")


 ## Nominal and ordinal variables:
 - `positive_review`
 - `beer_style`
 - `beer_name`
 - `beer_beerId`
 - `beer_brewerId`
 - `review_profileName`

 `positive_review`

In [None]:
# Summary statistics of the binary target.
df_master["positive_review"].describe()


In [None]:
# Class balance of the target.
df_master["positive_review"].value_counts().plot.bar()


 `beer_style`

In [None]:
# Count, number of distinct styles, most frequent style and its frequency.
df_master["beer_style"].describe()


In [None]:
# Number of reviews per beer style, most frequent first.
df_master["beer_style"].value_counts().plot.bar()


In [None]:
# Same bar chart with a log-scaled count axis.
df_master["beer_style"].value_counts().plot.bar(logy=True)


 `beer_name`

In [None]:
# Count, number of distinct names, most frequent name and its frequency.
df_master["beer_name"].describe()


In [None]:
# x: number of reviews a beer name received; y: how many names received that many.
df_master["beer_name"].value_counts().value_counts().plot.bar()


In [None]:
# Same distribution with a log-scaled y axis.
df_master["beer_name"].value_counts().value_counts().plot.bar(logy=True)


In [None]:
# Same distribution as a line plot with both axes on a log scale.
df_master["beer_name"].value_counts().value_counts().plot(loglog=True, marker=".")


 `beer_beerId`

In [None]:
# Summary statistics of the beer identifier column.
df_master["beer_beerId"].describe()


In [None]:
# x: number of reviews a beer id received; y: how many ids received that many.
df_master["beer_beerId"].value_counts().value_counts().plot.bar()


In [None]:
# Same distribution with a log-scaled y axis.
df_master["beer_beerId"].value_counts().value_counts().plot.bar(logy=True)


 `beer_brewerId`

In [None]:
# Summary statistics of the brewer identifier column.
df_master["beer_brewerId"].describe()


In [None]:
# x: number of reviews a brewer id received; y: how many ids received that many.
df_master["beer_brewerId"].value_counts().value_counts().plot.bar()


In [None]:
# Same distribution with a log-scaled y axis.
df_master["beer_brewerId"].value_counts().value_counts().plot.bar(logy=True)


In [None]:
# Same distribution as a line plot with both axes on a log scale.
df_master["beer_brewerId"].value_counts().value_counts().plot(loglog=True, marker=".")


 `review_profileName`

In [None]:
# Count, number of distinct reviewers, most active reviewer and their review count.
df_master["review_profileName"].describe()


In [None]:
# x: number of reviews written by a profile; y: how many profiles wrote that many.
df_master["review_profileName"].value_counts().value_counts().plot.bar()


In [None]:
# Same distribution with a log-scaled y axis.
df_master["review_profileName"].value_counts().value_counts().plot.bar(logy=True)


In [None]:
# Same distribution as a line plot with both axes on a log scale.
df_master["review_profileName"].value_counts().value_counts().plot(loglog=True, marker=".")


 Plot the histogram of the `review_time` column

 Hint:
 - [`pd.Series.hist`](https://pandas.pydata.org/docs/reference/api/pandas.Series.hist.html)

 `review_overall`

In [None]:
# Distribution of the overall rating, coloured by the target.
sns.histplot(data=df_master, x="review_overall", bins=range(21), hue="positive_review")


 `review_appearance`

In [None]:
# Distribution of the appearance rating, coloured by the target.
sns.histplot(data=df_master, x="review_appearance", bins=range(21), hue="positive_review")


 `review_aroma`

In [None]:
# Distribution of the aroma rating, coloured by the target.
sns.histplot(data=df_master, x="review_aroma", bins=range(21), hue="positive_review")


 `review_palate`

In [None]:
# Distribution of the palate rating, coloured by the target.
sns.histplot(data=df_master, x="review_palate", bins=range(21), hue="positive_review")


 `review_taste`

In [None]:
# Distribution of the taste rating, coloured by the target.
sns.histplot(data=df_master, x="review_taste", bins=range(21), hue="positive_review")


 `beer_style`

In [None]:
# Number of reviews per beer style, coloured by the target.
# `beer_style` is a categorical (string) column: numeric bin edges such as
# `bins=range(21)` have no meaning for it, so seaborn is left to draw one bar
# per category.
(
    (df_master)
    .pipe(sns.histplot, x="beer_style", hue="positive_review")
)


 ## High cardinality variables

In [None]:
# For a random sample of reviews, histogram of how often the reviewed beer id
# appears within that sample, coloured by the target.
(
    (df_master)
    .sample(10000)
    # `map` against the value counts is a direct lookup: far faster than
    # `replace` with a large dict and it always yields a numeric column.
    .assign(beer_beerId_noccurences=lambda df: df.beer_beerId.map(df.beer_beerId.value_counts()))
    .pipe(sns.histplot, x="beer_beerId_noccurences", bins=range(21), hue="positive_review")
)


In [None]:
# For a random sample of reviews, histogram of how often the brewer id of the
# reviewed beer appears within that sample, coloured by the target.
(
    (df_master)
    .sample(10000)
    # `map` against the value counts is a direct lookup: far faster than
    # `replace` with a large dict and it always yields a numeric column.
    .assign(beer_brewerId_noccurences=lambda df: df.beer_brewerId.map(df.beer_brewerId.value_counts()))
    .pipe(sns.histplot, x="beer_brewerId_noccurences", bins=range(21), hue="positive_review")
)

In [None]:
# For a random sample of reviews, histogram of how often the reviewer's profile
# name appears within that sample, coloured by the target.
(
    (df_master)
    .sample(10000)
    # `map` against the value counts turns the names into integer counts
    # directly; `replace` with a dict is much slower and leaves the dtype of
    # the string column to pandas' downcasting rules.
    .assign(review_profileName_noccurences=lambda df: df.review_profileName.map(df.review_profileName.value_counts()))
    .pipe(sns.histplot, x="review_profileName_noccurences", bins=range(21), hue="positive_review")
)


 ## Multivariate plots

 Plot a scatter matrix of the numerical variables, colored by the target column
 `positive_review`.

 Hint:
 - [`pd.DataFrame.select_dtypes`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.select_dtypes.html)
 - [`pd.plotting.scatter_matrix`](https://pandas.pydata.org/docs/reference/api/pandas.plotting.scatter_matrix.html)

In [None]:
# Rating columns shown in the scatter matrix.
# NOTE: this rebinds `review_columns`, dropping "average_rating" from the list
# defined earlier in the notebook.
review_columns = [
    "review_appearance",
    "review_aroma",
    "review_palate",
    "review_taste",
    "review_overall",
]

def add_jitter(df, jitter=0.4):
    """Return `df` shifted by uniform random noise.

    An independent offset is drawn for every cell from the interval
    [-jitter, jitter) using NumPy's global random state, and added to `df`.
    """
    noise = np.random.uniform(low=-jitter, high=jitter, size=df.shape)
    return df + noise

# Scatter matrix of the first 10000 reviews' rating columns, jittered to reduce
# overplotting and coloured by the positive_review target.
pd.plotting.scatter_matrix(
    add_jitter(df_master[review_columns].head(10000)),
    figsize=(15, 15),
    s=10,
    alpha=0.1,
    c=df_master["positive_review"].head(10000),
)


 ## String manipulation
 Using the [`pd.Series.str`](https://pandas.pydata.org/docs/reference/api/pandas.Series.str.html) API

 ### `review_text` column:

 Compute the length of the texts in the dataset.

 Methods you'll need:
 - [`pd.Series.str.len`](https://pandas.pydata.org/docs/reference/api/pandas.Series.str.len.html)
 - [`pd.Series.str`](https://pandas.pydata.org/docs/reference/api/pandas.Series.str.html)

 Bonus: plot a histogram of the values, with log values, using:
 - [`pd.Series.plot.hist`](https://pandas.pydata.org/docs/reference/api/pandas.Series.plot.hist.html)

 Is it a Power law distribution ?

In [None]:
# Histogram of the review text lengths (in characters), one bin per length below 2000.
df_master["review_text"].str.len().plot.hist(bins=range(2000))


In [None]:
# Same histogram with a log-scaled count axis.
df_master["review_text"].str.len().plot.hist(bins=range(2000), logy=True)


 Compute the frequency of the most used letters in the texts

 Methods you'll need:
 - [`pd.Series.str.lower`](https://pandas.pydata.org/docs/reference/api/pandas.Series.str.lower.html)
 - [`pd.Series.str.split`](https://pandas.pydata.org/docs/reference/api/pandas.Series.str.split.html)
 - [`pd.Series.explode`](https://pandas.pydata.org/docs/reference/api/pandas.Series.explode.html)
 - [`pd.Series.value_counts`](https://pandas.pydata.org/docs/reference/api/pandas.Series.value_counts.html)
 - [`pd.Series.head`](https://pandas.pydata.org/docs/reference/api/pandas.Series.head.html)

 Bonus: plot a bar chart of the values, with a log scale, using:
 - [`pd.Series.plot.bar`](https://pandas.pydata.org/docs/reference/api/pandas.Series.plot.bar.html)

 Is it a Power law distribution ?

In [None]:
# Character frequencies over all review texts (lower-cased, spaces excluded),
# most frequent first.
df_most_used_letters = (
    (df_master.review_text)
    .str.lower()
    .str.split("")  # empty pattern: one token per character
    .explode()
    # Drop spaces, and also the empty strings that the empty-pattern split
    # emits at the boundaries of each text, so only real characters are counted.
    .loc[lambda x: ~x.isin(["", " "])]
    .value_counts()
)


In [None]:
# The 40 most frequent characters.
df_most_used_letters.head(40).plot.bar()


In [None]:
# The 40 most frequent characters, with a log-scaled count axis.
df_most_used_letters.head(40).plot.bar(logy=True)


 Compute the frequency of the most used words in the texts

 Methods you'll need:
 - [`pd.Series.str.len`](https://pandas.pydata.org/docs/reference/api/pandas.Series.str.len.html)
 - [`pd.Series.str.split`](https://pandas.pydata.org/docs/reference/api/pandas.Series.str.split.html)
 - [`pd.Series.explode`](https://pandas.pydata.org/docs/reference/api/pandas.Series.explode.html)
 - [`pd.Series.value_counts`](https://pandas.pydata.org/docs/reference/api/pandas.Series.value_counts.html)
 - [`pd.Series.head`](https://pandas.pydata.org/docs/reference/api/pandas.Series.head.html)

 Bonus: plot a bar chart of the values, with a log scale, using:
 - [`pd.Series.plot.bar`](https://pandas.pydata.org/docs/reference/api/pandas.Series.plot.bar.html)

 Is it a Power law distribution ?

In [None]:
# Word frequencies over all review texts, most frequent first.
word_frequencies = (
    (df_master.review_text)
    .str.lower()
    # `regex=True` is required: since pandas 2.0 `str.replace` treats the
    # pattern as a literal string by default, which would leave the text untouched.
    .str.replace(r"[^a-z ]", "", regex=True)  # keep only letters and spaces
    # Splitting on runs of whitespace (no explicit separator) makes collapsing
    # repeated spaces unnecessary and never yields empty-string "words".
    .str.split()
    .explode()
    .value_counts()
)
word_frequencies


In [None]:
# The 100 most frequent words.
word_frequencies.head(100).plot.bar(figsize=(12, 4))


In [None]:
# The 100 most frequent words, with a log-scaled count axis.
word_frequencies.head(100).plot.bar(logy=True, figsize=(12, 4))


In [None]:
# Frequency against rank for the 1000 most frequent words, both axes on a log scale.
word_frequencies.head(1000).reset_index(drop=True).plot(loglog=True, marker=".")


 ## Detailed text analysis
 Words associated with positive & negative reviews

In [None]:
# For the 200 most frequent words of the first 100000 reviews: mean overall
# rating of the reviews containing the word and number of occurrences,
# rendered with a red-to-green colour gradient.
(
    df_master
    .head(100000)
    .assign(
        tokenized_text=lambda df: (df.review_text)
        .str.lower()
        # `regex=True` is required: since pandas 2.0 `str.replace` matches the
        # pattern literally by default. Every run of non-letters becomes one space.
        .str.replace(r"[^a-z]+", " ", regex=True)
        # Whitespace split: no empty-string tokens for leading/trailing spaces.
        .str.split()
    )
    .loc[:, ["review_overall", "tokenized_text"]]
    .explode("tokenized_text")
    # Aggregate the single rating column directly: this gives flat "mean" and
    # "count" columns indexed by word, with no spurious index column to style.
    .groupby("tokenized_text")
    .review_overall
    .agg(["mean", "count"])
    .sort_values(by="count", ascending=False)
    .head(200)
    .style.background_gradient(cmap="RdYlGn")
)

 Count the occurrences of each day of the week in `review_time` & plot a bar diagram,
 using the `dt` (datetime) pandas API.

 Hint:
 - [`pd.Series.astype`](https://pandas.pydata.org/docs/reference/api/pandas.Series.astype.html)
 - [`pd.Series.dt.dayofweek`](https://pandas.pydata.org/docs/reference/api/pandas.Series.dt.dayofweek.html)
 - [`pd.Series.value_counts`](https://pandas.pydata.org/docs/reference/api/pandas.Series.value_counts.html)
 - [`pd.Series.sort_index`](https://pandas.pydata.org/docs/reference/api/pandas.Series.sort_index.html)
 - [`pd.Series.plot.bar`](https://pandas.pydata.org/docs/reference/api/pandas.Series.plot.bar.html)

In [None]:
# Number of reviews per day of the week (0 = Monday ... 6 = Sunday), in weekday order.
df_master["review_time"].dt.dayofweek.value_counts().sort_index().plot.bar()


 Count the percentage of each rating as a function of the year in `review_time` & plot
 a line diagram. E.g: in 2020, 55% of ratings were 5, 15% of ratings were 4, ...

 Hint:
 - [`pd.Series.astype`](https://pandas.pydata.org/docs/reference/api/pandas.Series.astype.html)
 - [`pd.Series.dt.year`](https://pandas.pydata.org/docs/reference/api/pandas.Series.dt.year.html)
 - [`pd.DataFrame.pivot_table`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.pivot_table.html)
 - [`pd.DataFrame.apply`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.apply.html)

In [None]:
# Share of each overall rating over time, drawn as a kernel density estimate
# normalised to fill the vertical axis at every date (multiple="fill").
# The plot reads `review_time` directly, so no derived `year` column is needed.
(
    df_master
    .pipe(sns.displot, x="review_time", hue="review_overall", multiple="fill", kind="kde")
)
