# Notebook 2

**NOTE:** The cell below loads the required packages and reads the location of the **Large Movie Review Dataset** from a configuration file. After downloading the data, you need to specify its parent folder: open *template_config_file.json*, insert the path to the downloaded data on your computer, and save the file as *config_file.json*. All notebooks read the path from this file, so it only has to be provided once.

In [None]:
# LOAD PACKAGES
# All imports are grouped at the top of the cell (standard library first,
# then third-party) so the notebook's dependencies are visible in one place.
import json
import os
import re

import nltk
import numpy as np
import pandas as pd
import plotly.express as px
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from scipy.stats import beta

# Fetch the NLTK 'punkt' resource.
nltk.download('punkt')

# Porter stemmer instance; not used in this cell, presumably shared by later cells.
porter_stemmer = PorterStemmer()

# Load the notebook configuration. The encoding is given explicitly because
# JSON is UTF-8 text; relying on the platform default can mis-decode paths
# that contain non-ASCII characters (e.g. on Windows).
with open('../config/config_file.json', encoding='utf-8') as f:
    config_file = json.load(f)

# Read review data folder from configuration file
movie_reviews_folder = config_file['movie_review_location']
print(f"Movie reviews will be loaded from: {movie_reviews_folder}")

### Prior and posterior distribution for Example 2.1

In [None]:
def plot_beta_pdf(theta, alpha_param, beta_param, column_name, y_range=(0, 3)):
    """Tabulate a Beta density on a grid and build a styled line plot of it.

    Parameters
    ----------
    theta : array-like
        Grid of theta values at which the density is evaluated.
    alpha_param, beta_param : float
        Shape parameters passed to ``scipy.stats.beta.pdf``.
    column_name : str
        Name of the density column; also used as the y-axis of the plot.
    y_range : sequence of two numbers, optional
        Fixed y-axis range, so that several plots share the same scale.

    Returns
    -------
    (pandas.DataFrame, plotly figure)
        The frame with columns ``"theta"`` and ``column_name``, and the
        line figure built from it (not yet shown).
    """
    density_df = pd.DataFrame({
        "theta": theta,
        column_name: beta.pdf(theta, alpha_param, beta_param),
    })

    fig = px.line(density_df, x="theta", y=column_name)
    fig.update_yaxes(range=list(y_range))
    fig.update_traces(line={'width': 5})
    fig.update_layout(
        font=dict(size=20),
        xaxis=dict(tickfont=dict(size=20)),
        yaxis=dict(tickfont=dict(size=20)),
    )
    return density_df, fig


# Grid of theta values on [0, 1]
theta_values = np.linspace(0, 1, 51)

# Prior distribution: Beta(1, 1), i.e. a flat density on [0, 1]
alpha_prior = 1
beta_prior = 1
prior_df, fig_prior = plot_beta_pdf(
    theta_values, alpha_prior, beta_prior, "Prior distribution"
)
fig_prior.show()

# Posterior distribution
coin_tosses_seq = "HHTHTTHHHT"
n = len(coin_tosses_seq)
n_H = coin_tosses_seq.count('H')

print(f"Sequence length: {n}")
print(f"Number of heads: {n_H}")

# Beta parameter update: heads are added to alpha, tails (n - n_H) to beta.
alpha_posterior = n_H+alpha_prior
beta_posterior = n-n_H+beta_prior

posterior_df, fig_posterior = plot_beta_pdf(
    theta_values, alpha_posterior, beta_posterior, "Posterior distribution"
)
fig_posterior.show()

