# Notebook 1

The cell below loads the required packages and initializes a string specifying the location of the **Large Movie Review Dataset**. It assumes that the data is stored in the parent folder of this notebook file within a folder named _movie_reviews_data_. You might have to change the string in the variable _data_folder_ to match the folder structure you created when downloading the files. 

In [6]:
# LOAD PACKAGES
import os
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Fetch the NLTK resources used in this notebook: 'punkt' backs nltk.word_tokenize
# and 'stopwords' backs stopwords.words('english'). Without the second download a
# fresh environment fails with a LookupError when the stop word list is requested.
nltk.download('punkt')
nltk.download('stopwords')

# Stemmer instance shared by the cells below
porter_stemmer = PorterStemmer()

# Location of the Large Movie Review Dataset relative to this notebook
data_folder = "../movie_reviews_data"

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/hannoreuvers/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Movie review examples

In [2]:
# Specify files to display
filename_positive_review = "3247_10.txt"
filename_negative_review = "2331_1.txt"

# Read reviews and print.
# The encoding is pinned so the text does not depend on the platform's default
# locale encoding, which open() falls back to when none is given.
# NOTE(review): UTF-8 is assumed for the dataset files — confirm.
train_folder = os.path.join(data_folder, "aclImdb", "train")
with open(os.path.join(train_folder, "pos", filename_positive_review), 'r', encoding="utf-8") as file:
    selected_pos_review = file.read()
with open(os.path.join(train_folder, "neg", filename_negative_review), 'r', encoding="utf-8") as file:
    selected_neg_review = file.read()
print("Selected positive review:\n", selected_pos_review, "\n")
print("Selected negative review:\n", selected_neg_review)


Selected positive review:
 I don't know why I like this movie so well, but I never get tired of watching it. 

Selected negative review:
 I wouldn't rent this one even on dollar rental night.


### Print list of stopwords

In [3]:
# English stop word list shipped with NLTK (same corpus reader that was imported
# as `stopwords` in the setup cell)
english_stopwords = stopwords.words('english')
print(f"Number of stop words: {len(english_stopwords)}")
# One bullet per stop word
for stop_w in english_stopwords:
    print("- " + stop_w)

Number of stop words: 179
- i
- me
- my
- myself
- we
- our
- ours
- ourselves
- you
- you're
- you've
- you'll
- you'd
- your
- yours
- yourself
- yourselves
- he
- him
- his
- himself
- she
- she's
- her
- hers
- herself
- it
- it's
- its
- itself
- they
- them
- their
- theirs
- themselves
- what
- which
- who
- whom
- this
- that
- that'll
- these
- those
- am
- is
- are
- was
- were
- be
- been
- being
- have
- has
- had
- having
- do
- does
- did
- doing
- a
- an
- the
- and
- but
- if
- or
- because
- as
- until
- while
- of
- at
- by
- for
- with
- about
- against
- between
- into
- through
- during
- before
- after
- above
- below
- to
- from
- up
- down
- in
- out
- on
- off
- over
- under
- again
- further
- then
- once
- here
- there
- when
- where
- why
- how
- all
- any
- both
- each
- few
- more
- most
- other
- some
- such
- no
- nor
- not
- only
- own
- same
- so
- than
- too
- very
- s
- t
- can
- will
- just
- don
- don't
- should
- should've
- now
- d
- ll
- m
- o
-

In [21]:
print(f"Original review: {selected_pos_review}")

# Step 1: Remove punctuation.
# Apostrophes are not in the character class, so contractions such as "don't"
# reach the tokenizer intact.
review_no_punctuation = re.sub(r'[".,!?;-]+', '', selected_pos_review)
print(f"Review without punctuation: {review_no_punctuation}")

# Step 2: Tokenize string
review_tokens = nltk.word_tokenize(review_no_punctuation)
print(f"Output of nltk.word_tokenize: {review_tokens}")

# Step 3: Enforce lower case and omit non-text.
# str.isalpha() rejects every token that contains a non-letter (e.g. "n't").
lower_case_characters_only = [token.lower() for token in review_tokens if token.isalpha()]
print(f"Omit strings containing non-text and set to lower case: {lower_case_characters_only}")

# Step 4: Remove stop words
output_word_list = [word for word in lower_case_characters_only if word not in english_stopwords]
print(f"Final result having removed stop words: {output_word_list}")

def process_imdb_review_for_Bayes(imdb_review, stopwords, print_details = False):
    """Turn a raw review string into a list of lower-case tokens without stop words.

    The steps are: strip the punctuation characters ``" . , ! ? ; -``, tokenize
    with ``nltk.word_tokenize``, keep only purely alphabetic tokens (lower-cased),
    and drop every token contained in `stopwords`.

    Parameters
    ----------
    imdb_review : str
        Raw text of the review.
    stopwords : iterable of str
        Words to remove. They are compared against the lower-cased tokens.
    print_details : bool, optional
        If True, print the original review and the processed token list.

    Returns
    -------
    list of str
        The processed tokens, in their original order.
    """
    # Step 1: Remove all punctuation
    without_punctuation = re.sub(r'[".,!?;-]+', '', imdb_review)
    # Step 2: Tokenize
    tokens = nltk.word_tokenize(without_punctuation)
    # Step 3: All lower case and omit non-text
    alphabetic_tokens = [token.lower() for token in tokens if token.isalpha()]
    # Step 4: Remove stop words.
    # Use the caller-supplied list (not a notebook-level global); a set makes
    # each membership test constant time.
    stopword_set = set(stopwords)
    output_for_Bayes = [word for word in alphabetic_tokens if word not in stopword_set]

    if print_details:
        print("\nORIGINAL: ", imdb_review)
        print("RESULT AFTER PRE-PROCESSING:", output_for_Bayes, "\n")

    return output_for_Bayes


# Run the full pipeline on the positive example; print_details=True also echoes
# the original text and the processed token list.
process_imdb_review_for_Bayes(selected_pos_review, english_stopwords, print_details=True)

Original review: I don't know why I like this movie so well, but I never get tired of watching it.
Review without pucntuation: I don't know why I like this movie so well but I never get tired of watching it
Output of nltk.word_tokenize: ['I', 'do', "n't", 'know', 'why', 'I', 'like', 'this', 'movie', 'so', 'well', 'but', 'I', 'never', 'get', 'tired', 'of', 'watching', 'it']
Omit strings containing non-text and set to lower case: ['i', 'do', 'know', 'why', 'i', 'like', 'this', 'movie', 'so', 'well', 'but', 'i', 'never', 'get', 'tired', 'of', 'watching', 'it']
Final result having removed stop words: ['know', 'like', 'movie', 'well', 'never', 'get', 'tired', 'watching']

ORIGINAL:  I don't know why I like this movie so well, but I never get tired of watching it.
RESULT AFTER PRE-PROCESSING: ['know', 'like', 'movie', 'well', 'never', 'get', 'tired', 'watching'] 



['know', 'like', 'movie', 'well', 'never', 'get', 'tired', 'watching']

### Function

In [None]:
# Tokenize the negative example review
tokenized_review = nltk.word_tokenize(selected_neg_review)

# Reduce every token to its Porter stem
output_for_Bayes = list(map(porter_stemmer.stem, tokenized_review))

print(output_for_Bayes)


In [None]:
from datetime import datetime
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Daily calendar running from the first to the last date (both included)
date_format = "%Y-%m-%d"
startDate, endDate = (
    datetime.strptime(day, date_format) for day in ("1990-01-01", "1993-12-31")
)
dateList = pd.date_range(start=startDate, end=endDate, freq='D')

# Load the count data and attach the calendar as the "time" column
asthmaData = pd.read_csv("data/asthma_data.csv")
asthmaData["time"] = dateList

# Figure 1: counts over time, with one tick at the start of each year
tick_years = range(1990, 1995)
tick_positions = [f"{year}-01-01" for year in tick_years]
tick_labels = list(tick_years)
figure1 = sns.lineplot(x="time", y="Count", data=asthmaData)
figure1.set_xticks(tick_positions, labels=tick_labels)
plt.show()

In [None]:
from dateutil.relativedelta import relativedelta

y = asthmaData["Count"]
dates = asthmaData["time"]
print(y)
print(dates)

dateString = "1993-05-31"

# Interval ending on dateString and starting 365 days earlier
intervalEndDate = datetime.strptime(dateString, "%Y-%m-%d")
intervalStartDate = intervalEndDate - relativedelta(days = 365)

print(intervalEndDate)
print(intervalStartDate)

# Boolean mask for the observations inside the interval. Both bounds are
# inclusive, so for daily data the selection holds 366 observations.
temp = (intervalStartDate<=dates) & (dates<=intervalEndDate)
# Select once and reuse the result (the selection used to be evaluated twice,
# the first time with its result thrown away).
y_in_interval = y[temp]

print("Say something: ")
print(y_in_interval.mean())








In [None]:
# Imports
from datetime import timedelta
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Plot settings
sns.set_style("darkgrid")

def remove_outlier(date_string, data=None):
    """Overwrite the usage on one date with the rounded mean of its neighbours.

    The neighbours are the rows dated 1 to TIME_DELTA days before and 1 to
    TIME_DELTA days after `date_string`. The average is taken over the
    neighbours that actually exist, so dates near the start or end of the
    series (or next to gaps) are not biased toward zero. The frame is modified
    in place. Nothing is changed when the date is absent or when there is no
    neighbour with a usage value.

    Parameters
    ----------
    date_string : str or datetime-like
        Date of the observation to overwrite; parsed with ``pd.to_datetime``.
    data : pandas.DataFrame, optional
        Frame with a "date_col" and a "usage" column. Defaults to the
        notebook-level ``BostonLibData``.
        NOTE(review): "date_col" has to be comparable with a Timestamp, i.e.
        presumably datetime64 already — confirm where the frame is loaded.

    Returns
    -------
    None
    """
    TIME_DELTA = 5
    # Fall back to the notebook-level frame only when none is supplied
    frame = BostonLibData if data is None else data
    # Cast input date_string to datetime
    input_date = pd.to_datetime(date_string)
    date_values = frame["date_col"]

    # Window before date_string: [input_date - TIME_DELTA, input_date - 1]
    is_before = (input_date - timedelta(days=TIME_DELTA) <= date_values) & (
        date_values <= input_date - timedelta(days=1)
    )
    # Window after date_string: [input_date + 1, input_date + TIME_DELTA]
    is_after = (input_date + timedelta(days=1) <= date_values) & (
        date_values <= input_date + timedelta(days=TIME_DELTA)
    )

    neighbour_usage = frame.loc[is_before | is_after, "usage"].dropna()
    is_outlier = date_values == input_date
    if neighbour_usage.empty or not is_outlier.any():
        # Nothing to average over, or nothing to replace
        return

    # Assign new value
    frame.loc[is_outlier, "usage"] = round(neighbour_usage.mean())





# NOTE(review): this reads the asthma CSV, yet the plot below and remove_outlier
# expect "date_col" and "usage" columns — confirm the intended input file.
BostonLibData = pd.read_csv("data/asthma_data.csv")

# Example outlier corrections (currently disabled)
#remove_outlier('2019-11-01')
#remove_outlier('2021-07-17')

print(BostonLibData)

# NOTE(review): not used by the code below — presumably left over from
# experimenting with remove_outlier.
input_date = pd.to_datetime('2018-01-19')

# Create line plot of usage over time.
# `color` is used because seaborn ignores `palette` when no `hue` variable is given.
fig1 = sns.lineplot(data=BostonLibData, x="date_col", y="usage", color="C0")

plt.show()




## Plot Poisson pmfs
Plot the pmf of a Poisson-distributed random variable at *x* = 0, 1, ..., *xmax* for each of the parameter values in *mulist*.

In [None]:
# Imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import poisson
import seaborn as sns
sns.set_style("darkgrid")
 
# Calculate probabilities
xmax = 50
mulist = [1, 10, 25]

# Support points 0, 1, ..., xmax (xmax itself included)
x = np.arange(0, xmax + 1)

# One long-format frame per mu; they are concatenated once after the loop
frames = []
for mu in mulist:
    prob = poisson.pmf(x, mu)
    mu_vector = np.full(x.shape, mu)
    frames.append(pd.DataFrame({"x": x, "probability": prob, "mu": mu_vector}))

# ignore_index gives every row a unique label; an empty mulist still yields a
# frame with the expected columns.
if frames:
    result_df = pd.concat(frames, ignore_index=True)
else:
    result_df = pd.DataFrame(columns=["x", "probability", "mu"])

# Create scatterplot with one colour per value of mu.
# The palette is sized from the data, so changing mulist needs no edit here.
n_mu_levels = result_df["mu"].nunique()
mu_palette = [f"C{i}" for i in range(n_mu_levels)]
fig2 = sns.scatterplot(data=result_df, x="x", y="probability", hue="mu", palette=mu_palette)
# Place the legend outside the axes, to the right of the plot
sns.move_legend(fig2, "upper left", bbox_to_anchor=(1, 1))
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
# Draw 1000 Poisson(5) samples from a seeded generator so that re-running the
# notebook reproduces the same figure.
rng = np.random.default_rng(42)
s = rng.poisson(5, 1000)

# One unit-wide bin centred on every observed integer. With density=True the
# bar heights are then the empirical relative frequencies of each count.
bin_edges = np.arange(s.min(), s.max() + 2) - 0.5
plt.hist(s, bins=bin_edges, density=True)
plt.show()



In [None]:
class PoissonModel:
    """
    Container for the data of a Poisson model for count data.

    Attributes
    ------------
    All attributes are private (name-mangled): the dates, the matrix X and the
    counts y handed to the constructor, plus the class-level model type label
    "Count Data".

    Methods
    -------
    __str__()
        Return a short text summary: start date, end date and sample size.
    """

    __modelType = "Count Data"

    def __init__(self, dates, X, y):
        """
        Parameters
        ----------
        dates : sequence of date-like values
            One entry per observation.
            NOTE(review): assumed to be in chronological order — confirm.
        X : Numpy object or Pandas dataframe of dimension (Txp)
            Stored as given. NOTE(review): presumably the explanatory
            variables — confirm.
        y : Pandas dataframe of dimension (Tx1)
            Stored as given. NOTE(review): presumably the observed counts —
            confirm.
        """
        self.__dates = dates
        self.__X = X
        self.__y = y


    def __str__(self):
        """Summarise the model: first date, last date and number of observations."""
        date_values = list(self.__dates)
        # "n/a" keeps the summary printable for a model without observations
        start_date = date_values[0] if date_values else "n/a"
        end_date = date_values[-1] if date_values else "n/a"
        return (
            "Poisson model \n"
            f" Start date: {start_date}\n"
            f" End date: {end_date}\n"
            f" Sample size: {len(self.__y)}\n"
        )


class Dog:
    """A dog described by its name and its age in years."""
    # Shared by every instance
    species = "Canis familiaris"

    def __init__(self, name, age):
        # Both values are kept in private (name-mangled) attributes
        self.__name, self.__age = name, age

    def __str__(self):
        """Return a two-line description of the dog."""
        pieces = (
            "My dog is named ",
            self.__name,
            " and is ",
            str(self.__age),
            " years old.\nHave fun!",
        )
        return "".join(pieces)

miles = Dog("Miles", 4)

# PoissonModel needs `dates`, `X` and `y`. The bare `PoissonModel()` call that
# used to sit here raised a TypeError and stopped the cell before the prints ran.
# TODO: instantiate it with real data, e.g. PoissonModel(dates, X, y).

print(miles.species)
print(miles)