### Import modules

In [16]:
import itertools
import re

import fasttext
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from tqdm import tqdm, trange

### Load jokes' ratings

In [8]:
# Load and combine (concat) ratings from 3 different files
Y = pd.concat(pd.read_excel(f"../data/jokes_ratings_{i}.xls", header=None) for i in range(1, 4))

# Take all rows and all columns starting from the second one (first one gives the number of jokes rated by that user) and convert each `99` value to `NaN` because the value `99` corresponds to "null" = "not rated"
Y = Y.iloc[:, 1:].replace(99, float("NaN"))

# Print the first 10 rows of the dataframe
print("First 10 rows of the DataFrame:")
print(Y.head(10))

# Print the shape of the dataframe
print("\nShape of the DataFrame:")
print(Y.shape)

First 10 rows of the DataFrame:
    1     2     3     4     5     6     7     8     9     10   ...   91   \
0 -7.82  8.79 -9.66 -8.16 -7.52 -8.50 -9.85  4.17 -8.98 -4.76  ...  2.82   
1  4.08 -0.29  6.36  4.37 -2.38 -9.66 -0.73 -5.34  8.88  9.22  ...  2.82   
2   NaN   NaN   NaN   NaN  9.03  9.27  9.03  9.27   NaN   NaN  ...   NaN   
3   NaN  8.35   NaN   NaN  1.80  8.16 -2.82  6.21   NaN  1.84  ...   NaN   
4  8.50  4.61 -4.17 -5.39  1.36  1.60  7.04  4.61 -0.44  5.73  ...  5.19   
5 -6.17 -3.54  0.44 -8.50 -7.09 -4.32 -8.69 -0.87 -6.65 -1.80  ... -3.54   
6   NaN   NaN   NaN   NaN  8.59 -9.85  7.72  8.79   NaN   NaN  ...   NaN   
7  6.84  3.16  9.17 -6.21 -8.16 -1.70  9.27  1.41 -5.19 -4.42  ...  7.23   
8 -3.79 -3.54 -9.42 -6.89 -8.74 -0.29 -5.29 -8.93 -7.86 -1.60  ...  4.37   
9  3.01  5.15  5.15  3.01  6.41  5.15  8.93  2.52  3.01  8.16  ...   NaN   

    92    93    94    95    96    97    98    99    100  
0   NaN   NaN   NaN   NaN   NaN -5.63   NaN   NaN   NaN  
1 -4.95 -0.29  

### Calculate mean rating for each joke

In [9]:
Y = Y.mean()

# Describe converted dataframe
print("DataFrame description:")
print(Y.describe())

# Print the head of the dataframe
print("\nHead of the DataFrame:")
print(Y.head())

# Print the shape of the dataframe
print("\nShape of the DataFrame:")
print(Y.shape)

DataFrame description:
count    100.000000
mean       0.702018
std        1.490444
min       -3.704541
25%       -0.416897
50%        0.996985
75%        1.793516
max        3.362593
dtype: float64
Head of the DataFrame:
1    0.901997
2    0.162989
3    0.193411
4   -1.412599
5    0.235352
dtype: float64

Shape of the DataFrame:
(100,)


### Load jokes from html files

In [23]:
jokes = []

for i in range(1, 101):
    with open(f"../data/jokes/init{i}.html", "r") as file:
        soup = BeautifulSoup(file.read(), 'html.parser')
        extracted_text = ' '.join(itertools.islice(soup.stripped_strings, 1, None))
        text_no_long_whitespaces = ' '.join(extracted_text.split())
        final_joke = text_no_long_whitespaces.replace('Q.', '').replace('A.', '')
        jokes.append(final_joke.strip())

In [24]:
print("Example jokes:")

for i, joke in enumerate(jokes[:10]):
    print(f"[{i}] : {joke}")

Example jokes:
[0] : A man visits the doctor. The doctor says "I have bad news for you.You have cancer and Alzheimer's disease". The man replies "Well,thank God I don't have cancer!"
[1] : This couple had an excellent relationship going until one day he came home from work to find his girlfriend packing. He asked her why she was leaving him and she told him that she had heard awful things about him. "What could they possibly have said to make you move out?" "They told me that you were a pedophile." He replied, "That's an awfully big word for a ten year old."
[2] : What's 200 feet long and has 4 teeth?  The front row at a Willie Nelson Concert.
[3] : What's the difference between a man and a toilet?  A toilet doesn't follow you around after you use it.
[4] : What's O. J. Simpson's Internet address?  Slash, slash, backslash, slash, slash, escape.
[5] : Bill & Hillary are on a trip back to Arkansas. They're almost out of gas, so Bill pulls into a service station on the outskirts of town. 