In [1]:
import os
import pandas as pd 
import re
import numpy as np

In [8]:
PREPROCESSED_DATA_PATH = '../data/preprocessed_data'

In [9]:
df_jokes = pd.read_csv(os.path.join(PREPROCESSED_DATA_PATH, 'jokes' ,'jokes.csv'))

___

## Preprocess Jokes

In [10]:
def remove(text):
    cleaned_text = re.sub(r'<[^>]+>', '', text)
    cleaned_text = re.sub(r'^\d+\s*', '', cleaned_text, flags=re.MULTILINE)  
    cleaned_text = cleaned_text.replace('&nbsp;', ' ').replace('\t', ' ').replace('\n', '')
    return cleaned_text.strip()

In [11]:
df_jokes['joke'] = df_jokes['joke'].apply(remove)

In [12]:
df_jokes.iloc[1].values[0]

'This couple had an excellent relationship going until one day he came homefrom work to find his girlfriend packing. He asked her why she was leaving himand she told him that she had heard awful things about him. "What could they possibly have said to make you move out?" "They told me that you were a pedophile." He replied, "That\'s an awfully big word for a ten year old."'

In [13]:
df_jokes.to_csv(os.path.join(PREPROCESSED_DATA_PATH, 'jokes' ,'jokes_preprocessed.csv'), index=False)

---

## Preprocess ratings

In [14]:
df_ratings = pd.read_csv(os.path.join(PREPROCESSED_DATA_PATH, 'ratings' ,'ratings.csv'))

First column in the jokes count rated by that User

In [15]:
df_ratings.rename(columns={'0': 'Jokes_Count'}, inplace=True)

In [16]:
df_ratings.head(5)

Unnamed: 0,Jokes_Count,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,74,-7.82,8.79,-9.66,-8.16,-7.52,-8.5,-9.85,4.17,-8.98,...,2.82,99.0,99.0,99.0,99.0,99.0,-5.63,99.0,99.0,99.0
1,100,4.08,-0.29,6.36,4.37,-2.38,-9.66,-0.73,-5.34,8.88,...,2.82,-4.95,-0.29,7.86,-0.19,-2.14,3.06,0.34,-4.32,1.07
2,49,99.0,99.0,99.0,99.0,9.03,9.27,9.03,9.27,99.0,...,99.0,99.0,99.0,9.08,99.0,99.0,99.0,99.0,99.0,99.0
3,48,99.0,8.35,99.0,99.0,1.8,8.16,-2.82,6.21,99.0,...,99.0,99.0,99.0,0.53,99.0,99.0,99.0,99.0,99.0,99.0
4,91,8.5,4.61,-4.17,-5.39,1.36,1.6,7.04,4.61,-0.44,...,5.19,5.58,4.27,5.19,5.73,1.55,3.11,6.55,1.8,1.6


Wide to long dataframe

In [17]:
ratings = df_ratings.reset_index().rename(columns={'index': 'user'})
ratings.drop('Jokes_Count', axis=1, inplace=True)
ratings = pd.melt(ratings, id_vars=['user'], value_vars=[str(i) for i in range(1, 101)], var_name='joke_id', value_name='rating')

ratings['joke_id'] = pd.to_numeric(ratings['joke_id'])

ratings = ratings.sort_values(by=['user', 'joke_id'])
ratings.reset_index(drop=True, inplace=True)


Shape matches. We have 73421 users and each has 100 jokes assigned.

$73421 \times 100 = 7342100$

In [18]:
ratings.shape

(7342100, 3)

In [19]:
ratings[ratings.duplicated()]

Unnamed: 0,user,joke_id,rating


In [20]:
ratings[ratings.isnull().any(axis=1)] 

Unnamed: 0,user,joke_id,rating


According to dataset description, 
99 = Null

In [21]:
ratings['rating'].replace(99.00, np.nan, inplace=True)
ratings.dropna(subset=['rating'], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ratings['rating'].replace(99.00, np.nan, inplace=True)


In [22]:
ratings.sample(5)

Unnamed: 0,user,joke_id,rating
1421235,14212,36,0.83
767613,7676,14,-1.31
5779164,57791,65,2.43
2351734,23517,35,1.26
1121255,11212,56,7.43


In [23]:
ratings.shape

(4136360, 3)

In [24]:
print(f'Max Rating: {ratings["rating"].max()} | Min Rating: {ratings["rating"].min()}')

Max Rating: 10.0 | Min Rating: -9.95


In [25]:
ratings.to_csv(os.path.join(PREPROCESSED_DATA_PATH, 'ratings' ,'ratings_preprocessed_ml.csv'), index=False)