In [None]:
# Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the preprocessed jokes; every column is backed by a PyArrow dtype
data = pd.read_csv('../data/external/preprocessed.csv', dtype_backend='pyarrow')

In [None]:
# Interpret 'date' as seconds since the Unix epoch and store 'score' as an
# Arrow-backed 64-bit integer, then preview the first rows.
data = data.assign(
    date=pd.to_datetime(data['date'], unit='s'),
    score=data['score'].astype('int64[pyarrow]'),
)
data.head()

In [None]:
# Check the data types
# (info() also reports the non-null count per column and the memory footprint)
data.info()

In [None]:
# Remove the rows with missing values
# (dropna() without arguments drops a row if ANY of its columns is missing)
data = data.dropna()

In [None]:
# Check summary statistics
# (displayed as the cell output because it is the last expression)
data.describe() 

In [None]:
# Remove jokes before 2016.
# `>=` keeps a joke stamped exactly at 2016-01-01 00:00:00 (a strict `>` would
# discard it even though it is not "before 2016"). `.copy()` detaches the
# result from the unfiltered frame so later column assignments act on an
# independent DataFrame instead of a slice.
data = data[data['date'] >= pd.Timestamp('2016-01-01')].copy()

In [None]:
# Histogram of score + 1 on a logarithmic x-axis (the +1 shift lets a score of
# zero appear on the log scale)
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['score'] + 1, bins=50, color='skyblue', kde=True, log_scale=True, ax=ax)
ax.set_title('Distribution of Scores')
ax.set_xlabel('Score')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Distribution of scores restricted to values at or below the 95th percentile
limit = data['score'].quantile(0.95)
data_95 = data.loc[data['score'] <= limit]

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data_95['score'], bins=100, color='skyblue', kde=True, ax=ax)
ax.set(
    title='Distribution of Scores (95th Percentile)',
    xlabel='Score',
    ylabel='Frequency',
)
plt.show()

In [None]:
import re
import string
from joblib import Parallel, delayed

# List of offensive words to filter out
# Module-level blocklist consulted by preprocess_jokes when it decides whether
# a joke is dropped as offensive. Every entry is lower-case; some entries are
# multi-word phrases and some are hyphenated.
# NOTE(review): several entries (e.g. "invalid", "cutting", "slave", "gay")
# are also ordinary words, so jokes that use them in a harmless sense are
# filtered out too — confirm that trade-off is intended.
offensive_words = [
    "nigger", "kike", "faggot", "retard", "sperg",
    "tranny", "trannie", "shemale", "shim", "sodomite",
    "cunt", "whore", "dyke",
    "spic", "chink", "gook", "wetback", "beaner",
    "coon", "jigaboo", "porch monkey", "tar baby",
    "raghead", "towelhead",
    "fag", "homo", "queer", "lesbo", "pansy",
     "cripple", "mongoloid",
    "rape", "molest", "pedophile", "child molester",
    "jihadi", "lardass", "anorexic", "bulimic",
    "slave", "plantation", "massa", "lynch",
    "gas chamber", "holocaust", "nazi", "hitler",
    "kkk", "klan", "white supremacist",
    "suicide", "kill yourself", "self-harm", "cutting",
    "pro-anorexia", "pro-bulimia", "thinspo", "bonespo",
    "schizo", "gimp", "invalid", "gay",
]

def preprocess_jokes(df, joke_column='joke', llm=False, blocklist=None, n_jobs=-1):
    """
    Clean, filter and de-duplicate the jokes in a DataFrame.

    Each joke is whitespace-normalised, checked against a blocklist of
    offensive terms, and rejected when the resulting text is 10 characters
    long or shorter. Rows whose joke is not a string are dropped without
    being counted. The input DataFrame is left untouched.

    Args:
        df (pd.DataFrame): DataFrame containing the jokes.
        joke_column (str): The column name containing the jokes.
        llm (bool): If True, only whitespace is normalised (case, URLs,
            punctuation and digits are kept). If False, the text is also
            lower-cased and stripped of URLs, ASCII punctuation and digits.
        blocklist (iterable of str, optional): Offensive terms to filter on.
            Defaults to the module-level ``offensive_words`` list.
        n_jobs (int): Passed to ``joblib.Parallel``. ``1`` processes the rows
            sequentially in this process without touching joblib.

    Returns:
        pd.DataFrame: Copy of ``df`` with a 'cleaned_joke' column, restricted
            to the jokes that survived filtering and de-duplication.
        int: Count of offensive jokes removed.
        int: Count of jokes removed due to insufficient length.
        int: Count of duplicate jokes removed (among the surviving jokes).
    """
    min_length = 10  # text of this length or shorter is rejected

    # One compiled pattern for the whole blocklist. The lookarounds demand a
    # non-word character (or the string edge) on both sides, so a term matches
    # as a whole word/phrase even when punctuation is attached ("word." or
    # "(word)"), and multi-word or hyphenated terms match as written.
    terms = offensive_words if blocklist is None else blocklist
    escaped_terms = [re.escape(term.lower()) for term in terms if term]
    offensive_re = (
        re.compile(r'(?<!\w)(?:' + '|'.join(escaped_terms) + r')(?!\w)', re.IGNORECASE)
        if escaped_terms
        else None  # an empty alternation would match everywhere
    )

    # Built once instead of once per joke.
    strip_table = str.maketrans('', '', string.punctuation + string.digits)

    def normalise_whitespace(text):
        """Collapse every run of whitespace to one space and trim the ends."""
        return re.sub(r'\s+', ' ', text).strip()

    def clean_joke(joke):
        """
        Clean an individual joke for simple (non-LLM) models.

        Args:
            joke (str): The joke to clean.

        Returns:
            str: Lower-cased joke without URLs, punctuation or digits.
        """
        joke = re.sub(r'http\S+', '', joke.lower())  # Remove URLs
        # Whitespace is collapsed AFTER stripping punctuation, otherwise
        # "a - b" would end up as "a  b" with a doubled space.
        return normalise_whitespace(joke.translate(strip_table))

    def is_offensive(text):
        """Return True if ``text`` contains any blocklisted term."""
        return offensive_re is not None and offensive_re.search(text) is not None

    def process_row(joke):
        """
        Process one joke.

        Returns a ``(text, reason)`` pair: ``text`` is the cleaned joke or
        None when it is rejected; ``reason`` is 'offensive', 'length' or None.
        The rejection reason is returned rather than counted here because
        joblib's default backend runs this function in worker processes,
        where increments to variables of the calling process are lost.
        """
        # Type check comes first so missing values never reach the regexes.
        if not isinstance(joke, str):
            return None, None

        raw = normalise_whitespace(joke)
        text = raw if llm else clean_joke(joke)

        # The raw text is always checked (hyphenated terms would vanish once
        # punctuation is stripped). In non-LLM mode the cleaned text is
        # checked as well, so terms disguised with punctuation or digits are
        # still caught.
        if is_offensive(raw) or (not llm and is_offensive(text)):
            return None, 'offensive'

        if len(text) <= min_length:
            return None, 'length'

        return text, None

    jokes = list(df[joke_column])
    if n_jobs == 1:
        results = [process_row(joke) for joke in jokes]
    else:
        results = Parallel(n_jobs=n_jobs)(delayed(process_row)(joke) for joke in jokes)

    cleaned = [text for text, _ in results]
    offensive_count = sum(reason == 'offensive' for _, reason in results)
    length_count = sum(reason == 'length' for _, reason in results)

    # assign() returns a new frame, so the caller's DataFrame is not modified.
    result = df.assign(cleaned_joke=cleaned)

    # Drop rows with None in 'cleaned_joke' column BEFORE counting duplicates;
    # otherwise every rejected joke after the first counts as a "duplicate".
    result = result.dropna(subset=['cleaned_joke'])
    duplicate_count = int(result.duplicated(subset=['cleaned_joke']).sum())
    result = result.drop_duplicates(subset=['cleaned_joke'])

    return result, offensive_count, length_count, duplicate_count

In [None]:
# Run the cleaning pipeline in LLM mode (whitespace-only normalisation)
cleaned_jokes, offensive_count, length_count, duplicate_count = preprocess_jokes(
    data,
    joke_column='joke',
    llm=True,
)

In [None]:
# Report how many jokes remain and how many each filter removed
summary_lines = (
    f"Cleaned Jokes: {len(cleaned_jokes)}",
    f"Offensive Jokes Removed: {offensive_count}",
    f"Jokes Removed Due to Length: {length_count}",
    f"Duplicate Jokes Removed: {duplicate_count}",
)
for summary_line in summary_lines:
    print(summary_line)

In [None]:
# Inspect column dtypes and non-null counts of the cleaned frame
cleaned_jokes.info()

In [None]:
# Cast 'score' to the Arrow-backed integer dtype and put 0 where it is missing
cleaned_jokes = cleaned_jokes.assign(
    score=cleaned_jokes['score'].astype('Int64[pyarrow]').fillna(0)
)

In [None]:
# Keep only the jokes whose score differs from 0
has_score = cleaned_jokes['score'].ne(0)
cleaned_jokes = cleaned_jokes[has_score]

In [None]:
# Replace every score with log(1 + score).
# NOTE: this cell is not idempotent — running it twice applies the transform twice.
cleaned_jokes = cleaned_jokes.assign(score=np.log1p(cleaned_jokes['score']))

In [None]:
# Distribution of the cleaned jokes' scores, cut off at their 95th percentile
limit = cleaned_jokes['score'].quantile(0.95)
data_95 = cleaned_jokes.loc[cleaned_jokes['score'] <= limit]

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data_95['score'], bins=50, color='skyblue', kde=True, ax=ax)
ax.set(
    title='Distribution of Scores (95th Percentile)',
    xlabel='Score',
    ylabel='Frequency',
)
plt.show()

In [None]:
# Count the number of words in each joke
from nltk.tokenize import word_tokenize
from joblib import Parallel, delayed

# Tokenize every cleaned joke with NLTK, spreading the work over joblib workers
token_lists = Parallel(n_jobs=-1)(
    delayed(word_tokenize)(joke) for joke in cleaned_jokes['cleaned_joke']
)
cleaned_jokes['tokenized_joke'] = token_lists

# Word count = number of tokens produced for the joke
cleaned_jokes['num_words'] = cleaned_jokes['tokenized_joke'].map(len)

# 95th percentile of the word counts; a later cell reuses it as the plotting cutoff
limit = cleaned_jokes['num_words'].quantile(0.95)

In [None]:
# Distribution of joke lengths (in words) for jokes at or below `limit`
downsample_data_95 = cleaned_jokes.loc[cleaned_jokes['num_words'] <= limit]

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(downsample_data_95['num_words'], bins=50, color='skyblue', kde=True, ax=ax)
ax.set(
    title='Distribution of Joke Lengths (95th Percentile)',
    xlabel='Length',
    ylabel='Frequency',
)
plt.show()


In [None]:
def filter_below_quantile(df, column, quantile=0.95):
    """
    Keep the rows whose ``column`` value does not exceed the given quantile.

    Args:
        df (pd.DataFrame): The input dataframe.
        column (str): Column whose quantile defines the cutoff.
        quantile (float): Quantile of ``column`` used as the inclusive upper bound.

    Returns:
        pd.DataFrame: The rows of ``df`` at or below the cutoff. The number of
            discarded rows is printed as a side effect.
    """
    cutoff = df[column].quantile(quantile)
    keep_mask = df[column] <= cutoff
    kept = df.loc[keep_mask]
    removed_count = len(df) - len(kept)
    print(f"Removed {removed_count} rows based on {column}")
    return kept

# Trim the top 5% by score first, then the top 5% by word count.
# The second cutoff is computed on the rows that survived the first filter.
cleaned_jokes = filter_below_quantile(cleaned_jokes, 'score')
cleaned_jokes = filter_below_quantile(cleaned_jokes, 'num_words')

In [None]:
# Make (up to) five classes of scores based on quantiles.
# With duplicates='drop', qcut silently merges bins whose edges coincide, which
# happens when many jokes share the same score. A hard-coded list of five labels
# then no longer matches the number of bins and qcut raises a ValueError, so the
# labels are derived from the bin edges that actually survive.
scores = cleaned_jokes['score']
_, bin_edges = pd.qcut(scores, 5, duplicates='drop', retbins=True)
labels = list(range(len(bin_edges) - 1))  # [0, 1, 2, 3, 4] when no bins were merged

# assign() builds a new frame instead of writing into a filtered slice
cleaned_jokes = cleaned_jokes.assign(
    score_class=pd.qcut(scores, 5, duplicates='drop', labels=labels)
)

cleaned_jokes['score_class'].value_counts()

In [None]:
# Import from imblearn
from imblearn.under_sampling import RandomUnderSampler

def undersampling(df, target_column, sample_strategy='auto'):
    """
    Balance the classes of ``target_column`` by random undersampling.

    Args:
        df (pd.DataFrame): The input dataframe.
        target_column (str): The column to balance.
        sample_strategy (str): Forwarded to ``RandomUnderSampler`` as
            ``sampling_strategy``.

    Returns:
        pd.DataFrame: The resampled dataframe, with the target column placed last.
    """
    # Everything except the target is treated as features
    features = df.drop(columns=[target_column])
    target = df[target_column]

    # Fixed seed so the same rows are selected on every run
    sampler = RandomUnderSampler(sampling_strategy=sample_strategy, random_state=42)
    features_resampled, target_resampled = sampler.fit_resample(features, target)

    # Stitch features and target back together column-wise
    return pd.concat([features_resampled, target_resampled], axis=1)

In [None]:
# Balance the score classes by randomly undersampling (default 'auto' strategy)
cleaned_jokes = undersampling(cleaned_jokes, 'score_class', sample_strategy='auto')

In [None]:
# Show the per-class row counts after undersampling
cleaned_jokes['score_class'].value_counts()

In [None]:
# Keep only the joke text and its class, renamed to 'text' / 'label', and
# save the result as the interim dataset.
# NOTE: `cleaned_jokes` is rebound to the renamed two-column frame, so this
# cell cannot be run a second time without re-running the cells before it.
cleaned_jokes = cleaned_jokes[['cleaned_joke', 'score_class']].rename(
    columns={'cleaned_joke': 'text', 'score_class': 'label'}
)
cleaned_jokes.to_parquet('../data/interim/cleaned_jokes.parquet', index=False)

In [None]:
# Clean for use in Pretraining
# By the time this cell runs, `cleaned_jokes` has already been reduced to the
# columns 'text' and 'label' by the interim-export cell, so selecting
# 'cleaned_joke' / 'score_class' raised a KeyError. rename() ignores keys that
# are not present, so renaming first and selecting afterwards works whichever
# column names the frame currently carries.
pretraining_data = cleaned_jokes.rename(
    columns={'cleaned_joke': 'text', 'score_class': 'label'}
)[['text', 'label']]
pretraining_data.to_parquet('../data/processed/pretraining_data.parquet', index=False)

In [None]:
# Reload the file that was just written, so the cells below show what is actually on disk
pretraining_data = pd.read_parquet('../data/processed/pretraining_data.parquet')

In [None]:
# Preview the first rows of the reloaded data
pretraining_data.head()

In [None]:
# Print a joke from each class.
# Iterating over the classes that are actually present (groupby sorts them in
# ascending order) avoids the IndexError that `.iloc[0]` raised whenever one of
# the hard-coded labels 0-4 had no rows. Within each group the original row
# order is kept, so the joke shown is still the first one of its class.
for label, class_rows in pretraining_data.groupby('label', observed=True):
    print(f"Class {label}:")
    print(class_rows['text'].iloc[0])
    print()