In [50]:
import pandas as pd 
import numpy as np 
import os
import re

In [51]:
# Raw per-topic exports; the cleaning and combining steps derive their
# 'cleaned_<name>' file names from these entries.
csv_files = [f'{topic}.csv' for topic in ('divorce', 'jokes', 'politics')]

In [52]:
def clean_content(text, author):
    """Strip forum boilerplate, the author tag, URLs and timestamps from a post.

    Args:
        text: Raw post content. Missing values (``None``/``NaN``, which is what
            pandas yields for empty CSV cells) are treated as empty text; any
            other non-string value is converted with ``str()``.
        author: Author name used to remove the ``< author >`` tag. When it is
            missing, the tag-removal step is skipped.

    Returns:
        The cleaned text with leading/trailing whitespace removed, or ``''``
        when ``text`` is missing.
    """
    # re.sub raises TypeError on a float NaN, so bail out before any regex runs.
    if pd.isna(text):
        return ''
    text = str(text)

    # 1. Remove boilerplate prefix
    text = re.sub(r'^reply to this post rate flag\s*', '', text, flags=re.IGNORECASE)
    # 2. Remove author tag based on author value
    #    e.g., if author == "FYI-ItsMe", this removes "< FYI-ItsMe >"
    if not pd.isna(author):
        pattern = rf'<\s*{re.escape(str(author))}\s*>'
        text = re.sub(pattern, '', text)
    # 3. Remove URLs
    text = re.sub(r'https?://\S+', '', text)
    # 4. Remove full date-time patterns (e.g., 2025-04-28 17:08)
    text = re.sub(r'\d{4}-\d{2}-\d{2}\s*\d{1,2}:\d{2}', '', text)
    # 5. Remove standalone time patterns (e.g., 17:08)
    text = re.sub(r'\b\d{1,2}:\d{2}\b', '', text)
    return text.strip()

In [53]:
def combine_cleaned_files(csv_files, output_file='clean1.csv'):
    """Concatenate the ``cleaned_<name>`` CSV files into one DataFrame and save it.

    Args:
        csv_files: Names of the original CSV files; for each one the file
            ``cleaned_<name>`` is read.
        output_file: Path the combined CSV is written to. Defaults to
            ``'clean1.csv'``.

    Returns:
        The combined DataFrame, re-indexed from 0.

    Raises:
        ValueError: If ``csv_files`` is empty.
    """
    csv_files = list(csv_files)
    if not csv_files:
        # pd.concat would raise ValueError as well; this message names the cause.
        raise ValueError("csv_files is empty: there is nothing to combine")

    all_data = []
    for csv_file in csv_files:
        cleaned_file = f'cleaned_{csv_file}'
        print(f"Reading {cleaned_file}...")
        all_data.append(pd.read_csv(cleaned_file))

    # Stack the frames and discard their per-file indexes.
    combined_df = pd.concat(all_data, ignore_index=True)

    # Save the combined data
    combined_df.to_csv(output_file, index=False)
    print(f"\nCombined all cleaned data into '{output_file}'")
    print(f"Total number of rows: {len(combined_df)}")
    print(f"Columns: {', '.join(combined_df.columns)}")

    return combined_df

In [54]:
for csv_file in csv_files:
    print(f"Processing {csv_file}...")

    # Read the CSV file
    df = pd.read_csv(csv_file)

    # Clean each post with its own author name. A list built from zip() is used
    # instead of DataFrame.apply(axis=1): on a frame with zero rows, apply can
    # hand back a DataFrame rather than a Series, which breaks the assignment.
    df['content_clean'] = [
        clean_content(text, author)
        for text, author in zip(df['content'], df['author'])
    ]

    # Save the cleaned data to a new CSV file
    output_file = f'cleaned_{csv_file}'
    df.to_csv(output_file, index=False)
    print(f"Saved cleaned data to {output_file}")

    # Print the first few rows of cleaned content
    print(f"\nFirst few rows of cleaned content from {csv_file}:")
    print(df['content_clean'].head())
    print("-" * 50)


Processing divorce.csv...
Saved cleaned data to cleaned_divorce.csv

First few rows of cleaned content from divorce.csv:
0    Divorce documentary seeks Christians   Divorce...
1    New child support question:   Got my filing. S...
2          The Judge will have a lot to consider. GL §
3    That's pretty much noise   He will claim X, Y,...
4    Support   Judges normally don't care about the...
Name: content_clean, dtype: object
--------------------------------------------------
Processing jokes.csv...
Saved cleaned data to cleaned_jokes.csv

First few rows of cleaned content from jokes.csv:
0    A guy goes to...   ...church to confess. "Forg...
1    Clever Words For Clever People   1. ARBITRAITO...
2    My friend and I were both born on 4/20.   We'r...
3    A old Soviet Jewish man applies for an exit vi...
4    Go on my son   A man went to church to confess...
Name: content_clean, dtype: object
--------------------------------------------------
Processing politics.csv...
Saved cleaned d

In [55]:
# Merge the per-topic cleaned files into one frame (also written to disk by the helper).
print("\nCombining all cleaned files...")
combined_data = combine_cleaned_files(csv_files=csv_files)


Combining all cleaned files...
Reading cleaned_divorce.csv...
Reading cleaned_jokes.csv...
Reading cleaned_politics.csv...

Combined all cleaned data into 'clean1.csv'
Total number of rows: 1951
Columns: Category, thread_id, post_id, title, author, time, content, nesting_level, is_first_post, page_num, url, content_clean


# Cleaning Part 2 

In [56]:
import nltk
# Fetch the NLTK resources used in this part; the recorded output shows
# each one reported as already up-to-date on this machine.
for resource in ('wordnet', 'omw-1.4', 'stopwords', 'punkt'):
    nltk.download(resource)
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# from flair.models import TextClassifier
import re

[nltk_data] Downloading package wordnet to /Users/mj/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/mj/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/mj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/mj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [57]:
# Reload the combined data from disk so this part starts from the saved file.
df = pd.read_csv('clean1.csv')

In [58]:
# Keep the identifying columns plus the cleaned text; the raw 'content' column
# and the scrape metadata are left out.
cleaned_df = df.loc[:, [
    'Category',
    'thread_id',
    'post_id',
    'title',
    'author',
    'time',
    'content_clean',
]]

In [59]:
# Persist the reduced frame; index=False keeps the row index out of the file.
cleaned_df.to_csv('clean2.csv', index=False)

# Cleaning Part 3

In [60]:
# Start Part 3 from the file saved at the end of Part 2 and preview the first rows.
df = pd.read_csv('clean2.csv')
df.head()

Unnamed: 0,Category,thread_id,post_id,title,author,time,content_clean
0,divorce,1,1,Divorce documentary seeks Christians,AttackOnMarriage,2025/4/11 11:16,Divorce documentary seeks Christians Divorce...
1,divorce,2,2,New child support question:,SignHere2208,2025/3/16 10:27,New child support question: Got my filing. S...
2,divorce,2,3,The Judge will have a lot to consider. GL §,HereAgainOC,2025/3/17 17:16,The Judge will have a lot to consider. GL §
3,divorce,2,4,That's pretty much noise,nobodyatdevnull,2025/3/18 23:53,"That's pretty much noise He will claim X, Y,..."
4,divorce,2,5,Support,heavy-handed,16:04,Support Judges normally don't care about the...


In [61]:
# Lower-case the 'Category' header so every column name uses the same casing.
df = df.rename({'Category': 'category'}, axis='columns')
df.head()


Unnamed: 0,category,thread_id,post_id,title,author,time,content_clean
0,divorce,1,1,Divorce documentary seeks Christians,AttackOnMarriage,2025/4/11 11:16,Divorce documentary seeks Christians Divorce...
1,divorce,2,2,New child support question:,SignHere2208,2025/3/16 10:27,New child support question: Got my filing. S...
2,divorce,2,3,The Judge will have a lot to consider. GL §,HereAgainOC,2025/3/17 17:16,The Judge will have a lot to consider. GL §
3,divorce,2,4,That's pretty much noise,nobodyatdevnull,2025/3/18 23:53,"That's pretty much noise He will claim X, Y,..."
4,divorce,2,5,Support,heavy-handed,16:04,Support Judges normally don't care about the...


In [62]:
# List the distinct category labels. The recorded output shows mixed casing
# ('divorce' vs 'Jokes' / 'Politics'), so any exact-match filter on this
# column has to use these spellings.
df['category'].unique()

array(['divorce', 'Jokes', 'Politics'], dtype=object)

# Dropping Unrelated Columns

In [63]:
# Remove identifiers and metadata, leaving category, title and cleaned text.
df = df.drop(['thread_id', 'post_id', 'time', 'author'], axis=1)
df.head()



Unnamed: 0,category,title,content_clean
0,divorce,Divorce documentary seeks Christians,Divorce documentary seeks Christians Divorce...
1,divorce,New child support question:,New child support question: Got my filing. S...
2,divorce,The Judge will have a lot to consider. GL §,The Judge will have a lot to consider. GL §
3,divorce,That's pretty much noise,"That's pretty much noise He will claim X, Y,..."
4,divorce,Support,Support Judges normally don't care about the...


In [64]:
# Checkpoint the frame after the column drop; index=False omits the row index.
df.to_csv('clean3.csv', index=False)

# Copying to a New DataFrame

In [65]:
# Work on an independent copy so the text normalisation below leaves `df` untouched.
new_df = df.copy(deep=True)
new_df.head()

Unnamed: 0,category,title,content_clean
0,divorce,Divorce documentary seeks Christians,Divorce documentary seeks Christians Divorce...
1,divorce,New child support question:,New child support question: Got my filing. S...
2,divorce,The Judge will have a lot to consider. GL §,The Judge will have a lot to consider. GL §
3,divorce,That's pretty much noise,"That's pretty much noise He will claim X, Y,..."
4,divorce,Support,Support Judges normally don't care about the...


# Normalize Whitespace & Casing

In [66]:
# Collapse every whitespace run to one space, trim both ends, then lower-case.
new_df = new_df.assign(
    content_clean=lambda frame: frame['content_clean']
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
    .str.lower()
)
new_df.head()

Unnamed: 0,category,title,content_clean
0,divorce,Divorce documentary seeks Christians,divorce documentary seeks christians divorce d...
1,divorce,New child support question:,new child support question: got my filing. stb...
2,divorce,The Judge will have a lot to consider. GL §,the judge will have a lot to consider. gl §
3,divorce,That's pretty much noise,"that's pretty much noise he will claim x, y, a..."
4,divorce,Support,support judges normally don't care about the s...


In [67]:
# Drop the title column by rebinding rather than inplace=True. errors='ignore'
# keeps the cell re-runnable: a second run finds no 'title' column and would
# otherwise raise KeyError.
new_df = new_df.drop(columns=['title'], errors='ignore')
new_df.head()

Unnamed: 0,category,content_clean
0,divorce,divorce documentary seeks christians divorce d...
1,divorce,new child support question: got my filing. stb...
2,divorce,the judge will have a lot to consider. gl §
3,divorce,"that's pretty much noise he will claim x, y, a..."
4,divorce,support judges normally don't care about the s...


In [68]:
# Replace every character that is neither a word character nor whitespace with
# a space. This leaves double and trailing spaces behind (e.g. where ". " or a
# final symbol used to be), so whitespace is collapsed and trimmed again.
new_df['content_clean'] = (
    new_df['content_clean']
    .str.replace(r'[^\w\s]', ' ', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)
new_df.head()

Unnamed: 0,category,content_clean
0,divorce,divorce documentary seeks christians divorce d...
1,divorce,new child support question got my filing stb...
2,divorce,the judge will have a lot to consider gl
3,divorce,that s pretty much noise he will claim x y a...
4,divorce,support judges normally don t care about the s...


In [77]:
# Checkpoint the normalised text (category + content_clean); index=False omits the row index.
new_df.to_csv('clean4.csv', index=False)

In [78]:
# Keep rows whose category is one of the listed labels (exact, case-sensitive match),
# then count how many distinct categories remain.
# NOTE(review): the listed labels are the same three values the earlier
# unique() output shows, so this filter appears to keep every row — confirm
# whether a narrower subset was intended.
df2 = new_df[new_df['category'].isin(['Jokes', 'divorce', 'Politics'])]
df2['category'].nunique()

3

In [79]:
# Write the filtered frame without the row index.
# NOTE(review): the file name mentions only jokes and divorce, while the frame
# holds three categories per the nunique() output — confirm the name is intended.
df2.to_csv('jokes_divorce.csv', index=False)