In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re

In [2]:
# Load the CSV that the cleaning cells below operate on.
df = pd.read_csv('cleaned_dataset_final.csv')

In [3]:
def clean_text_custom(text):
    """Normalize a single text value.

    Steps, in order: lowercase, delete a fixed set of punctuation characters,
    delete every non-ASCII character (emojis included), collapse each run of
    whitespace to one space, and trim both ends.

    Parameters
    ----------
    text : object
        Value to clean. Non-string values are converted with ``str()``.

    Returns
    -------
    str or missing value
        The cleaned string. Missing inputs (``None`` or a float ``NaN``) are
        returned unchanged so they remain detectable as nulls instead of
        becoming the literal words "none" / "nan".
    """
    # str(None) / str(float('nan')) would otherwise inject fake tokens.
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return text

    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r'[!*\~/\-_+;:><{}]', '', text)  # Remove specified punctuation
    text = re.sub(r'[^\x00-\x7F]', '', text)  # Remove emojis and non-ASCII characters
    text = re.sub(r'\s+', ' ', text).strip()  # Normalize whitespace
    return text


In [4]:
# Run the custom cleaner over both text columns, overwriting each one
for text_col in ('Title', 'Body'):
    df[text_col] = df[text_col].apply(clean_text_custom)

In [5]:
# Delete parenthesised tags made of up to nine digits followed by m/f,
# e.g. "(25f)" or "(m)"
body_without_tags = df['Body'].str.replace(r'\(\d{0,9}[mfMF]\)', '', regex=True)

# Squeeze the whitespace runs the deletion leaves behind and trim both ends
df['Body'] = body_without_tags.str.replace(r'\s{2,}', ' ', regex=True).str.strip()


In [6]:
# For each text column: delete "(<digits>m/f)" tags, then tidy the spacing
for col in ['Title', 'Body']:
    df[col] = (
        df[col]
        .str.replace(r'\(\d{0,9}[mfMF]\)', '', regex=True)
        .str.replace(r'\s{2,}', ' ', regex=True)
        .str.strip()
    )

In [7]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file.
df.to_csv('dataset.csv', index=False)

In [8]:
# Reload the cleaned dataset from disk.
# NOTE(review): with read_csv defaults, empty fields are parsed as NaN, so text
# that was cleaned down to an empty string comes back as a null — confirm intended.
df = pd.read_csv('dataset.csv')

In [9]:
# Drop every double-quote character from the titles (literal match, not a regex)
titles_unquoted = df['Title'].str.replace('"', '', regex=False)
df['Title'] = titles_unquoted

In [10]:
# Literal tags to delete from titles, written as regex-escaped strings
remove_patterns = [
    # parenthesised tags
    r'\(22m gay\)',
    r'\(31f, scottish\)',
    r'\(rant\)',
    r'\(trigger warning\)',
    r'\(vent\)',
    # square-bracket tags
    r'\[ removed by reddit \]',
    r'\[17 f\]',
    r'\[17\]',
    r'\[18m\]',
    r'\[19 f\]',
    r'\[21f\]',
    r'\[6\]',
    r'\[crosspost\]',
    r'\[delete if not allowed\]',
    r'\[meta\]',
    r'\[mod approved\]',
    r'\[pa\]',
    r'\[rant\]',
    r'\[repost\]',
    r'\[vent\]',
    r'\[urgent\]',
    r'\[update\]',
    r'\[webinar\]'
]

# One alternation regex, so a single pass removes any of the tags
pattern = '|'.join(remove_patterns)

# Delete the tags (case-insensitively), then squeeze leftover whitespace runs
# and trim both ends of each title
df['Title'] = (
    df['Title']
    .str.replace(pattern, '', flags=re.IGNORECASE, regex=True)
    .str.replace(r'\s{2,}', ' ', regex=True)
    .str.strip()
)


In [11]:
# Preview the first five rows (head() defaults to n=5)
df.head()

Unnamed: 0,Title,Body,Category,Subcategory,Specific Disorders,Type
0,make a loud ughhhhsound like a vocal tic ?,my doctor told me 15yo my ad was bc i moved to...,Trauma & Stressor-Related Disorders,PTSD Spectrum,Adjustment Disorder,Self-Describe
1,diagnosed with adjustment disorder,is it wrong that i m really hurt by this diagn...,Trauma & Stressor-Related Disorders,PTSD Spectrum,Adjustment Disorder,Self-Describe
2,this will never feel like home,i'm a phd student diagnosed with adjustment di...,Trauma & Stressor-Related Disorders,PTSD Spectrum,Adjustment Disorder,Self-Describe
3,"even when things get better, i just get worse",idk what to do anymore and i don't know how to...,Trauma & Stressor-Related Disorders,PTSD Spectrum,Adjustment Disorder,Self-Describe
4,talk therapy,does talk therapy help in any way for someone ...,Trauma & Stressor-Related Disorders,PTSD Spectrum,Adjustment Disorder,Self-Describe


In [12]:
# Build the null mask once and derive every summary from it
null_mask = df.isnull()
nulls_per_column = null_mask.sum()

print("Null values per column:")
print(nulls_per_column)

# Grand total across all columns
print("\nTotal null values:", nulls_per_column.sum())

# Rows holding a null in at least one column
print("\nRows with null values:")
print(df[null_mask.any(axis=1)])

Null values per column:
Title                 2
Body                  1
Category              0
Subcategory           0
Specific Disorders    0
Type                  0
dtype: int64

Total null values: 3

Rows with null values:
                                                   Title  \
6978                                                 NaN   
7675   i m in recovery from ana but eat all night and...   
10014                                                NaN   

                                                    Body  \
6978   i wish this didn't effect my relationships. i ...   
7675                                                 NaN   
10014                    i should i could i want too but   

                    Category                         Subcategory  \
6978   Personality Disorders         Cluster C (Anxious/Fearful)   
7675        Eating Disorders  Restrictive/Compensatory Disorders   
10014         Mood Disorders                   Bipolar Disorders   

                  

In [13]:
# Keep only the rows in which every column is populated
df_cleaned = df.dropna(axis=0, how='any')

In [14]:
# Per-column null counts after dropping incomplete rows
print(df_cleaned.isna().sum())

Title                 0
Body                  0
Category              0
Subcategory           0
Specific Disorders    0
Type                  0
dtype: int64


In [15]:
# Rows whose key value(s) occur more than once in `df`.
# keep=False flags every member of a duplicate group, not only the repeats.
# NOTE(review): this inspects `df`, not `df_cleaned` — confirm that is intended.
duplicate_titles = df.loc[df.duplicated(subset=['Title'], keep=False)]
duplicate_bodies = df.loc[df.duplicated(subset=['Body'], keep=False)]
duplicate_title_body = df.loc[df.duplicated(subset=['Title', 'Body'], keep=False)]

print("Duplicate Titles:\n", duplicate_titles)
print("\nDuplicate Bodies:\n", duplicate_bodies)
print("\nDuplicate Title + Body:\n", duplicate_title_body)

# Row counts of the three reports
print("\nCounts:")
print("Duplicate Titles:", len(duplicate_titles))
print("Duplicate Bodies:", len(duplicate_bodies))
print("Duplicate Title+Body pairs:", len(duplicate_title_body))

Duplicate Titles:
                                                    Title  \
4                                           talk therapy   
5                                     just got diagnosed   
10                                             diagnosis   
22                                           i m curious   
30                                           please help   
...                                                  ...   
34113                                           klonopin   
34132  it sounds pathetic, but speaking to ai really ...   
34174                                   i couldn t do it   
34184                                     making friends   
34200                                               tips   

                                                    Body  \
4      does talk therapy help in any way for someone ...   
5      hello, got diagnosed with this and i have noo ...   
10     i had no idea that i was diagnosed with adjust...   
22     do you get ve

In [16]:
# Keep the first occurrence of each (Title, Body) pair, then renumber the rows
df_cleaned1 = (
    df_cleaned
    .drop_duplicates(subset=['Title', 'Body'], keep='first')
    .reset_index(drop=True)
)

# Before/after shapes show how many rows were dropped
print("Original shape:", df_cleaned.shape)
print("Cleaned shape:", df_cleaned1.shape)

Original shape: (34212, 6)
Cleaned shape: (34121, 6)


In [17]:
# Per-column null counts after de-duplication
print(df_cleaned1.isna().sum())

Title                 0
Body                  0
Category              0
Subcategory           0
Specific Disorders    0
Type                  0
dtype: int64


In [18]:
# Number of whitespace-separated tokens in each Body
df_cleaned1['Body_WordCount'] = (
    df_cleaned1['Body'].astype(str).map(lambda body: len(body.split()))
)

# Shortest body length, and every row that has exactly that length
min_word_count = df_cleaned1['Body_WordCount'].min()
min_rows = df_cleaned1.loc[df_cleaned1['Body_WordCount'].eq(min_word_count)]

print("Minimum word count in Body:", min_word_count)
print("\nRows with minimum word count:\n", min_rows[['Title', 'Body', 'Body_WordCount']])

Minimum word count in Body: 1

Rows with minimum word count:
                                                    Title  \
139           the discord server for anyone interested )   
581    would recommend for everyone to watch these vi...   
1489      why do i get drunk and want to clean my house?   
1982   i have never gotten my hunger cues back even a...   
2102   came across my journal from when i was 11 year...   
...                                                  ...   
33329  do you guys tell people that you have social a...   
33584            what has been your best and worst jobs?   
33752  i feel bad about myself after looking at beaut...   
33914  komi can t communicate creator prioritizes fam...   
34085  it will be nice to experience what it is to li...   

                                                    Body  Body_WordCount  
139                              httpsdiscord.ggxrsymhfr               1  
581    [httpswww.youtube.comwatch?v=npisz268ooi](http...           

In [19]:
# Word-count threshold for the short-body check. The earlier comment and
# variable name said 100, but the value actually compared against is 25.
MIN_BODY_WORDS = 25

# Calculate word count for each Body (whitespace-separated tokens)
df_cleaned1['Body_WordCount'] = df_cleaned1['Body'].astype(str).apply(lambda x: len(x.split()))

# Count rows whose Body has fewer than MIN_BODY_WORDS words
short_body_count = (df_cleaned1['Body_WordCount'] < MIN_BODY_WORDS).sum()

print(f"Number of rows with Body word length below {MIN_BODY_WORDS}:", short_body_count)

Number of rows with Body word length below 25: 2392


In [20]:
# Minimum number of words a Body needs in order to be kept. The filter has
# always used 25; the old comment and printed message wrongly said 100.
MIN_BODY_WORDS = 25

# Calculate word count for each Body (whitespace-separated tokens)
df_cleaned1['Body_WordCount'] = df_cleaned1['Body'].astype(str).apply(lambda x: len(x.split()))

# Keep rows with at least MIN_BODY_WORDS words and renumber the index
df_cleaned2 = df_cleaned1[df_cleaned1['Body_WordCount'] >= MIN_BODY_WORDS].reset_index(drop=True)

print("Original shape:", df_cleaned1.shape)
# The message is built from the threshold so it cannot drift from the filter
print(f"After removing rows with Body word length < {MIN_BODY_WORDS}:", df_cleaned2.shape)

Original shape: (34121, 7)
After removing rows with Body word length < 100: (31729, 7)


In [21]:
# Save the length-filtered dataset (row index omitted).
# NOTE(review): df_cleaned2 still carries the helper Body_WordCount column, so
# it is written to the CSV too — confirm that is intended.
df_cleaned2.to_csv('updated_dataset.csv', index=False)

## Final check of the dataset

In [22]:
# Reload the saved file so the checks below run against what is on disk.
df = pd.read_csv('updated_dataset.csv')

In [23]:
import pandas as pd



# --- Nulls: per-column counts and the grand total ---
null_counts = df.isnull().sum()
print("===== NULL VALUE CHECK =====")
print(null_counts)
print("Total null values:", null_counts.sum())

# --- Duplicates: rows repeating an earlier (Title, Body) pair ---
print("\n===== DUPLICATE CHECK =====")
duplicate_count = df.duplicated(subset=['Title', 'Body']).sum()
print("Duplicate Title+Body rows:", duplicate_count)

# --- Shortest body, measured in whitespace-separated tokens ---
print("\n===== MINIMUM BODY WORD LENGTH =====")
df['Body_WordCount'] = df['Body'].astype(str).map(lambda body: len(body.split()))
min_word_count = df['Body_WordCount'].min()
print("Minimum Body word length:", min_word_count)




===== NULL VALUE CHECK =====
Title                 0
Body                  0
Category              0
Subcategory           0
Specific Disorders    0
Type                  0
Body_WordCount        0
dtype: int64
Total null values: 0

===== DUPLICATE CHECK =====
Duplicate Title+Body rows: 0

===== MINIMUM BODY WORD LENGTH =====
Minimum Body word length: 25


# dataset part 1 random sort

In [3]:
# Load the part-1 file that is shuffled below.
df_part1 = pd.read_csv('cleaned_dataset_final_part1.csv')

In [4]:
# frac=1 draws every row exactly once in random order; the fixed seed makes
# the shuffle repeatable
shuffled_rows = df_part1.sample(frac=1, random_state=42)

# Renumber the rows 0..n-1 in their new order
df_part1 = shuffled_rows.reset_index(drop=True)

In [5]:
# Save the shuffled part-1 frame. This previously wrote `df`, so the shuffled
# rows in `df_part1` were never persisted under this file name.
df_part1.to_csv('cleaned_dataset_final_part1_randomsort.csv', index=False)

In [6]:
# True as soon as a single cell anywhere in the frame is null
has_nulls = df_part1.isnull().values.any()
print("Any null values in dataset?:", has_nulls)


Any null values in dataset?: False


In [7]:
# Per-column null counts, reduced to the columns that have at least one null
null_counts = df_part1.isnull().sum()
columns_with_nulls = null_counts.loc[null_counts.gt(0)]
print(columns_with_nulls)


Series([], dtype: int64)


# Splitting the dataset into 2 parts

In [2]:
# Load the dataset that is split into parts below.
df = pd.read_csv('dataset.csv')

In [4]:
# Row position of the cut; with an odd row count the second half is one row longer
mid_index = len(df) // 2

# Slice at the cut and give each half its own 0-based index
df_part1, df_part2 = (
    half.reset_index(drop=True)
    for half in (df.iloc[:mid_index], df.iloc[mid_index:])
)


In [5]:
# Write each half to its own CSV, without the row index
for out_path, part in (("dataset_part1.csv", df_part1), ("dataset_part2.csv", df_part2)):
    part.to_csv(out_path, index=False)

# Splitting the dataset into 3 parts

In [4]:
# Cut positions at one third and two thirds of the row count (integer division)
total_len = len(df)
split1, split2 = total_len // 3, 2 * total_len // 3

# Consecutive boundaries define the three slices; each slice gets a fresh index
bounds = [0, split1, split2, total_len]
df_part1, df_part2, df_part3 = (
    df.iloc[start:stop].reset_index(drop=True)
    for start, stop in zip(bounds[:-1], bounds[1:])
)

In [5]:
# Write each third to its own CSV, without the row index
outputs = (
    ("3dataset_part1.csv", df_part1),
    ("3dataset_part2.csv", df_part2),
    ("3dataset_part3.csv", df_part3),
)
for out_path, part in outputs:
    part.to_csv(out_path, index=False)