# Import libraries

In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Load Data

In [2]:
df1 = pd.read_csv('Datasets/R_abusiveparents/abusedteens_labeled.csv')
df2 = pd.read_csv('Datasets/R_lgtbteens/lgtbteens_labeled.csv')
df3 = pd.read_csv('Datasets/R_teenagers/teenagers_labeled.csv')

# Combine DataFrames

df = pd.concat([df1, df2, df3], ignore_index=True)

print(df.columns)
print(df.head())

Index(['Posts', 'label'], dtype='object')
                                               Posts  \
0  so my mum is textbook abusive but not often en...   
1  hello i have never made a reddit post before b...   
2  i m have tried to be nice to my parents and su...   
3  i need some advice it will be a bit long but i...   
4  hello im f from the ph with no close friends o...   

                          label  
0                      Positive  
1  Depression/Suicidal Thoughts  
2                      Positive  
3  Depression/Suicidal Thoughts  
4                       Neutral  


# Tokenization

In [3]:
# Ensure all entries in 'Posts' are strings and handle missing values
df['Posts'] = df['Posts'].astype(str)  # Convert all values to strings
df['Posts'].fillna('', inplace=True)   # Replace NaN values with empty strings

df['tokenized_text'] = df['Posts'].apply(word_tokenize)
print(df.head())

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Posts'].fillna('', inplace=True)   # Replace NaN values with empty strings


                                               Posts  \
0  so my mum is textbook abusive but not often en...   
1  hello i have never made a reddit post before b...   
2  i m have tried to be nice to my parents and su...   
3  i need some advice it will be a bit long but i...   
4  hello im f from the ph with no close friends o...   

                          label  \
0                      Positive   
1  Depression/Suicidal Thoughts   
2                      Positive   
3  Depression/Suicidal Thoughts   
4                       Neutral   

                                      tokenized_text  
0  [so, my, mum, is, textbook, abusive, but, not,...  
1  [hello, i, have, never, made, a, reddit, post,...  
2  [i, m, have, tried, to, be, nice, to, my, pare...  
3  [i, need, some, advice, it, will, be, a, bit, ...  
4  [hello, im, f, from, the, ph, with, no, close,...  


# Remove Stopwords

In [4]:
stop_words = set(stopwords.words('english'))

# Function to remove stopwords
def remove_stopwords(tokens):
    return [word for word in tokens if word.lower() not in stop_words]

df['filtered_text'] = df['tokenized_text'].apply(remove_stopwords)

# Lemmatization

In [5]:
lemmatizer = WordNetLemmatizer()
def lemmatize_words(tokens):
    return [lemmatizer.lemmatize(word) for word in tokens]

df['lemmatized_text'] = df['filtered_text'].apply(lemmatize_words)


# Save tokenized text as CSV

In [None]:
df.to_csv('token_datasets.csv', index=False)