# Jakub Bandurski, Michał Bryzik, Kacper Gruca

In [5]:
import pandas as pd
df = pd.read_csv('../data/tifu.csv')


## Text cleaning

In [6]:
df.head()

Unnamed: 0,ups,num_comments,upvote_ratio,score,documents,tldr,title
0,115.0,23.0,0.88,115.0,this actually happened a couple of years ago. ...,confuse a 5th grade girl for a boy in front of...,gender-stereotyping
1,16.0,12.0,0.79,16.0,"it was last october, but i'm feeling the fall-...","i found my estranged dad, thought i loved him ...",telling my dad that i love him.
2,55.0,10.0,0.85,55.0,so i had the brilliant idea to use veet hair r...,had my balls burned by sauron and was left dev...,i was deveeted...
3,90.0,20.0,0.92,90.0,today i was going to have a bath after a long ...,peppermint + bath = burning cold ladybits.,wanting a pepperminty bath.
4,81.0,18.0,0.79,81.0,"i haven't had a bath in practically years so, ...","got too high and too hot in the bath, almost c...",having a spliff in the bath.


In [7]:
# Import the regex module
import re

# Define a function to clean text using regex
def regex_clean(text):
    # Remove all characters that are not alphanumeric, spaces, newlines, or periods
    text = re.sub("[^a-zA-Z0-9' \n\.]", '', text)
    
    # Replace multiple spaces with a single space
    text = re.sub(' +', ' ', text)
    
    # Remove all characters that are not alphanumeric or spaces
    text = re.sub(r"[^\w\s']",'', text)
    
    # Remove all digits
    text = re.sub('\d', '', text)
    
    # Convert the text to lowercase
    text = text.lower()
    
    # Return the cleaned text
    return text

In [8]:
# clean columns
# the function from above did not work without the if statement because some of the values were not strings
df["documents_clean"] = df["documents"].apply(lambda x: regex_clean(x) if isinstance(x, str) else x)
df["tldr_clean"] = df["tldr"].apply(lambda x: regex_clean(x) if isinstance(x, str) else x)
df["title_clean"] = df["title"].apply(lambda x: regex_clean(x) if isinstance(x, str) else x)

In [9]:
# check this non string rows, probably they will be NaN
non_string_rows_document = df["documents"].apply(lambda x: not isinstance(x, str))
non_string_rows_tldr = df["tldr"].apply(lambda x: not isinstance(x, str))
non_string_rows_title = df["title"].apply(lambda x: not isinstance(x, str))

# take the index of the non string rows
non_string_index_document = non_string_rows_document[non_string_rows_document == True].index
non_string_index_tldr = non_string_rows_tldr[non_string_rows_tldr == True].index
non_string_index_title = non_string_rows_title[non_string_rows_title == True].index

# print the indexes
print(non_string_index_document)
print(non_string_index_tldr)
print(non_string_index_title)

Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')


In [10]:
# check these rows
print(df.iloc[35560])
print(df.iloc[35559])

ups                                                             25.0
num_comments                                                    22.0
upvote_ratio                                                    0.76
score                                                           25.0
documents          this actually happened less than 10 minutes ag...
tldr               smoked half a dozen joints at work, left to mc...
title                                 accidently getting 2 kids high
documents_clean    this actually happened less than  minutes ago ...
tldr_clean         smoked half a dozen joints at work left to mcd...
title_clean                            accidently getting  kids high
Name: 35560, dtype: object
ups                                                              3.0
num_comments                                                     4.0
upvote_ratio                                                    0.72
score                                                            3.0
documen

In [11]:
# these rows are just missing values, so we can drop them
df = df.drop(df.index[[35560, 35559]])

In [12]:
df.head()

Unnamed: 0,ups,num_comments,upvote_ratio,score,documents,tldr,title,documents_clean,tldr_clean,title_clean
0,115.0,23.0,0.88,115.0,this actually happened a couple of years ago. ...,confuse a 5th grade girl for a boy in front of...,gender-stereotyping,this actually happened a couple of years ago i...,confuse a th grade girl for a boy in front of ...,genderstereotyping
1,16.0,12.0,0.79,16.0,"it was last october, but i'm feeling the fall-...","i found my estranged dad, thought i loved him ...",telling my dad that i love him.,it was last october but i'm feeling the fallou...,i found my estranged dad thought i loved him a...,telling my dad that i love him
2,55.0,10.0,0.85,55.0,so i had the brilliant idea to use veet hair r...,had my balls burned by sauron and was left dev...,i was deveeted...,so i had the brilliant idea to use veet hair r...,had my balls burned by sauron and was left dev...,i was deveeted
3,90.0,20.0,0.92,90.0,today i was going to have a bath after a long ...,peppermint + bath = burning cold ladybits.,wanting a pepperminty bath.,today i was going to have a bath after a long ...,peppermint bath burning cold ladybits,wanting a pepperminty bath
4,81.0,18.0,0.79,81.0,"i haven't had a bath in practically years so, ...","got too high and too hot in the bath, almost c...",having a spliff in the bath.,i haven't had a bath in practically years so a...,got too high and too hot in the bath almost co...,having a spliff in the bath


## Stop words removal

In [13]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [14]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\micha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
stop_words = set(stopwords.words("english"))

# remove additional subject-specific stopwards
for el in ["tifu", "nsfw", "today", "I", "fucked", "up", "fuck"]:
    stop_words.add(el)

for sw in stop_words:
    if "'" in sw:
        sw.replace("'","")

In [17]:
# removing stopwords
def remove_stopwords(text, word_tokenizer=word_tokenize, stop_words=stop_words):
    word_tokens = word_tokenize(text)
    return " ".join([w for w in word_tokens if not w in stop_words])

In [18]:
# remove stopwords from columns
df["documents_no_stopwords"] = df["documents_clean"].apply(lambda x: remove_stopwords(text=x, word_tokenizer=word_tokenize, stop_words=stop_words))
df["tldr_no_stopwords"] = df["tldr_clean"].apply(lambda x: remove_stopwords(text=x, word_tokenizer=word_tokenize, stop_words=stop_words))
df["title_no_stopwords"] = df["title_clean"].apply(lambda x: remove_stopwords(text=x, word_tokenizer=word_tokenize, stop_words=stop_words))

In [19]:
df.head()

Unnamed: 0,ups,num_comments,upvote_ratio,score,documents,tldr,title,documents_clean,tldr_clean,title_clean,documents_no_stopwords,tldr_no_stopwords,title_no_stopwords
0,115.0,23.0,0.88,115.0,this actually happened a couple of years ago. ...,confuse a 5th grade girl for a boy in front of...,gender-stereotyping,this actually happened a couple of years ago i...,confuse a th grade girl for a boy in front of ...,genderstereotyping,actually happened couple years ago grew german...,confuse th grade girl boy front half class kid...,genderstereotyping
1,16.0,12.0,0.79,16.0,"it was last october, but i'm feeling the fall-...","i found my estranged dad, thought i loved him ...",telling my dad that i love him.,it was last october but i'm feeling the fallou...,i found my estranged dad thought i loved him a...,telling my dad that i love him,last october 'm feeling fallout mom died cance...,found estranged dad thought loved getting know...,telling dad love
2,55.0,10.0,0.85,55.0,so i had the brilliant idea to use veet hair r...,had my balls burned by sauron and was left dev...,i was deveeted...,so i had the brilliant idea to use veet hair r...,had my balls burned by sauron and was left dev...,i was deveeted,brilliant idea use veet hair removal gel ol ' ...,balls burned sauron left deveeted,deveeted
3,90.0,20.0,0.92,90.0,today i was going to have a bath after a long ...,peppermint + bath = burning cold ladybits.,wanting a pepperminty bath.,today i was going to have a bath after a long ...,peppermint bath burning cold ladybits,wanting a pepperminty bath,going bath long day painting kids faces carniv...,peppermint bath burning cold ladybits,wanting pepperminty bath
4,81.0,18.0,0.79,81.0,"i haven't had a bath in practically years so, ...","got too high and too hot in the bath, almost c...",having a spliff in the bath.,i haven't had a bath in practically years so a...,got too high and too hot in the bath almost co...,having a spliff in the bath,n't bath practically years finally moving hous...,got high hot bath almost cooked like lobster,spliff bath


## Stemming

In [20]:
from nltk.stem import PorterStemmer

In [21]:
def stemming(text, stemmer=None):
    if stemmer is None:
        stemmer = PorterStemmer()
    
    output = []
    for word in text.split():
        try:
            stemmed_word = stemmer.stem(word)
            output.append(stemmed_word)
        except:
            output.append(word)
    return " ".join(output)

In [22]:
# apply stemming to columns
df["documents_stemmed"] = df["documents_no_stopwords"].apply(stemming)
df["tldr_stemmed"] = df["tldr_no_stopwords"].apply(stemming)
df["title_stemmed"] = df["title_no_stopwords"].apply(stemming)

In [23]:
df.head()

Unnamed: 0,ups,num_comments,upvote_ratio,score,documents,tldr,title,documents_clean,tldr_clean,title_clean,documents_no_stopwords,tldr_no_stopwords,title_no_stopwords,documents_stemmed,tldr_stemmed,title_stemmed
0,115.0,23.0,0.88,115.0,this actually happened a couple of years ago. ...,confuse a 5th grade girl for a boy in front of...,gender-stereotyping,this actually happened a couple of years ago i...,confuse a th grade girl for a boy in front of ...,genderstereotyping,actually happened couple years ago grew german...,confuse th grade girl boy front half class kid...,genderstereotyping,actual happen coupl year ago grew germani went...,confus th grade girl boy front half class kid ...,genderstereotyp
1,16.0,12.0,0.79,16.0,"it was last october, but i'm feeling the fall-...","i found my estranged dad, thought i loved him ...",telling my dad that i love him.,it was last october but i'm feeling the fallou...,i found my estranged dad thought i loved him a...,telling my dad that i love him,last october 'm feeling fallout mom died cance...,found estranged dad thought loved getting know...,telling dad love,last octob 'm feel fallout mom die cancer last...,found estrang dad thought love get know got kn...,tell dad love
2,55.0,10.0,0.85,55.0,so i had the brilliant idea to use veet hair r...,had my balls burned by sauron and was left dev...,i was deveeted...,so i had the brilliant idea to use veet hair r...,had my balls burned by sauron and was left dev...,i was deveeted,brilliant idea use veet hair removal gel ol ' ...,balls burned sauron left deveeted,deveeted,brilliant idea use veet hair remov gel ol ' da...,ball burn sauron left deveet,deveet
3,90.0,20.0,0.92,90.0,today i was going to have a bath after a long ...,peppermint + bath = burning cold ladybits.,wanting a pepperminty bath.,today i was going to have a bath after a long ...,peppermint bath burning cold ladybits,wanting a pepperminty bath,going bath long day painting kids faces carniv...,peppermint bath burning cold ladybits,wanting pepperminty bath,go bath long day paint kid face carniv would g...,peppermint bath burn cold ladybit,want pepperminti bath
4,81.0,18.0,0.79,81.0,"i haven't had a bath in practically years so, ...","got too high and too hot in the bath, almost c...",having a spliff in the bath.,i haven't had a bath in practically years so a...,got too high and too hot in the bath almost co...,having a spliff in the bath,n't bath practically years finally moving hous...,got high hot bath almost cooked like lobster,spliff bath,n't bath practic year final move hous bath dec...,got high hot bath almost cook like lobster,spliff bath


## Lemmatization

Continuing our analysis we noticed that in the most common words are words like "get" and "got", so we decided to perform also lemmatization to handle irregular verbs and other exceptions.

In [24]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [25]:
# Download required NLTK resources
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\micha\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\micha\AppData\Roaming\nltk_data...


True

In [26]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    lemmatized_output = []
    for word in text.split():
        lemma = lemmatizer.lemmatize(word, get_wordnet_pos(word))
        lemmatized_output.append(lemma)
    return " ".join(lemmatized_output)

In [27]:
# apply leammatization to columns
df["documents_lem"] = df["documents_stemmed"].apply(lemmatize_text)
df["tldr_lem"] = df["tldr_stemmed"].apply(lemmatize_text)
df["title_lem"] = df["title_stemmed"].apply(lemmatize_text)

In [28]:
df.isna().sum()

ups                       0
num_comments              0
upvote_ratio              0
score                     0
documents                 0
tldr                      0
title                     0
documents_clean           0
tldr_clean                0
title_clean               0
documents_no_stopwords    0
tldr_no_stopwords         0
title_no_stopwords        0
documents_stemmed         0
tldr_stemmed              0
title_stemmed             0
documents_lem             0
tldr_lem                  0
title_lem                 0
dtype: int64

In [29]:
df.head()

Unnamed: 0,ups,num_comments,upvote_ratio,score,documents,tldr,title,documents_clean,tldr_clean,title_clean,documents_no_stopwords,tldr_no_stopwords,title_no_stopwords,documents_stemmed,tldr_stemmed,title_stemmed,documents_lem,tldr_lem,title_lem
0,115.0,23.0,0.88,115.0,this actually happened a couple of years ago. ...,confuse a 5th grade girl for a boy in front of...,gender-stereotyping,this actually happened a couple of years ago i...,confuse a th grade girl for a boy in front of ...,genderstereotyping,actually happened couple years ago grew german...,confuse th grade girl boy front half class kid...,genderstereotyping,actual happen coupl year ago grew germani went...,confus th grade girl boy front half class kid ...,genderstereotyp,actual happen coupl year ago grow germani go g...,confus th grade girl boy front half class kid ...,genderstereotyp
1,16.0,12.0,0.79,16.0,"it was last october, but i'm feeling the fall-...","i found my estranged dad, thought i loved him ...",telling my dad that i love him.,it was last october but i'm feeling the fallou...,i found my estranged dad thought i loved him a...,telling my dad that i love him,last october 'm feeling fallout mom died cance...,found estranged dad thought loved getting know...,telling dad love,last octob 'm feel fallout mom die cancer last...,found estrang dad thought love get know got kn...,tell dad love,last octob 'm feel fallout mom die cancer last...,found estrang dad thought love get know get kn...,tell dad love
2,55.0,10.0,0.85,55.0,so i had the brilliant idea to use veet hair r...,had my balls burned by sauron and was left dev...,i was deveeted...,so i had the brilliant idea to use veet hair r...,had my balls burned by sauron and was left dev...,i was deveeted,brilliant idea use veet hair removal gel ol ' ...,balls burned sauron left deveeted,deveeted,brilliant idea use veet hair remov gel ol ' da...,ball burn sauron left deveet,deveet,brilliant idea use veet hair remov gel ol ' da...,ball burn sauron left deveet,deveet
3,90.0,20.0,0.92,90.0,today i was going to have a bath after a long ...,peppermint + bath = burning cold ladybits.,wanting a pepperminty bath.,today i was going to have a bath after a long ...,peppermint bath burning cold ladybits,wanting a pepperminty bath,going bath long day painting kids faces carniv...,peppermint bath burning cold ladybits,wanting pepperminty bath,go bath long day paint kid face carniv would g...,peppermint bath burn cold ladybit,want pepperminti bath,go bath long day paint kid face carniv would g...,peppermint bath burn cold ladybit,want pepperminti bath
4,81.0,18.0,0.79,81.0,"i haven't had a bath in practically years so, ...","got too high and too hot in the bath, almost c...",having a spliff in the bath.,i haven't had a bath in practically years so a...,got too high and too hot in the bath almost co...,having a spliff in the bath,n't bath practically years finally moving hous...,got high hot bath almost cooked like lobster,spliff bath,n't bath practic year final move hous bath dec...,got high hot bath almost cook like lobster,spliff bath,n't bath practic year final move hous bath dec...,get high hot bath almost cook like lobster,spliff bath


## Save results

In [32]:
#df[["ups", "num_comments", "upvote_ratio", "score", "documents_stemmed", "tldr_stemmed", "title_stemmed"]].to_pickle("data/tifu_cleaned.pkl")
df[["ups", "num_comments", "upvote_ratio", "score", "documents", "tldr", "title"]].to_pickle("../data/tifu_cleaned.pkl")

In [65]:
df_nsfw = pd.read_excel('../data/data_nsfw.xlsx')

In [66]:
df_nsfw = df_nsfw[df_nsfw['isAd'] == False][['body', 'numberOfComments', 'over18', 'upVotes']]


In [80]:
df_nsfw['body'] = df_nsfw['body'].str.replace(';', '')
df_nsfw['body'] = df_nsfw['body'].apply(lambda x: re.sub(r'TL;?DR:?.*', '', x, flags=re.IGNORECASE))


In [40]:
df_nsfw.head(5)

Unnamed: 0,body,numberOfComments,over18,upVotes
0,didnt happen like year ago feel like fever dre...,95,False,392
1,f alway say 's ugli old stuff like nobodi like...,244,True,867
2,hi im f dental assist clean accident get prick...,265,False,920
3,context wife f parent beauti toddler ym f rent...,138,True,1372
4,yesterday 's fuckup spill go crazi f frequent ...,31,False,45


In [82]:
# clear out values: 
df_nsfw["body"] = df_nsfw["body"].apply(lambda x: regex_clean(x) if isinstance(x, str) else x)
df_nsfw["body"] = df_nsfw["body"].apply(lambda x: remove_stopwords(text=x, word_tokenizer=word_tokenize, stop_words=stop_words))
df_nsfw["body"] = df_nsfw["body"].apply(stemming)
df_nsfw["body"] = df_nsfw["body"].apply(lemmatize_text)

In [81]:
print(df_nsfw.iloc[3,0])

For context, wife (34f) and I (30m) are parents to a beautiful toddler (1y6m F) and rent a house is what I like to call a landlord special. And this happened two days ago. 

My wife and I have been planning a date night all week. I surprised her by saying I’d like to go to the movies, which I don’t do. I was never a fan of the movies before and haven’t been to a theatre since pre-Covid. I’ve taken the theaters for granted and I genuinely miss them- silly me. 

I had the Saturday off, she went to work and dropped off our daughter in the morning at her mom’s so my MIL could watch her while we go on our date night. Great! Night to ourselves. 

Wife arrives home around 2pm. Movie starts at 6:30pm and decided we eat before the movies. We mutually agreed we leave at 4pm. So I told her take your time to get ready. So she did, and I stopped and asked, “hey so can you shave down there )”. Wife was like “I got you )”. I have time to kill. So I decided to hop on my PC and log onto a Squad server 

In [83]:
df_nsfw.to_pickle("../data/nsfw_cleaned.pkl")