In [33]:
#Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import re
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [28]:
# Download the NLTK "stopwords" corpus; stopwords.words("english") in the
# cleaning helpers below reads from it.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jorisballemans/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [53]:
def clean_post(text, stop_words=None):
    """Normalise a raw post body into a space-separated string of content words.

    The text is lower-cased, runs of punctuation and digits are replaced by
    spaces, and the result is tokenised with NLTK's ``TweetTokenizer``. Only
    purely alphabetic tokens that are not stopwords are kept.

    Parameters
    ----------
    text : str
        Raw post text. Any non-string value (for example ``None`` or the
        ``NaN`` pandas uses for empty cells) is treated as an empty post.
    stop_words : iterable of str, optional
        Lower-case words to drop. Defaults to NLTK's English stopword list.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; ``""`` if none survive.
    """
    # Empty CSV cells arrive as float NaN, which has no .lower(); treat every
    # non-string as "no text" instead of raising AttributeError.
    if not isinstance(text, str):
        return ""

    # Build the stopword set once per call. Doing it inside the comprehension
    # condition reloads the corpus and rebuilds the set for every token.
    stop_words = frozenset(
        stopwords.words("english") if stop_words is None else stop_words
    )

    text = text.lower()
    text = re.sub('[!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@]+', " ", text)
    text = re.sub("([0-9]+)", " ", text)
    # Collapse whitespace last so the gaps left by the removals above are
    # squeezed too (raw string: "\s" is an invalid escape in a plain literal).
    text = re.sub(r"\s+", " ", text)

    # The text is already lower-cased, so tokens need no further case folding.
    tokens = [
        token
        for token in TweetTokenizer().tokenize(text)
        if token.isalpha() and token not in stop_words
    ]
    return " ".join(tokens)

def tokenize(text, stop_words=None):
    """Split text into lower-case alphabetic tokens with stopwords removed.

    Parameters
    ----------
    text : str
        Text to tokenise with NLTK's ``TweetTokenizer``. Any non-string value
        (for example ``None`` or ``NaN``) yields an empty list.
    stop_words : iterable of str, optional
        Lower-case words to drop. Defaults to NLTK's English stopword list.

    Returns
    -------
    list of str
        Lower-cased tokens that are purely alphabetic and not stopwords.
    """
    if not isinstance(text, str):
        return []

    # Build the stopword set once per call rather than once per token.
    stop_words = frozenset(
        stopwords.words("english") if stop_words is None else stop_words
    )

    # Lower-case BEFORE the stopword test: the stopword list is lower-case, so
    # checking the raw token would let "The" or "I" slip through the filter.
    lowered = (token.lower() for token in TweetTokenizer().tokenize(text))
    return [
        token
        for token in lowered
        if token.isalpha() and token not in stop_words
    ]

In [None]:
# Read the data into a dataframe
df = pd.read_csv("all_posts.csv")
print(f"{len(df)} posts have been loaded...")

# pandas reads empty CSV cells as NaN (a float), which has no string methods;
# replace them with "" so every row reaching clean_post is a str.
df["selftext"] = df["selftext"].fillna("").astype(str).apply(clean_post)
df.head()

632 posts have been loaded...


Unnamed: 0,id,subreddit_id,created_utc,title,selftext,ups,downs
0,xi7em6,t5_2qirg,1663581000.0,6 months ago I posted on this subreddit beggin...,long road recovery tried kill get bed three mo...,681,0
1,junrt7,t5_2qirg,1605455000.0,"As a rape survivor, I want to let you know tha...",although still struggle raped nearly half year...,625,0
2,hdnt0e,t5_2qirg,1592812000.0,"I tried so hard to love this country, but afte...",indian india native american family born india...,445,0
3,n5fucd,t5_2qirg,1620221000.0,Best friend took her own life: the aftermath a...,november best friend f committed suicide hones...,440,0
4,1695ypm,t5_2qirg,1693770000.0,My girlfriend said “I love you” to her male fr...,girlfriend never heard guy looked shoulder guy...,366,0


In [46]:
# Bag-of-words term counts over the cleaned posts.
# max_df=0.9 ignores terms that occur in more than 90% of the posts;
# min_df=25 ignores terms that occur in fewer than 25 posts.
# The token pattern is a raw string: "\w", "\$", "\d", "\." and "\S" are
# invalid escape sequences in a plain string literal.
vectorizer = CountVectorizer(max_df=0.9, min_df=25, token_pattern=r"\w+|\$[\d\.]+|\S+")
tf = vectorizer.fit_transform(df["selftext"]).toarray()
tf_feature_names = vectorizer.get_feature_names_out()
number_of_topics = 10

# A fixed random_state keeps the fitted topics reproducible between runs.
model = LatentDirichletAllocation(n_components=number_of_topics, random_state=0)

model.fit(tf)

In [56]:
def display_topics(model, feature_names, no_top_words):
    """Tabulate the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object
        Anything exposing ``components_``: an iterable with one array of
        per-term weights for each topic.
    feature_names : sequence
        Term for each column index of ``components_``.
    no_top_words : int
        How many top terms to list per topic.

    Returns
    -------
    pandas.DataFrame
        Two columns per topic, ``"Topic <i> words"`` and
        ``"Topic <i> weights"``, ordered from strongest to weakest term.
        Weights are strings formatted to one decimal place.
    """
    columns = {}
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending; walking it backwards from the end yields the
        # indices of the `no_top_words` largest weights, strongest first.
        strongest = weights.argsort()[:-no_top_words - 1:-1]

        words = []
        formatted_weights = []
        for term_idx in strongest:
            words.append('{}'.format(feature_names[term_idx]))
            formatted_weights.append('{:.1f}'.format(weights[term_idx]))

        columns["Topic %d words" % (topic_idx)] = words
        columns["Topic %d weights" % (topic_idx)] = formatted_weights
    return pd.DataFrame(columns)

In [57]:
# Show the 10 highest-weighted terms (and their weights) for each fitted topic.
no_top_words = 10
display_topics(model, tf_feature_names, no_top_words)

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,Topic 5 words,Topic 5 weights,Topic 6 words,Topic 6 weights,Topic 7 words,Topic 7 weights,Topic 8 words,Topic 8 weights,Topic 9 words,Topic 9 weights
0,you,868.6,was,2099.8,anxiety,332.1,you,12.6,me,838.9,you,2521.0,’t,851.3,t,1324.3,he,1196.7,have,1083.0
1,is,677.7,she,887.4,panic,90.2,your,10.9,t,653.3,your,728.7,’m,603.9,m,987.8,me,830.6,me,799.8
2,this,441.1,had,877.8,was,89.2,forward,7.3,was,615.2,is,491.3,’s,417.7,s,724.1,she,691.5,is,736.7
3,your,438.5,me,786.0,have,61.4,is,7.2,you,448.6,are,430.0,me,349.8,but,561.4,was,681.2,am,703.9
4,are,396.8,her,751.1,me,58.1,look,5.3,but,447.5,this,348.2,so,340.7,just,531.9,him,543.3,but,692.6
5,be,347.3,with,627.1,this,56.8,successful,5.1,they,429.5,have,312.7,but,339.6,ve,482.0,her,476.1,with,594.9
6,with,325.8,at,475.2,all,52.7,some,5.1,he,410.9,can,307.0,’ve,328.7,like,469.6,’t,429.0,not,538.6
7,or,298.9,this,472.0,attack,49.8,their,4.2,so,389.2,if,279.4,this,321.0,so,431.5,his,423.6,this,509.0
8,not,286.6,on,466.8,attacks,47.6,people,4.1,s,331.8,do,262.8,like,319.8,don,415.5,with,403.4,do,423.2
9,as,281.0,but,459.1,depression,45.1,failure,4.1,with,298.0,what,253.5,just,302.5,have,409.3,but,371.6,just,387.4
