In [1]:
#Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import re
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
#Download stopwords
# The stop-word corpus must be present before stopwords.words("english") is
# called; without this call the notebook fails on a fresh environment.
# Returns True on success, matching the saved cell output.
nltk.download("stopwords")


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jorisballemans/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
def clean_post(text):
    """Normalise a raw post body into a space-separated string of content words.

    The text is lower-cased, punctuation and digits are replaced by spaces,
    and the remainder is tokenised with NLTK's ``TweetTokenizer``. Only purely
    alphabetic tokens that are longer than one character and are not English
    stop words are kept.

    Parameters
    ----------
    text : object
        The post body. Non-string values are converted with ``str``. ``None``
        and float NaN are treated as an empty post.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (``""`` if none survive).
    """
    # Missing values would otherwise be stringified to "none"/"nan" and leak
    # into the vocabulary as if they were real words.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    text = re.sub('[!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@]+', " ", text)
    text = re.sub(r"\s+", " ", text)
    text = re.sub("([0-9]+)", " ", text)

    # Build the stop-word set once per call rather than once per token.
    stop_words = set(stopwords.words("english"))

    # The text is already lower-case, so tokens need no further case folding.
    tokens = [
        token
        for token in TweetTokenizer().tokenize(text)
        if token.isalpha() and len(token) > 1 and token not in stop_words
    ]
    return " ".join(tokens)

def tokenize(text):
    """Split ``text`` into lower-cased alphabetic tokens without stop words.

    Parameters
    ----------
    text : str
        The text to tokenise with NLTK's ``TweetTokenizer``.

    Returns
    -------
    list of str
        Lower-cased tokens that are purely alphabetic and are not English
        stop words, in their original order.
    """
    # Build the stop-word set once per call rather than once per token.
    stop_words = set(stopwords.words("english"))

    # Lower-case before the stop-word test: the NLTK list is lower-case, so a
    # capitalised stop word such as "The" would otherwise slip through.
    lowered = (
        token.lower()
        for token in TweetTokenizer().tokenize(text)
        if token.isalpha()
    )
    return [token for token in lowered if token not in stop_words]

In [4]:
#Read the data into a dataframe
df = pd.read_csv("all_posts.csv")
print(f"{len(df)} posts have been loaded...")

# Empty CSV cells are read as NaN by default; clean_post calls str() on its
# input, so blank them first to keep the literal word "nan" out of the corpus.
df["selftext"] = df["selftext"].fillna("").apply(clean_post)

# Preview the cleaned posts (the last expression is what the cell displays).
df.head()

4943 posts have been loaded...


Unnamed: 0,id,subreddit_id,created_utc,title,selftext,ups,downs
0,1gvppym,t5_2qirg,1732110000.0,I killed her.,good morning today transgender day remembrance...,1001.0,0.0
1,xi7em6,t5_2qirg,1663581000.0,6 months ago I posted on this subreddit beggin...,long road recovery tried kill get bed three mo...,680.0,0.0
2,junrt7,t5_2qirg,1605455000.0,"As a rape survivor, I want to let you know tha...",although still struggle raped nearly half year...,633.0,0.0
3,hdnt0e,t5_2qirg,1592812000.0,"I tried so hard to love this country, but afte...",indian india native american family born india...,441.0,0.0
4,n5fucd,t5_2qirg,1620221000.0,Best friend took her own life: the aftermath a...,november best friend committed suicide honestl...,439.0,0.0


In [71]:
# Bag-of-words counts over the cleaned posts.
# max_df=0.9 drops terms that occur in more than 90% of the posts and
# min_df=25 drops terms that occur in fewer than 25 posts.
# The token pattern is a raw string so the regex escapes (\w, \$, \d, \S) are
# not parsed as (invalid) Python string escapes.
vectorizer = CountVectorizer(max_df=0.9, min_df=25, token_pattern=r"\w+|\$[\d\.]+|\S+")

# NOTE(review): LatentDirichletAllocation also accepts the sparse matrix
# directly; the dense copy is kept here so `tf` stays an ndarray.
tf = vectorizer.fit_transform(df["selftext"]).toarray()
tf_feature_names = vectorizer.get_feature_names_out()
number_of_topics = 10

# random_state is fixed so repeated runs produce the same topics.
model = LatentDirichletAllocation(n_components=number_of_topics, random_state=0)

model.fit(tf)

In [72]:
def display_topics(model, feature_names, no_top_words):
    """Tabulate the highest-weighted words of every topic in a fitted model.

    For each row of ``model.components_`` two columns are produced:
    ``"Topic <k> words"`` with the ``no_top_words`` feature names that carry
    the largest weights (largest first), and ``"Topic <k> weights"`` with
    those weights formatted to one decimal place as strings.

    Returns a ``pandas.DataFrame`` built from those columns.
    """
    columns = {}
    for k, weights in enumerate(model.components_):
        # Indices of the largest weights, biggest first.
        ranked = weights.argsort()[:-no_top_words - 1:-1]
        columns[f"Topic {k:d} words"] = [f"{feature_names[j]}" for j in ranked]
        columns[f"Topic {k:d} weights"] = [f"{weights[j]:.1f}" for j in ranked]
    return pd.DataFrame(columns)

In [73]:
# Number of highest-weighted words to list for each topic.
no_top_words = 10
display_topics(
    model=model,
    feature_names=tf_feature_names,
    no_top_words=no_top_words,
)

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,Topic 5 words,Topic 5 weights,Topic 6 words,Topic 6 weights,Topic 7 words,Topic 7 weights,Topic 8 words,Topic 8 weights,Topic 9 words,Topic 9 weights
0,said,141.4,like,2244.3,would,688.6,mental,569.8,job,1296.2,would,1008.9,life,1304.2,people,1319.6,anxiety,1063.8,black,88.8
1,even,134.9,feel,2030.2,mom,547.8,health,456.0,school,858.1,time,1006.8,time,974.0,life,723.5,like,801.6,one,85.8
2,work,122.5,know,1240.0,dad,542.5,day,330.1,work,834.4,like,967.3,things,727.2,know,699.5,feel,459.3,said,80.9
3,like,117.4,even,1149.8,family,536.7,https,297.0,degree,741.2,got,808.4,get,723.3,feel,625.7,get,381.7,like,71.0
4,dont,116.2,get,1126.5,time,502.0,com,257.0,years,713.0,never,795.5,people,643.8,like,603.8,feeling,356.9,call,70.0
5,know,107.6,want,1089.1,told,496.5,one,240.8,get,696.7,one,752.0,day,602.6,one,589.9,im,336.8,fucking,65.9
6,told,106.0,really,1058.2,one,483.2,help,213.1,time,683.6,started,731.6,want,594.2,love,579.0,people,330.1,car,61.3
7,would,102.2,life,985.4,back,444.6,time,210.7,life,639.1,felt,716.5,like,584.0,want,512.7,things,323.0,fuck,59.9
8,money,96.7,friends,808.5,got,439.3,work,187.7,like,632.6,friends,699.9,work,526.8,even,494.0,even,304.4,student,54.6
9,could,95.8,time,731.2,get,435.6,study,186.5,college,628.6,school,692.1,self,524.6,someone,447.5,think,303.4,white,54.4
