## Steps for topic  modeling
- data loading
- data preprocesing
    - Tokenization
    - All stopwords are removed.
- Modeling


In [20]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ok\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [50]:
import pandas as pd
import os, sys
import re
import json
import glob
import datetime
from collections import Counter
from pprint import pprint


from nltk.corpus import stopwords
from wordcloud import WordCloud

from nltk.stem.wordnet import WordNetLemmatizer
import string
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()


import nltk
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ok\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [35]:
import gensim
from gensim import corpora
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)

In [2]:
rpath = os.path.abspath('..')
if rpath not in sys.path:
    sys.path.insert(0, rpath)

from src.loader import SlackDataLoader
import src.utils as utils

In [3]:
# now lets define the path for week 8 and week 9
week_8_path = "C:/Users/ok/Desktop/Week_0/week-0/data/channels/weeks/all-week8"
week_9_path = "C:/Users/ok/Desktop/Week_0/week-0/data/channels/weeks/all-week9"

#now lets extract the data from the all-week8 and all-week9 folders
week_8 = utils.slack_parser(week_8_path)
week_9 = utils.slack_parser(week_9_path)
# loading the data
data = pd.concat([week_8, week_9])


In [24]:
def clean(doc):
    stop_free = ' '.join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join([ch for ch in stop_free if ch not in exclude])
    normalized = ' '.join(lemma.lemmatize(word) for word in punc_free.split())

In [32]:
processed_docs = data['msg_content'].map(preprocess)

In [33]:
processed_docs

0      [good, morn, blush, welcom, week, wish, produc...
1                                           [good, morn]
2                                           [good, morn]
3                                           [good, morn]
4                                           [good, morn]
                             ...                        
105                                                   []
106                             [thank, python, version]
107    [https, realpython, linear, program, python, h...
108    [https, stackoverflow, question, distanc, geog...
109    [ujgp, nlsu, present, olutosin, kill, super, i...
Name: msg_content, Length: 189, dtype: object

In [52]:
data['cleaned_2'] = processed_docs

In [53]:
data

Unnamed: 0,msg_type,msg_content,sender_name,msg_sent_time,msg_dist_type,time_thread_start,reply_count,reply_users_count,reply_users,tm_thread_end,channel,cleaned,cleaned_2
0,message,Good morning everyone :blush: welcome to week ...,Garrett Bell,1665385707.569729,text,0,0,0,0,0,all-week8,Good morning everyone blush welcome to week 8 ...,"[good, morn, blush, welcom, week, wish, produc..."
1,message,Good morning!,Carlos Gross,1665385734.616309,text,0,0,0,0,0,all-week8,Good morning,"[good, morn]"
2,message,Good Morning.,Samuel King,1665385760.620169,text,0,0,0,0,0,all-week8,Good Morning,"[good, morn]"
3,message,Good Morning!!!,Daniel Brown,1665387214.795849,text,0,0,0,0,0,all-week8,Good Morning,"[good, morn]"
4,message,Good Morning.,Willie Yang,1665388127.826899,text,0,0,0,0,0,all-week8,Good Morning,"[good, morn]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,message,3.8,Judith Bolton,1666519434.902969,text,1666453751.515939,0,0,0,0,all-week9,38,[]
106,message,"Thank you, my python version was 3.10",Travis Butler,1666521758.493219,text,1666453751.515939,0,0,0,0,all-week9,Thank you my python version was 310,"[thank, python, version]"
107,message,<https://realpython.com/linear-programming-pyt...,Kelly Soto,1666544242.275809,link,0,0,0,0,0,all-week9,httpsrealpythoncomlinearprogrammingpythonHands...,"[https, realpython, linear, program, python, h..."
108,message,<https://stackoverflow.com/questions/70941094/...,Joshua Rhodes,1666552319.011779,link,0,0,0,0,0,all-week9,httpsstackoverflowcomquestions70941094howtoget...,"[https, stackoverflow, question, distanc, geog..."


In [31]:
stemmer = SnowballStemmer('english')

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [None]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [40]:
def data_preprocessor(data):
    
    #this function preprocesses the given data's message and create a new column 'cleaned' for topic and sentiment analysis
    data['cleaned'] = data['msg_content'].astype(str) #convertt to string
    data['cleaned'] = data['msg_content'].apply(lambda x: x.lower()) #convert to lower case
    data['cleaned'] = data['msg_content'].apply(lambda x: re.sub(r'http\S+', '', x)) # to remove links
    data['cleaned']= data['msg_content'].apply(lambda x: x.translate(str.maketrans(' ', ' ', string.punctuation))) #remove punctuations

    return data

def features(data):

    #this function changes sentences into list of words
    messages = [mess for mess in data['cleaned']]
    words = [mess.split() for mess in messages]

    #create dictionalry that containd ID and words 
    word_to_id = corpora.Dictionary(words) #generate unique tokens
    #  we can see the word to unique integer mapping
    # print(word_to_id.token2id)
    # using bag of words(bow), we create a corpus that contains the word id and its frequency in each document.
    corpus_1= [word_to_id.doc2bow(tweet) for tweet in words]
    # TFIDF

    return data,words, word_to_id, corpus_1


In [None]:
def features2(data):

    #this function changes sentences into list of words
    messages = [mess for mess in data['cleaned2']]
    words = [mess.split() for mess in messages]

    #create dictionalry that containd ID and words 
    word_to_id = corpora.Dictionary(words) #generate unique tokens
    #  we can see the word to unique integer mapping
    # print(word_to_id.token2id)
    # using bag of words(bow), we create a corpus that contains the word id and its frequency in each document.
    corpus_1= [word_to_id.doc2bow(tweet) for tweet in words]
    # TFIDF

    return data,words, word_to_id, corpus_1


In [37]:
data1 = data_preprocessor(data)

In [41]:
data2 = features(data1)

In [44]:
data, words, word2id, corous = data2

In [45]:
id_words = [[(word2id[id], count) for id, count in line] for line in corous]

In [48]:
 # Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corous,
                                           id2word=word2id,
                                           num_topics=5,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [51]:
pprint(lda_model.show_topics(formatted=False))

[(0,
  [('the', 0.030163215),
   ('Thank', 0.021695467),
   ('you', 0.015101622),
   ('or', 0.014796451),
   ('a', 0.014373034),
   ('in', 0.01290342),
   ('my', 0.012501105),
   ('of', 0.012345727),
   ('time', 0.011759203),
   ('I', 0.01157879)]),
 (1,
  [('the', 0.0617256),
   ('and', 0.022809105),
   ('to', 0.017554106),
   ('of', 0.016620956),
   ('we', 0.016083663),
   ('can', 0.015224413),
   ('trip', 0.014586046),
   ('I', 0.013482074),
   ('for', 0.013442016),
   ('that', 0.013403673)]),
 (2,
  [('it', 0.04236458),
   ('to', 0.019103276),
   ('is', 0.016692026),
   ('but', 0.012356782),
   ('I', 0.009710267),
   ('the', 0.009648425),
   ('of', 0.009419828),
   ('was', 0.0094194235),
   ('and', 0.009313228),
   ('in', 0.009055061)]),
 (3,
  [('the', 0.05654822),
   ('you', 0.026770767),
   ('to', 0.025092196),
   ('and', 0.0187004),
   ('of', 0.018077752),
   ('on', 0.015081719),
   ('I', 0.014895126),
   ('what', 0.011050259),
   ('submission', 0.010970979),
   ('is', 0.010945