In [4]:
import pandas as pd
import os, sys
import re
import json
import glob
import datetime
from collections import Counter
from pprint import pprint

import nltk

nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords
from wordcloud import WordCloud

import string
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

import gensim
from gensim import corpora
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ok\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ok\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [43]:
#lets get started by importing the new data
data = pd.read_csv('C:/Users/ok/Desktop/Week_0/week-0/data/all_data.csv', encoding='utf-8')


In [44]:
#copy the message got further cleaning
data['cleaned'] = data['msg_content']

In [52]:
def data_preprocessor(data):
    
    #this function preprocesses the given data's message and create a new column 'cleaned' for topic and sentiment analysis
    data['cleaned'] = data['cleaned'].astype(str) #convertt to string
    data['cleaned'] = data['cleaned'].apply(lambda x: x.lower()) #convert to lower case
    data['cleaned'] = data['cleaned'].apply(lambda x: re.sub(r'http\S+', '', x)) # to remove links
    data['cleaned']= data['cleaned'].apply(lambda x: x.translate(str.maketrans(' ', ' ', string.punctuation))) #remove punctuations
    data['cleaned'] = data['cleaned'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)])) #remove stopwords
    data['cleaned'] = data['cleaned'].apply(lambda x: ' '.join([word for word in x.split() if len(word)>3])) #remove words with length less than 3
    data['cleaned'] = data['cleaned'].apply(lambda x: re.sub(r'\d+', '', x))
    
    return data

def features(data):

    #this function changes sentences into list of words
    messages = [mess for mess in data['cleaned']]
    words = [mess.split() for mess in messages]

    #create dictionalry that containd ID and words 
    word_to_id = corpora.Dictionary(words) #generate unique tokens
    #  we can see the word to unique integer mapping
    # print(word_to_id.token2id)
    # using bag of words(bow), we create a corpus that contains the word id and its frequency in each document.
    corpus_1= [word_to_id.doc2bow(word) for word in words]
    # TFIDF

    return data,words, word_to_id, corpus_1


In [31]:
stemmer = SnowballStemmer('english')

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [50]:
#clean the message content
data1 = data_preprocessor(data)

In [54]:
#process the cleaned data into features
data2 = features(data1)
data, words, word2id, corous = data2

In [55]:
id_words = [[(word2id[id], count) for id, count in line] for line in corous]

In [56]:
 # Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corous,
                                           id2word=word2id,
                                           num_topics=5,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [57]:
pprint(lda_model.show_topics(formatted=False))

[(0,
  [('task', 0.03700923),
   ('data', 0.035471413),
   ('using', 0.021934109),
   ('great', 0.016249854),
   ('done', 0.014091503),
   ('know', 0.0134693505),
   ('could', 0.012234431),
   ('would', 0.012232722),
   ('already', 0.010701083),
   ('check', 0.009572877)]),
 (1,
  [('time', 0.060443614),
   ('working', 0.040980726),
   ('please', 0.036029268),
   ('right', 0.03128182),
   ('google', 0.019797508),
   ('rollingonthefloorlaughing', 0.018444775),
   ('call', 0.017624373),
   ('today', 0.01565977),
   ('mean', 0.01544796),
   ('meet', 0.015406777)]),
 (2,
  [('work', 0.02618495),
   ('link', 0.023628084),
   ('thanks', 0.021462074),
   ('dont', 0.01943454),
   ('like', 0.018558834),
   ('repo', 0.017506672),
   ('going', 0.014438911),
   ('tasks', 0.01326881),
   ('need', 0.0127672255),
   ('lets', 0.011932013)]),
 (3,
  [('think', 0.041726463),
   ('guys', 0.0339931),
   ('meeting', 0.026752032),
   ('good', 0.019560965),
   ('also', 0.017084975),
   ('create', 0.01696764)