# # Topic Modeling using LDA

In [18]:
# Load the raw tweets CSV into a DataFrame
import pandas as pd

# header=None tells pandas not to read a header row, so the column names
# are supplied explicitly here.
colnames = ['Target', 'Ids', 'Date', 'Flag', 'UserName', 'Text']

# NOTE(review): absolute, machine-specific path — consider a configurable data directory
csv_path = r"C:\Users\MOHANRAJ\Desktop\Projects\GUVI Final Project\NLP_LDA_Topic Handling\twitter_new.csv"

Twitter_Data = pd.read_csv(
    csv_path,
    names=colnames,
    header=None,
    encoding='latin-1',
)
Twitter_Data.head()

Unnamed: 0,Target,Ids,Date,Flag,UserName,Text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


# # Data Cleaning

In [7]:
# Load the regular expression library
import re

# Characters stripped from every tweet. Written as a raw string with the dot
# unescaped: inside a character class "." is already literal, and '\.' in a
# non-raw string is an invalid escape sequence that newer Pythons warn about.
PUNCTUATION = re.compile(r'[,.!?]')

# Remove punctuation and lowercase the tweet text in a single vectorised pass.
# The .str accessor propagates missing values as NaN instead of raising.
Twitter_Data['Text'] = (
    Twitter_Data['Text']
    .str.replace(PUNCTUATION, '', regex=True)
    .str.lower()
)

# Preview the first few cleaned tweets
Twitter_Data['Text'].head()

0    @switchfoot http://twitpiccom/2y1zl - awww tha...
1    is upset that he can't update his facebook by ...
2    @kenichan i dived many times for the ball mana...
3      my whole body feels itchy and like its on fire 
4    @nationwideclass no it's not behaving at all i...
Name: Text, dtype: object

In [3]:
# Removing Stopword and Tokenization

import gensim
from gensim.utils import simple_preprocess

import nltk
# Make sure the NLTK stop-word corpus is available locally
nltk.download('stopwords')

from nltk.corpus import stopwords
# English stop words plus a handful of extra tokens to filter out
stop_words = stopwords.words('english') + ['from', 'subject', 're', 'edu', 'use']

def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences``.

    Every item is coerced with ``str()`` and passed to gensim's
    ``simple_preprocess`` with ``deacc=True``. Yields one token list per
    input item, in input order.
    """
    for raw_text in sentences:
        tokens = gensim.utils.simple_preprocess(str(raw_text), deacc=True)
        yield tokens
        
def remove_stopwords(texts, stop_word_list=None):
    """Drop stop words from every document in ``texts``.

    Args:
        texts: Iterable of documents. A document that is already a list or
            tuple of tokens is filtered as-is; anything else is converted
            with ``str()`` and tokenized by ``simple_preprocess``.
        stop_word_list: Optional iterable of words to remove. Defaults to
            the module-level ``stop_words``.

    Returns:
        A list containing one list of kept tokens per input document.
    """
    if stop_word_list is None:
        stop_word_list = stop_words
    # A set gives constant-time membership tests instead of scanning the
    # stop-word list once per token.
    blocked = set(stop_word_list)

    cleaned = []
    for doc in texts:
        # Token lists were previously round-tripped through str(list) and
        # re-tokenized; filter them directly instead.
        if isinstance(doc, (list, tuple)):
            tokens = doc
        else:
            tokens = simple_preprocess(str(doc))
        cleaned.append([word for word in tokens if word not in blocked])
    return cleaned

# Pull the tweet text out of the frame as a plain Python list
data = Twitter_Data.Text.values.tolist()

# Tokenize every tweet, then filter out the stop words
data_words = remove_stopwords(list(sent_to_words(data)))

# Show up to the first 30 tokens of the first tweet
print(data_words[0][:30])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MOHANRAJ\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['switchfoot', 'http', 'twitpiccom', 'zl', 'awww', 'bummer', 'shoulda', 'got', 'david', 'carr', 'third', 'day']


In [4]:
# Build the id2word dictionary and the bag-of-words corpus
import gensim.corpora as corpora

# Dictionary built from the tokenized tweets
id2word = corpora.Dictionary(data_words)

# The tokenized tweets are the documents of the corpus
texts = data_words

# Bag-of-words representation: one doc2bow result per document
corpus = list(map(id2word.doc2bow, texts))

# Show up to the first 30 entries of the first document
print(corpus[0][:30])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1)]


# # LDA Model Building

In [5]:
# Build the LDA model and pretty-print the most relevant topics

from pprint import pprint

# number of topics
num_topics = 10

# Fixed seed so the learned topics are repeatable between runs.
# NOTE(review): with multiple worker processes results may still vary slightly — confirm
RANDOM_STATE = 42

# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       random_state=RANDOM_STATE)

# Print the keywords of each topic
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.027*"know" + 0.024*"really" + 0.021*"want" + 0.017*"lol" + 0.012*"think" '
  '+ 0.011*"haha" + 0.011*"love" + 0.011*"im" + 0.010*"like" + 0.010*"dont"'),
 (1,
  '0.022*"sleep" + 0.019*"please" + 0.012*"much" + 0.011*"get" + 0.011*"help" '
  '+ 0.010*"need" + 0.009*"talk" + 0.009*"got" + 0.009*"good" + 0.008*"go"'),
 (2,
  '0.055*"day" + 0.027*"good" + 0.021*"morning" + 0.018*"today" + '
  '0.015*"thank" + 0.015*"happy" + 0.014*"night" + 0.012*"nice" + '
  '0.012*"getting" + 0.010*"ready"'),
 (3,
  '0.070*"quot" + 0.035*"http" + 0.015*"love" + 0.013*"bitly" + '
  '0.013*"watching" + 0.011*"movie" + 0.010*"hi" + 0.007*"say" + 0.007*"new" + '
  '0.007*"one"'),
 (4,
  '0.032*"see" + 0.021*"love" + 0.018*"back" + 0.016*"get" + 0.014*"come" + '
  '0.012*"go" + 0.010*"never" + 0.010*"wait" + 0.008*"soon" + 0.008*"yes"'),
 (5,
  '0.044*"http" + 0.020*"twitpiccom" + 0.011*"welcome" + 0.009*"good" + '
  '0.008*"wish" + 0.008*"dinner" + 0.007*"coffee" + 0.007*"time" + '
  '0.007*"friend