In [48]:
import re
import glob
import nltk
import numpy as np
import pandas as pd
from pprint import pprint

In [89]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.matutils import cossim

In [7]:
import spacy

In [12]:
import pyLDAvis
import pyLDAvis.gensim

In [13]:
import matplotlib.pyplot as plt
%matplotlib inline

In [30]:
# Collect the per-file CSV paths. glob returns matches in arbitrary (filesystem)
# order, so sort them to make the load order -- and therefore "the last file" --
# the same on every run.
docs = sorted(glob.glob("./result/clean_tweets/*.csv"))

In [70]:
# Read every path in `docs` as a tab-separated file into its own DataFrame.
# Note: `doc` and `df` stay bound to the last file once the loop has finished.
frames = []
for doc in docs:
    df = pd.read_csv(doc, sep='\t')
    frames.append(df)

In [71]:
# Stack the per-file frames into a single DataFrame. ignore_index is not set,
# so each file's own row labels are carried over unchanged.
result = pd.concat(frames)

In [141]:
result.shape

(3073659, 4)

In [77]:
# Count rows per calendar day as a nested mapping:
#     dates[year][month][day] -> number of rows with that "created_at" value.
# Each "created_at" string is split on "-" into exactly three parts, which are
# kept as strings (e.g. dates["2018"]["04"]["30"]).
dates = {}
# Iterate the column values directly instead of doing one positional lookup
# per row; the resulting mapping (including key insertion order) is the same.
for date in result["created_at"]:
    year, month, day = date.split("-")
    per_day = dates.setdefault(year, {}).setdefault(month, {})
    per_day[day] = per_day.get(day, 0) + 1

In [17]:
# Load one file for interactive exploration. Name it explicitly as the last
# entry of `docs` rather than relying on a `doc` variable left over from an
# earlier loop, so the cell does not depend on hidden kernel state.
df = pd.read_csv(docs[-1], sep='\t')

In [31]:
x = df["cleaned_text"].loc[0].split()

In [137]:
x

['@pamonshaw', '@ninawmakeup', 'nic', 'mem']

In [34]:
# Tokenise every row's cleaned text on whitespace. Iterating the column
# directly avoids a label lookup per row and does not require the index to be
# exactly 0..n-1.
xs = [text.split() for text in df["cleaned_text"]]

In [138]:
def word_pairs(sen):
    """Return every pair of tokens from ``sen`` joined with an underscore.

    Each element is paired with every element that comes after it, in order,
    so a sequence of n tokens produces n * (n - 1) / 2 strings. An empty or
    single-element sequence produces an empty list.
    """
    pairs = []
    for pos, first in enumerate(sen):
        for second in sen[pos + 1:]:
            pairs.append("_".join((first, second)))
    return pairs

In [139]:
word_pairs(x)

['@pamonshaw_@ninawmakeup',
 '@pamonshaw_nic',
 '@pamonshaw_mem',
 '@ninawmakeup_nic',
 '@ninawmakeup_mem',
 'nic_mem']

In [37]:
# Build the gensim vocabulary (integer id <-> token) from the tokenised documents.
id2word = corpora.Dictionary(xs)

In [39]:
# Turn each tokenised document into its bag-of-words form: a list of
# (token_id, count) pairs looked up through `id2word`.
texts = xs
corpus = [id2word.doc2bow(text) for text in texts]

In [40]:
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1)]]


In [44]:
id2word[3]

'nic'

In [47]:
# Fit a 20-topic LDA model on the bag-of-words corpus of the single loaded file.
# random_state is pinned so repeated runs start from the same seed.
# NOTE(review): alpha='auto' is understood to make gensim learn the
# document-topic prior from the data -- confirm against the gensim docs.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [86]:
lda_model.print_topics()

[(0,
  '0.157*"lip" + 0.119*"colo" + 0.102*"skin" + 0.058*"ton" + 0.045*"light" + 0.037*"pink" + 0.031*"brown" + 0.029*"want" + 0.021*"gold" + 0.014*"might"'),
 (1,
  '0.267*"xo" + 0.231*"ye" + 0.168*"day" + 0.064*"@littlethingsto" + 0.030*"@tserings" + 0.014*"@gailsiemko" + 0.009*"@catlovertamjo" + 0.009*"tamr" + 0.005*"gail" + 0.000*"christmas"'),
 (2,
  '0.561*"@ninawmakeup" + 0.056*"canad" + 0.049*"nic" + 0.033*"agr" + 0.025*"roug" + 0.020*"get" + 0.019*"pretty" + 0.016*"hug" + 0.016*"@panagiota_karag" + 0.013*"neut"'),
 (3,
  '0.407*"thank" + 0.256*"gre" + 0.121*"much" + 0.054*"perfect" + 0.015*"gift" + 0.000*"holiday" + 0.000*"christmas" + 0.000*"merry" + 0.000*"2019" + 0.000*"glad"'),
 (4,
  '0.397*"good" + 0.287*"morn" + 0.047*"luck" + 0.004*"@irenek24" + 0.004*"ir" + 0.000*"thursday" + 0.000*"enjoy" + 0.000*"friend" + 0.000*"may" + 0.000*"dear"'),
 (5,
  '0.322*"lol" + 0.193*"look" + 0.067*"see" + 0.040*"week" + 0.014*"launch" + 0.011*"forward" + 0.001*"davelackiebeauty" + 0.0

In [53]:
doc_lda = lda_model[corpus]

In [54]:
# NOTE: log_perplexity() returns the per-word likelihood bound (a log-scale
# value, hence negative), not the perplexity itself; gensim documents the
# perplexity as 2 ** (-bound). Evaluated here on the training corpus.
print('\nPerplexity: ', lda_model.log_perplexity(corpus))


Perplexity:  -16.061567172264496


In [56]:
# Score the fitted model with the 'c_v' coherence measure, computed from the
# tokenised texts and the same dictionary the model was trained with.
coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.4744900323380958


In [57]:
# Render the interactive pyLDAvis view of the fitted model inside the notebook.
# NOTE(review): newer pyLDAvis releases expose this module as
# `pyLDAvis.gensim_models` -- confirm against the installed version.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [87]:
lda_model.show_topic(0)

[('lip', 0.15747344),
 ('colo', 0.11921731),
 ('skin', 0.10162249),
 ('ton', 0.05793143),
 ('light', 0.04516961),
 ('pink', 0.03723994),
 ('brown', 0.03123095),
 ('want', 0.02937598),
 ('gold', 0.020900657),
 ('might', 0.014313835)]

In [95]:
# Sanity check: a topic compared with itself should score ~1.
# show_topic() returns (word, weight) pairs for the topic's top words only,
# so the similarity is computed over that truncated list, matched by word.
cossim(lda_model.show_topic(0), lda_model.show_topic(0))

0.9999999999999999

In [97]:
dates.keys()

dict_keys(['2019', '2018', '2017', '2016', '2015', '2014', '2013', '2012', '2011', '2010', '2009', '2008'])

In [100]:
dates["2018"]["04"]

{'30': 2387,
 '27': 2235,
 '26': 2385,
 '24': 2505,
 '23': 2565,
 '21': 1544,
 '15': 1620,
 '14': 1602,
 '13': 2336,
 '12': 2261,
 '08': 1445,
 '04': 2069,
 '29': 1533,
 '28': 1537,
 '25': 2287,
 '22': 1476,
 '20': 2380,
 '19': 2384,
 '18': 2396,
 '17': 2260,
 '16': 2261,
 '11': 2428,
 '10': 2318,
 '09': 2279,
 '07': 1372,
 '06': 2090,
 '05': 2119,
 '03': 2057,
 '02': 1803,
 '01': 1315}

In [119]:
def train_lda(df, num_topics=20, passes=10, chunksize=100, random_state=100):
    """Train a gensim LDA model on the ``cleaned_text`` column of ``df``.

    Rows containing a missing value in any column are dropped first. Each
    remaining ``cleaned_text`` string is split on whitespace into one
    tokenised document; a fresh dictionary and bag-of-words corpus are built
    from those documents and used to fit the model.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``cleaned_text`` column of strings. It is not modified.
    num_topics : int, default 20
        Number of topics to fit.
    passes : int, default 10
        Forwarded to ``LdaModel(passes=...)``.
    chunksize : int, default 100
        Forwarded to ``LdaModel(chunksize=...)``.
    random_state : int, default 100
        Forwarded to ``LdaModel(random_state=...)``; fixed by default so
        repeated calls on the same data start from the same seed.

    Returns
    -------
    gensim.models.ldamodel.LdaModel
        The fitted model (its vocabulary is available as ``model.id2word``).
    """
    cleaned = df.dropna()
    # Iterate the column directly: one pass over the values instead of a
    # positional lookup per row.
    texts = [text.split() for text in cleaned["cleaned_text"]]

    id2word = corpora.Dictionary(texts)
    corpus = [id2word.doc2bow(tokens) for tokens in texts]

    return gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=num_topics,
        random_state=random_state,
        update_every=1,
        chunksize=chunksize,
        passes=passes,
        alpha='auto',
        per_word_topics=True,
    )

In [120]:
# Train a model on only the rows whose created_at equals "2018-04-30".
# Note: `doc` is rebound here to a DataFrame slice of `result`.
doc = result[result["created_at"] == "2018-04-30"]
lda_2018_04_30 = train_lda(doc)

In [121]:
# Train a second model on only the rows whose created_at equals "2018-04-27".
doc = result[result["created_at"] == "2018-04-27"]
lda_2018_04_27 = train_lda(doc)

In [128]:
# For each of the 20 topics of the 2018-04-30 model, find the topic of the
# 2018-04-27 model with the highest cosine similarity.
# Result: topics[i] = (index of best-matching 04-27 topic, similarity).
# The vectors compared are the (word, weight) lists returned by show_topic().
topics = {}
for i in range(20):
    vec_1 = lda_2018_04_30.show_topic(i)
    
    # Start below any score this loop is expected to produce so that the first
    # candidate is accepted; ties keep the earliest j (strict ">").
    top_sim = -1
    for j in range(20):
        vec_2 = lda_2018_04_27.show_topic(j)
        sim = cossim(vec_1, vec_2)
        if sim > top_sim:
            top_sim = sim
            topics[i] = (j, sim)

In [None]:
from gensim.test.utils import datapath

In [154]:
# Train one LDA model per calendar day (2016 onward), save each model to disk,
# and keep the models and their dates in two parallel lists.
topic_model_dates = []
topic_models = []
for i, year in enumerate(dates.keys()):
    # Years before 2016 are skipped entirely.
    if int(year) < 2016:
        continue
    for j, month in enumerate(dates[year].keys()):
        for k, day in enumerate(dates[year][month].keys()):
            # Coarse progress trace: year position, then the fraction of
            # months and of days already visited.
            print(i, j / len(dates[year].keys()), k / len(dates[year][month].keys()))
            date = "{}-{}-{}".format(year, month, day)
            doc = result[result["created_at"] == date]
            model = train_lda(doc)

            # Save the model that was just trained. The previous code called
            # `lda.save(...)`, but `lda` is not defined by any visible cell,
            # so it either raised NameError or saved a stale kernel object.
            temp_file = datapath(date)
            model.save(temp_file)

            topic_model_dates.append(date)
            topic_models.append(model)

0 0.0 0.0
0 0.0 0.14285714285714285
0 0.0 0.2857142857142857
0 0.0 0.42857142857142855
0 0.0 0.5714285714285714
0 0.0 0.7142857142857143
0 0.0 0.8571428571428571
0 0.5 0.0
0 0.5 0.03225806451612903


KeyboardInterrupt: 

In [None]:
def cossim_pairs(topic_models, num_topics=20):
    topic_cos_map = {}
    
    for i, m in enumerate(topic_models):
        for u in range(num_topics):
            for j, n in enumerate(topic_models):
                if i == j:
                    continue
                top_cs = -1
                top_topic = ""
                for v in range(num_topics):
                    cs = cossim(m.show_topic(u), n.show_topic(v))
                    if cs > top_cs:
                        top_cs = cs
                        top_topic = "{}:{}_{}:{}".format(i, u, j, v)
                
                topic_cos_map[top_topic] = top_cs