# Process tweets

In [None]:
import numpy as np
import pandas as pd
import timeit
from datetime import datetime
from datetime import timedelta
import gensim
from gensim import corpora
from gensim.test.utils import common_corpus, common_dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors
#import pyLDAvis
#import pyLDAvis.sklearn
#import matplotlib.pyplot as plt
%matplotlib inline

# Fetch the NLTK corpora used further down: the stop-word list and the
# WordNet data needed by WordNetLemmatizer.
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)
#nltk.download('punkt')

In [None]:
# Load the tweet table and preview its first five rows.
tweet_data = pd.read_csv(filepath_or_buffer='../data/external/constructs.csv')
tweet_data.head(n=5)

In [None]:
# Number of rows loaded.
tweet_data.shape[0]

## Format Dates

In [None]:
# Split `created_at` on single spaces and keep the 2nd and 3rd tokens.
# NOTE(review): presumably these are the month abbreviation and day of month
# (e.g. "Jan 05") -- confirm against the raw `created_at` format.
created_at_tokens = tweet_data['created_at'].str.split(' ')
tweet_data['date'] = created_at_tokens.str[1:3]

In [None]:
# Glue the token list back into one space-separated string per row and force
# a plain `str` dtype. This cell expects `date` to still hold the token lists
# produced from `created_at`, so it is not safe to re-run on its own.
tweet_data['date'] = tweet_data['date'].str.join(' ').astype(str)

In [None]:
# Append a fixed year and parse as "<month abbrev> <day> <year>"; values that
# do not match the format become NaT because of errors='coerce'.
# NOTE(review): the year is hard-coded to 2020 -- confirm all tweets are from 2020.
date_with_year = tweet_data['date'] + ' 2020'
tweet_data['date'] = pd.to_datetime(date_with_year, format='%b %d %Y', errors='coerce')

## Select Time Frame of Interest

We should support two time frames so that users can compare how the topics differ between them.

In [None]:
# Start of the analysis window; the window covers the 30 days from this date
# (start inclusive, end exclusive).
usr_input = pd.to_datetime('2020-01-01')
window_end = usr_input + timedelta(days=30)

in_window = (tweet_data['date'] >= usr_input) & (tweet_data['date'] < window_end)
timeframe1 = tweet_data[in_window]

# The tokenising and topic-assignment cells further down read the selected
# tweets through the name `timeframe`, which was never defined (NameError on a
# fresh kernel). Expose the first window under that name as well.
timeframe = timeframe1

## Tokenize Text

In [None]:
# Building blocks for text cleaning:
#   stop_words - NLTK's English stop-word list, as a set
#   exclude    - every ASCII punctuation character
#   lemma      - WordNet lemmatizer instance
english_stop_words = stopwords.words('english')
stop_words = set(english_stop_words)
exclude = {ch for ch in string.punctuation}
lemma = WordNetLemmatizer()

# additional characters to remove
def number_list(r1, r2):
    """Return the integers from r1 to r2, both ends included, as a list.

    The list is empty when r2 < r1.
    """
    inclusive_stop = r2 + 1
    return [*range(r1, inclusive_stop)]

# Single lowercase letters are treated as noise tokens.
alphabet_remove = list(string.ascii_lowercase)

# `number_remove` was referenced without ever being defined (NameError on a
# fresh kernel). Tokens are strings, so the numbers are converted to strings
# before being added to the stop-word set.
# NOTE(review): the 0-9 range is an assumption that mirrors the
# single-character letters above -- widen it if multi-digit tokens should
# also be dropped.
number_remove = [str(n) for n in number_list(0, 9)]

stop_words = stop_words.union(number_remove, alphabet_remove)

In [None]:
# Inspect the stop-word set after the union in the previous cell.
stop_words

Code adapted from https://www.analyticsvidhya.com/blog/2016/08/beginners-guide-to-topic-modeling-in-python/

In [None]:
def clean(tweets):
    """Normalise one tweet's text for topic modelling.

    Steps, in order:
      1. lowercase, split on whitespace, and drop tokens found in `stop_words`;
      2. strip every character contained in `exclude`;
      3. lemmatize each remaining whitespace-separated word with `lemma`.

    Returns the cleaned words joined by single spaces.
    """
    kept_tokens = []
    for token in tweets.lower().split():
        if token not in stop_words:
            kept_tokens.append(token)
    stop_free = " ".join(kept_tokens)

    # Character-level filter: punctuation is removed after stop-word filtering.
    punc_free = ''.join(filter(lambda ch: ch not in exclude, stop_free))

    lemmas = [lemma.lemmatize(word) for word in punc_free.split()]
    return " ".join(lemmas)

In [None]:
# Wall-clock timing of the cleaning step.
# timeit.timeit() with no arguments benchmarks a `pass` statement and returns
# how long that took, so subtracting two such values is meaningless.
# timeit.default_timer() returns an actual clock reading.
start = timeit.default_timer()

# One list of cleaned tokens per tweet in the selected time frame.
doc_clean = [clean(tweets).split() for tweets in timeframe['read_text_clean2']]
end = timeit.default_timer()

# Elapsed time in minutes.
print((end - start)/60)

In [None]:
# Build the term dictionary of the corpus: every unique term is assigned an index.
dictionary = corpora.Dictionary(documents=doc_clean)

# Turn each tokenised document into its bag-of-words representation using the
# dictionary above; together they form the document-term matrix.
doc_term_matrix = list(map(dictionary.doc2bow, doc_clean))

## Train LDA Model

In [None]:
# Sweep the number of topics from 2 to 19 and record the c_v coherence of each
# fitted model. The random_state is fixed so the sweep is repeatable.
coherence_rows = []

for t in range(2, 20):
    cov_model = LdaModel(
        corpus=doc_term_matrix,
        id2word=dictionary,
        num_topics=t,
        random_state=66826,
    )
    cm = CoherenceModel(
        model=cov_model,
        dictionary=dictionary,
        texts=doc_clean,
        coherence='c_v',
    )
    coherence_rows.append({'topic': t, 'score': cm.get_coherence()})

# One row per candidate topic count.
results = pd.DataFrame(coherence_rows, columns=['topic', 'score'])
results

In [None]:
# Coherence score (y) against number of topics (x).
s = pd.Series(data=results['score'].values, index=results['topic'].values)
_ = s.plot()

In [None]:
# Fit the model that is used for the rest of the notebook (14 topics, same
# seed as the sweep) and compute its c_v coherence.
cov_model = LdaModel(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=14,
    random_state=66826,
)

cm = CoherenceModel(
    model=cov_model,
    dictionary=dictionary,
    texts=doc_clean,
    coherence='c_v',
)
coherence = cm.get_coherence()

In [None]:
# Report the c_v coherence of the fitted model, then list 4 topics with 20 words each.
print(coherence)
cov_model.print_topics(num_topics=4, num_words=20)

In [None]:
# All topics (num_topics=-1) as (topic_id, [(word, weight), ...]) pairs with
# 20 words each. The result is stored because the word-cloud cells further
# down slice `topics`; previously it was only displayed, leaving `topics`
# undefined on a fresh kernel.
topics = cov_model.show_topics(num_topics=-1, num_words=20, formatted=False)
topics

Get topics for each document

In [None]:
# Topic distribution of every document in the corpus, using the model's
# default thresholds and without per-word topic assignments.
doc_topics = cov_model.get_document_topics(
    doc_term_matrix,
    minimum_probability=None,
    minimum_phi_value=None,
    per_word_topics=False,
)

In [None]:
# For every document, keep its most probable topic next to the tweet text and
# its four health-belief columns. The result is a list of one-row DataFrames
# (columns: topic_num, prob, then the selected tweet columns), one per document.
belief_columns = ['read_text_clean2', 'Perceived_susceptibility', 'Perceived_severity',
                  'Perceived_benefits', 'Perceived_barriers']
# Column selection does not depend on the document, so do it once.
timeframe_beliefs = timeframe[belief_columns]

doc_topic_max = []

for d, topic_probs in enumerate(doc_topics):
    # Each entry is a (topic_id, probability) pair. A bare max() compares the
    # topic id first and so returns the highest-numbered topic; compare on the
    # probability instead.
    topic_num, prob = max(topic_probs, key=lambda pair: pair[1])

    # Row d of the time frame, re-indexed from 0 as before.
    topic_df = timeframe_beliefs.iloc[[d]].reset_index(drop=True)
    topic_df.insert(0, 'prob', prob)
    topic_df.insert(0, 'topic_num', topic_num)
    doc_topic_max.append(topic_df)

In [None]:
# Stack the per-document rows into a single frame and display it.
doc_topic_max_df = pd.concat(objs=doc_topic_max)
doc_topic_max_df

In [None]:
# `top_topics` was displayed without ever being assigned (NameError on a fresh
# kernel). Compute it from the fitted model: topics paired with a coherence
# score, 20 words per topic.
# NOTE(review): assumes the intent was LdaModel.top_topics on the training
# corpus -- confirm.
top_topics = cov_model.top_topics(corpus=doc_term_matrix, topn=20)
top_topics

In [None]:
# Per dominant topic: sum of each health-belief column plus the number of
# documents assigned to that topic.
belief_sum_columns = ['Perceived_susceptibility', 'Perceived_severity',
                      'Perceived_benefits', 'Perceived_barriers']

# Select the columns with a list: the bare-tuple form
# (`groupby(...)['a', 'b']`) was removed in pandas 2.0.
doc_topic_matrix = doc_topic_max_df.groupby(['topic_num'])[belief_sum_columns].sum().reset_index()
doc_topic_matrix['count'] = doc_topic_matrix['topic_num'].map(doc_topic_max_df['topic_num'].value_counts())
doc_topic_matrix

In [None]:
# Palette for the word clouds.  more colors: 'mcolors.XKCD_COLORS'
cols = list(mcolors.TABLEAU_COLORS.values())


def _topic_color(*args, **kwargs):
    """Colour every word with the palette entry for the current value of the
    global `i` (looked up at call time, not at definition time)."""
    return cols[i]


cloud = WordCloud(
    background_color='black',
    width=2500,
    height=1800,
    max_words=20,
    colormap='tab10',
    color_func=_topic_color,
    prefer_horizontal=1.0,
)

In [None]:
# NOTE(review): `topic_words` is only assigned inside the word-cloud loop in a
# later cell, so on a fresh kernel (Restart & Run All) this cell raises
# NameError -- confirm whether it belongs after that loop.
topic_words

In [None]:
# Peek at the entries at positions 11-13 of `topics` (the slice end is exclusive).
topics[11:14]

In [None]:
# 2x2 grid of word clouds for the topics at positions 10-13 of `topics`.
selected_topics = topics[10:14]

fig, axes = plt.subplots(2, 2, figsize=(10,10), sharex=True, sharey=True)

# The loop variable must stay a module-level `i`: the colour function given to
# `cloud` reads `cols[i]` when the cloud is generated.
for i, (ax, (topic_id, word_weights)) in enumerate(zip(axes.flatten(), selected_topics)):
    topic_words = dict(word_weights)
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    # Draw on the explicit axes rather than relying on pyplot's current-axes state.
    ax.imshow(cloud)
    # Title with the real topic id; the subplot position (0-3) mislabelled
    # the topics being shown.
    ax.set_title('Topic ' + str(topic_id), fontdict=dict(size=16))
    ax.axis('off')

If I were to subset Topic 1, what are the most frequent health belief classifications?

# Save Dataframe