In [None]:
from src.process_data import load_tweet_data, remove_duplicates, format_dates, timeframe, clean_text, create_dictionary
from src.train_lda import topic_eval, get_max_k, train_lda, get_doc_topic_matrix

In [None]:
pip list

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from gensim import corpora
from gensim.test.utils import common_corpus, common_dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel
import string
import pandas as pd

# Download the NLTK resources used below (stop-word list and WordNet data).
nltk.download('stopwords')
nltk.download('wordnet')

# Extra tokens to drop during cleaning: single lowercase letters and the
# numeric strings '0' through '9998' (range(0, 9999) excludes its upper bound).
alphabet_remove = list(string.ascii_lowercase)
number_remove = [str(number) for number in range(0, 9999)]

# English stop words extended with the single letters and numeric strings.
stop_words = set(stopwords.words('english')).union(number_remove, alphabet_remove)

# Punctuation characters, passed to clean_text as `exclude`.
exclude = set(string.punctuation)

# Lemmatizer object, passed to clean_text as `lemma` (created once).
lemma = WordNetLemmatizer()

# Process Data

In [None]:
# Load the tweet data from the external constructs file; the commented-out
# line below is an alternative path pointing at a sample tweets file.
tweet_data = load_tweet_data(data_path = 'data/external/constructs.csv')
#tweet_data = load_tweet_data(data_path = 'data/sample/tweets.csv')

In [None]:
# Pass the loaded tweets through remove_duplicates (imported from src.process_data).
# Note that this rebinds `tweet_data`, so the raw frame is no longer available afterwards.
tweet_data = remove_duplicates(tweet_data)

In [None]:
# Run format_dates on the de-duplicated tweets; the result is kept under a new name.
tweet_data_formatted = format_dates(tweet_data)

In [None]:
# timeframe returns two values: the data (rebinding tweet_data_formatted) and
# input_date, which later cells use when building output file names.
# NOTE(review): presumably this subsets the tweets relative to '2020-01-01' — confirm in src.process_data.
tweet_data_formatted, input_date = timeframe(tweet_data_formatted, '2020-01-01')

In [None]:
# Clean every tweet in 'read_text_clean2' and split the cleaned string on
# whitespace, giving one list of tokens per tweet.
doc_clean = list(map(
    lambda tweet_text: clean_text(tweet_text, exclude, stop_words, lemma).split(),
    tweet_data_formatted['read_text_clean2'],
))

In [None]:
# Build the term dictionary and the bag-of-words corpus (doc_term_matrix) from the token lists.
dictionary, doc_term_matrix = create_dictionary(doc_clean)

# Train LDA Model

In [None]:
def topic_eval(doc_clean, doc_term_matrix, dictionary, top_k, input_date, min_k=4):
    """Evaluate the number of topics (k) to choose via the highest coherence score.

    One LdaModel is trained for every k in ``range(min_k, top_k)`` and its c_v
    coherence is recorded. A line plot of coherence against k is saved to
    ``app/static/<input_date>.png``.

    Args:
        doc_clean: list - tokenised documents (one list of tokens per tweet), passed to CoherenceModel as ``texts``.
        doc_term_matrix: list - bag of words matrix with frequency of each term mapped to dictionary id.
        dictionary: corpora.dictionary - dictionary mapping each term to its integer id.
        top_k: int - exclusive upper bound on the number of topics to test (the largest k tried is top_k - 1).
        input_date: str - date that the user selected to subset the data; used as the plot's file name.
        min_k: int - smallest number of topics to test. Defaults to 4.

    Return:
        lda_results: dataframe - one row per tested k, with columns 'topic' and 'score'.

    Raises:
        ValueError: if ``top_k <= min_k``, i.e. there is no k to evaluate.
    """
    if top_k <= min_k:
        # Fail early with a clear message instead of an obscure plotting error on an empty frame.
        raise ValueError(
            "top_k (%r) must be greater than min_k (%r) to evaluate at least one model." % (top_k, min_k)
        )

    results = []

    for t in range(min_k, top_k):
        # Fixed random_state keeps the evaluation reproducible between runs.
        cov_model = LdaModel(corpus = doc_term_matrix, id2word = dictionary, num_topics = t, random_state=66826)

        cm = CoherenceModel(model=cov_model, dictionary=dictionary, texts=doc_clean, coherence='c_v')
        results.append((t, cm.get_coherence()))

    lda_results = pd.DataFrame(results, columns=['topic', 'score'])

    # Plot coherence against number of topics and save the figure.
    s = pd.Series(lda_results.score.values, index=lda_results.topic.values)

    pltk = s.plot()
    pltk.set_xlabel('number of topics (k)')
    pltk.set_ylabel('c_v coherence')
    fig = pltk.get_figure()
    # The directory separator is required: "app/static" + input_date would write
    # "app/static<date>.png" into app/ rather than a file inside app/static/.
    fig.savefig("app/static/" + input_date + ".png")

    return lda_results

In [None]:
# Score the candidate topic counts. The notebook's topic_eval loops over
# range(4, top_k), so top_k = 6 evaluates k = 4 and k = 5.
lda_results = topic_eval(doc_clean, doc_term_matrix, dictionary, top_k = 6, input_date = input_date)

In [None]:
# Display the coherence score recorded for each topic count.
lda_results

In [None]:
def get_max_k(lda_results):
    """Return the number of topics with the highest coherence score.

    Args:
        lda_results: dataframe - scores from the k topic evaluation, with columns 'topic' and 'score'.

    Return:
        max_k: int - value of 'topic' on the row with the highest 'score'.

    Raises:
        ValueError: if ``lda_results`` has no rows.
    """
    if len(lda_results) == 0:
        raise ValueError("lda_results is empty; there is no topic count to select.")

    # argmax is positional, so it is paired with iloc (this works whatever the
    # frame's index labels are). int() converts the numpy scalar to a plain
    # Python int, which is what LdaModel(num_topics=...) expects; the previous
    # expression returned a one-element Series instead.
    best_row = lda_results['score'].argmax()
    max_k = int(lda_results['topic'].iloc[best_row])

    return max_k

# Pick the best-scoring number of topics from the evaluation results.
max_k = get_max_k(lda_results)

In [None]:
def train_lda(max_k):
    """Fit an LDA model with ``max_k`` topics and measure its c_v coherence.

    The corpus, dictionary and tokenised texts are not parameters: they are
    read from the notebook-level names ``doc_term_matrix``, ``dictionary``
    and ``doc_clean``.

    Args:
        max_k: int - specifies the number of topics the model should generate.

    Return:
        cov_model: trained lda model object.
        coherence_score: value returned by CoherenceModel.get_coherence() for that model.
    """
    lda_model = LdaModel(
        num_topics=max_k,
        corpus=doc_term_matrix,
        id2word=dictionary,
        random_state=66826,
    )

    coherence_score = CoherenceModel(
        model=lda_model,
        texts=doc_clean,
        dictionary=dictionary,
        coherence='c_v',
    ).get_coherence()

    return lda_model, coherence_score

# Train the final model on the best k selected from the evaluation results.
# (`top_k` is not assigned until a later cell, so passing it here raises a
# NameError on a fresh kernel; the function is documented to take max_k.)
cov_model, coherence_score = train_lda(max_k)

In [None]:
# At this point get_doc_topic_matrix is the version imported from src.train_lda
# (the notebook's own definition only appears in a later cell).
doc_topic_max_df = get_doc_topic_matrix(cov_model)

In [None]:
# Manual version of the topic evaluation: train one LDA model per candidate k
# in range(4, top_k) and record its c_v coherence.
results = []
top_k = 7
for t in range(4, top_k):
    # num_topics must be the loop value t. Passing top_k here trained the same
    # 7-topic model on every iteration, so every recorded score was for k = 7.
    cov_model = LdaModel(corpus = doc_term_matrix, id2word = dictionary, num_topics = t, random_state=66826)

    cm = CoherenceModel(model=cov_model, dictionary=dictionary, texts=doc_clean, coherence='c_v')
    score = cm.get_coherence()
    tup = t, score
    results.append(tup)

# One row per tested k.
results = pd.DataFrame(results, columns=['topic', 'score'])
results

In [None]:
# Plot coherence score against number of topics; assigning the result to `_`
# keeps the Axes repr out of the cell output.
s = pd.Series(results.score.values, index=results.topic.values)
_ = s.plot()

In [None]:
# Draw the coherence plot again and save its figure to disk.
pltk = s.plot()
fig = pltk.get_figure()
fig.savefig("figures/March_topic_k.png")
#pltk.savefig('figures/Jan_topic_k.png')

In [None]:
# Number of topics with the highest coherence score, as a plain Python int so
# it can be passed directly to LdaModel(num_topics=...). Selecting with
# [['topic']] produced a one-element Series rather than a scalar.
max_k = int(results['topic'].iloc[results['score'].argmax()])
max_k

In [None]:
# Train the final model with the selected number of topics (fixed random_state
# for reproducibility) and print its c_v coherence.
cov_model = LdaModel(corpus = doc_term_matrix, id2word = dictionary, num_topics = max_k, random_state=66826)

cm = CoherenceModel(model=cov_model, dictionary=dictionary, texts=doc_clean, coherence='c_v')
coherence = cm.get_coherence()
print(coherence)

In [None]:
#doc_topic_max_df = get_doc_topic_matrix(cov_model)

In [None]:
# Topic assignments for every document in the corpus; the None arguments leave
# gensim's probability/phi thresholds at the model's defaults.
doc_topics = cov_model.get_document_topics(doc_term_matrix, minimum_probability=None, minimum_phi_value=None, per_word_topics=False)

In [None]:
def get_doc_topic_matrix(lda_model, corpus=None, tweets=None):
    """Calculate the topic probabilities of each tweet and assign the topic with the highest probability.

    Args:
        lda_model: lda object - trained lda model object.
        corpus: list - bag of words corpus to score. Defaults to the notebook-level ``doc_term_matrix``.
        tweets: dataframe - tweets aligned row-for-row with ``corpus``. Defaults to the
            notebook-level ``tweet_data_formatted``.

    Return:
        doc_topic_max_df: dataframe - one row per tweet with columns 'topic_num' and 'prob'
            (the most probable topic and its probability) followed by 'read_text_clean2' and
            the four 'Perceived_*' columns of that tweet.

    Raises:
        ValueError: if ``tweets`` has fewer rows than there are documents in ``corpus``.
    """
    if corpus is None:
        corpus = doc_term_matrix
    if tweets is None:
        # `timeframe` is the function imported from src.process_data and cannot
        # be indexed; the tweet data itself lives in `tweet_data_formatted`.
        tweets = tweet_data_formatted

    tweet_columns = ['read_text_clean2', 'Perceived_susceptibility', 'Perceived_severity',
                     'Perceived_benefits', 'Perceived_barriers']

    doc_topics = lda_model.get_document_topics(corpus, minimum_probability=None, minimum_phi_value=None, per_word_topics=False)

    # Every document yields a list of (topic_id, probability) pairs. Compare on
    # the probability: a bare max() orders tuples by topic_id first and would
    # return the highest-numbered topic instead of the most probable one.
    top_topics = [max(topic_probs, key=lambda pair: pair[1]) for topic_probs in doc_topics]

    if len(tweets) < len(top_topics):
        raise ValueError(
            "tweets has %d rows but the corpus has %d documents." % (len(tweets), len(top_topics))
        )

    # Build both halves once instead of concatenating one single-row frame per tweet.
    # dtype=float keeps 'topic_num' and 'prob' as float columns.
    topic_df = pd.DataFrame(top_topics, columns=['topic_num', 'prob'], dtype=float)
    tweet_df = tweets[tweet_columns].iloc[:len(top_topics)].reset_index(drop=True)

    doc_topic_max_df = pd.concat([topic_df, tweet_df], axis=1)

    return doc_topic_max_df

In [None]:
# Rebuild the per-tweet topic table, now using the notebook's own
# get_doc_topic_matrix defined in the previous cell.
doc_topic_max_df = get_doc_topic_matrix(cov_model)

In [None]:
# Per-topic summary: the total of each 'Perceived_*' column over the tweets
# assigned to a topic, plus the number of tweets in that topic. It is built
# here because get_doc_topic_matrix returns only the per-tweet table, so
# `doc_topic_matrix` is not otherwise defined at notebook level.
belief_cols = ['Perceived_susceptibility', 'Perceived_severity', 'Perceived_benefits', 'Perceived_barriers']
doc_topic_matrix = doc_topic_max_df.groupby('topic_num')[belief_cols].sum().reset_index()
doc_topic_matrix['count'] = doc_topic_matrix['topic_num'].map(doc_topic_max_df['topic_num'].value_counts())

doc_topic_matrix.to_csv('data/timeframe_results/' + input_date + '_topic_matrix.csv', index = False)
doc_topic_matrix.head()

# Create a topics table 

We will have the following columns:

* User filter Date
* Topic_Num
* top terms (string) 

Actually, maybe I just store a few examples of the actual tweets and which topic they had the highest probability of belonging to?

In [None]:
# Keep the three highest-probability tweets for each topic.
doc_topic_max_df_ordered = doc_topic_max_df.sort_values('prob', ascending=False)
top_tweets = doc_topic_max_df_ordered.groupby('topic_num').head(3)

# Order the examples by topic. sort_values returns a new frame, so the result
# has to be assigned (a bare call is a no-op); the stable sort keeps tweets in
# descending-probability order within each topic.
top_tweets = top_tweets.sort_values('topic_num', kind='stable')

# NOTE(review): this date is hard-coded and differs from the '2020-01-01'
# passed to timeframe earlier in the notebook — confirm which one is intended.
top_tweets['date'] = '2020-03-01'
top_tweets = top_tweets[['date', 'topic_num', 'prob', 'read_text_clean2', 'Perceived_susceptibility', 'Perceived_severity', 'Perceived_benefits', 'Perceived_barriers']]

# Only the text column changes name: 'read_text_clean2' becomes 'tweet'.
top_tweets = top_tweets.rename(columns={'read_text_clean2': 'tweet'})

In [None]:
# Write the selected example tweets to CSV without the index column.
top_tweets.to_csv('data/top_tweets_march.csv', index = False)

# Create Images

In [None]:
import matplotlib.colors as mcolors
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors
from matplotlib import pyplot as plt
%matplotlib inline


# Tableau palette as a list of colour values (more colors: 'mcolors.XKCD_COLORS').
cols = list(mcolors.TABLEAU_COLORS.values())

# Shared word-cloud renderer. color_func ignores its arguments and looks up the
# notebook-level index `i` at call time, so all words of one cloud get cols[i].
cloud = WordCloud(
    width=2500,
    height=1800,
    background_color='white',
    max_words=20,
    prefer_horizontal=1.0,
    colormap='tab10',
    color_func=lambda *_, **__: cols[i],
)

In [None]:
# (topic_id, [(word, weight), ...]) pairs for the trained model's topics.
topics = cov_model.show_topics(formatted=False)

fig, axes = plt.subplots(4,2, figsize=(10,10), sharex=True, sharey=True)

# The loop variable must stay named `i`: the word cloud's color_func reads the
# notebook-level `i` to choose the colour for the topic being drawn.
for i, ax in enumerate(axes.flatten()):
    ax.axis('off')
    if i >= len(topics):
        # Fewer topics than panels: leave the spare panels blank rather than
        # indexing past the end of `topics`.
        continue
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    ax.imshow(cloud)
    ax.set_title('Topic ' + str(i), fontdict=dict(size=20))

# Save once, after every panel has been drawn. Saving inside the loop before
# imshow wrote the file repeatedly and left the last cloud out of the image.
fig.savefig('figures/March_Model.png', facecolor='w')

# Save Dataframe to mysql table

Save doc_topic_matrix? Or do we save the topics with the top words? 

Columns: topic 1, word 1, word 2, and number of health beliefs 

In [None]:
import numpy as np

# Give the frame a clean 0..n-1 index. reset_index returns a new frame, so the
# result has to be assigned; drop=True avoids adding an 'index' column.
top_tweets = top_tweets.reset_index(drop=True)

# Sequential ids offset by 27 so that tweets already added to the table are not copied over.
# NOTE(review): the `topics` model names its key column `read_tweet_id`, while
# this column is `tweet_id` — confirm the name matches the database table.
top_tweets['tweet_id'] = np.arange(len(top_tweets)) + 27

In [None]:
import sqlalchemy
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, MetaData
from sqlalchemy.orm import sessionmaker
from flask_sqlalchemy import SQLAlchemy

In [None]:
# Declarative base class that the table model below registers on.
Base = declarative_base()

class topics(Base):
    """Create a data model for the database to store topics for time periods."""

    __tablename__ = 'topics'

    # Primary key of the table.
    read_tweet_id = Column(Integer, primary_key=True)
    # Date stored as a string of at most 10 characters.
    date = Column(String(10), primary_key=False)
    topic_num = Column(Integer, primary_key=False)
    # A topic probability is fractional, so it needs a floating-point column;
    # an Integer column cannot hold it. Note that create_all does not alter a
    # table that already exists, so an existing table keeps its old column type.
    prob = Column(sqlalchemy.Float, primary_key=False)
    # Tweet text, limited to 300 characters.
    tweet = Column(String(300), primary_key=False)
    Perceived_susceptibility = Column(Integer, primary_key=False)
    Perceived_severity = Column(Integer, primary_key=False)
    Perceived_benefits = Column(Integer, primary_key=False)
    Perceived_barriers = Column(Integer, primary_key=False)

    def __repr__(self):
        """Return a short representation showing the stored tweet text."""
        return '<topics %r>' % self.tweet

def create_db(engine_string: str):
    """Build an engine for ``engine_string`` and create the declared tables.

    Args:
        engine_string: str - Engine string.

    Returns:
        engine: sqlalchemy.engine.base.Engine - engine created from ``engine_string``.

    """
    db_engine = sqlalchemy.create_engine(engine_string)

    # Create the tables of every model registered on Base.
    Base.metadata.create_all(db_engine)

    return db_engine

In [None]:
import os

# Connection settings. Each value can be supplied through an environment
# variable so that real credentials never have to be typed into (and saved
# with) the notebook; the fallbacks are the original placeholder values.
conn_type = "mysql+pymysql"
user = os.environ.get("MYSQL_USER", "admin")
password = os.environ.get("MYSQL_PASSWORD", "PASSWORD HERE")
host = os.environ.get("MYSQL_HOST", "HOST HERE")
port = os.environ.get("MYSQL_PORT", "3306")
db_name = os.environ.get("DATABASE_NAME", "msia423_db")

In [None]:
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String

# Assemble the SQLAlchemy URL from the connection settings, then create the
# engine and the declared tables.
# NOTE(review): user/password are interpolated without URL-escaping — confirm
# they contain no reserved characters such as '@' or '/'.
engine_string = f"{conn_type}://{user}:{password}@{host}:{port}/{db_name}"

engine = create_db(engine_string)

In [None]:
# Append the example tweets to the `topics` table; the DataFrame index is not written.
top_tweets.to_sql(name='topics', con=engine, if_exists = 'append', index=False)

In [None]:
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String

# Rebuild the same connection URL and open a second engine with SQL echoing on.
engine_string = f"{conn_type}://{user}:{password}@{host}:{port}/{db_name}"

engine = create_engine(engine_string, echo = True)
meta = MetaData()

# Declare a one-column `students` table and create it in the database.
# NOTE(review): nothing else in the notebook uses `students` — this looks like
# leftover scaffolding; confirm before running it against the shared database.
students = Table(
   'students', meta, 
   Column('id', Integer, primary_key = True)
)
meta.create_all(engine)