# Extra code from Tuesday

In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing

import nltk
from nltk import tokenize
from nltk.corpus import stopwords
from nltk import SnowballStemmer
from sklearn.decomposition import LatentDirichletAllocation

import matplotlib.pyplot as plt
import seaborn as sns

import pickle


# Reloading a trained pipeline

The model below is more or less the one we used in Tuesday's class. It's been saved as a "pickle", a way of storing Python objects that are too complex to represent as a .csv file. You can see an example of how this was saved in the `serializeLDA.py` file in this folder.


For now, though, we'll just read in the model:


In [None]:
# Read the saved pipeline back from disk.
# NOTE: unpickling can execute arbitrary code, so only load pickle files you trust.
with open('newstopics15.pickle', 'rb') as pickle_handle:
    lda_pipeline = pickle.load(pickle_handle)

# Pull out the two pieces of the pipeline that later cells use directly:
# the vocabulary (position -> word) from the vectorizer step ...
features = lda_pipeline['vectorizer'].get_feature_names_out()
# ... and the LDA step itself
lda = lda_pipeline['lda']

We can use this pipeline to get the topic distribution for new texts: 

In [None]:
# Two short example texts to run through the already-trained pipeline
example_texts = [
    'This is a text about the covid 19 virus.',
    'This is about the border and immigration',
]
newdoctopics = lda_pipeline.transform(example_texts)
# Display the result (one entry per input text)
newdoctopics

In [None]:
# For each of the two example documents, find the position of its largest topic weight
[np.argmax(doc_weights) for doc_weights in newdoctopics]

We can also retrieve the top terms associated with each topic using the `getTopicTerms` function in the text_functions script:

In [None]:
# getTopicTerms lives in the local text_functions script (not shown in this notebook)
from text_functions import getTopicTerms
# Ask for 10 terms per topic, using the LDA model and vocabulary loaded above.
# NOTE(review): later cells read 'topic' and 'keywords' columns from this result -- confirm against text_functions.py
top_terms = getTopicTerms(lda, features, n_terms=10)
# Preview the first few rows
top_terms.head()


## Refitting a model

This is somewhat redundant, since we already trained the model on these documents, but the fitted document-topic distributions aren't saved in the `lda_pipeline` object, so we'll need to import the articles and then use `transform` to apply the topic model to our documents again.

First, we read in the articles:

In [None]:
import html

# Download the sample of news articles
articles = pd.read_csv('https://github.com/Neilblund/APAN/raw/main/news_sample.csv')
# Trim leading/trailing whitespace from the headlines
articles['headline'] = articles.headline.str.strip()
# Build a clickable HTML link for each article. The href is quoted and both values are
# HTML-escaped so that headlines or URLs containing characters such as &, <, > or quotes
# cannot break the markup. str() keeps missing values rendering as 'nan', as before.
articles['hyperlink'] = [
    f'<a href="{html.escape(str(url))}">{html.escape(str(headline))}</a>'
    for url, headline in zip(articles['url'], articles['headline'])
]

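Next, we'll apply the already-trained LDA model to our articles. Note that `transform` does not re-fit the model; it only computes the topic distribution for each document: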

In [None]:

# Run every article's text through the trained pipeline
doctopics = lda_pipeline.transform(articles['text'])
# Wrap the result in a data frame and label the columns "topic 0", "topic 1", ...
topic_memberships = pd.DataFrame(doctopics).rename(
    columns=lambda position: "topic " + str(position)
)

# Getting topics associated with a particular source

How would we identify the topics most strongly associated with Fox News or CNN? One way to do this would be to just group by source and then calculate the average % for each topic in Fox vs. CNN:

In [None]:
# Average each topic's weight within every news source, then flip the table so that
# topics are rows, sources are columns, and the topic number sits in a 'topic' column
topics_by_source = (
    pd.DataFrame(doctopics)
    .groupby(articles['source'])
    .mean(numeric_only=True)
    .T
    .reset_index(names='topic')
)
topics_by_source

Better yet, we could calculate the logged ratio of "% topic k in Fox News articles compared to % topic k in CNN articles". This will make it so that topics more associated with Fox News will have a positive log-ratio, whereas topics more associated with CNN will have a negative log-ratio.

In [None]:
# log(Fox average / CNN average) per topic: above zero leans Fox News, below zero leans CNN
fox_to_cnn = topics_by_source["Fox News"].div(topics_by_source["CNN"])
topics_by_source['logratio'] = np.log(fox_to_cnn)
topics_by_source

Now we can visualize the results as a bar graph. We'll also add some annotations to each bar to show the keywords for each topic and color-code the results to make things a little more visually interesting:

In [None]:
# Label each topic with the keywords from its first five rows in top_terms, comma-separated
labels = (
    top_terms
    .groupby('topic').head(n=5)[['keywords', 'topic']]
    .groupby('topic')
    .aggregate(lambda words: ', '.join(words))
)

# Attach the labels to the log ratios and order the topics from lowest to highest ratio.
# reset_index() renumbers the rows 0, 1, 2, ... in that sorted order; the row number is
# reused below as the vertical position of each annotation.
topics_labeled = (
    pd.merge(topics_by_source, labels, on='topic')
    .sort_values('logratio')
    .reset_index()
)
# Store the topic number as text (presumably so the y-axis is treated as categorical)
topics_labeled['topic'] = topics_labeled['topic'].astype('str')

# Horizontal bars: one per topic, bar length and color both driven by the log ratio
fig, ax = plt.subplots(figsize=(10, 6))
g = sns.barplot(
    data=topics_labeled,
    x='logratio',
    y='topic',
    hue='logratio',
    dodge=False,
    palette='viridis',
)
ax.set(
    xlabel='Topic associations CNN (negative values) \nvs.\n Fox News (positive values)',
    ylabel='topic',
)
g.legend_.remove()

# Write each topic's keywords on the empty side of the zero line, opposite its bar
for bar_position, bar_row in topics_labeled.iterrows():
    if bar_row['logratio'] < 0:
        text_x, text_align = .01, 'left'
    else:
        text_x, text_align = -.01, 'right'
    ax.text(text_x, bar_position, bar_row['keywords'],
            fontsize=9, horizontalalignment=text_align)

# Making a custom table

Next, we might like to have a look at some example articles associated with each of our topics. We'll also use a little HTML formatting to make a hyperlink that you can click to read the article itself.


In [None]:
# Place each article's topic weights side by side with its original columns
articles_with_topics = pd.concat((articles, topic_memberships), axis='columns')


In [None]:


n_terms = 10  # number of keywords listed for each topic
n_docs = 2    # number of example articles listed for each topic
top_documents = []
# One entry per topic column ("topic 0", "topic 1", ...); the column position doubles
# as the row index into lda.components_
top_index = topic_memberships.columns.values.tolist()
for i, label in enumerate(top_index):
    # Articles with the largest weight on this topic. Taking exactly n_docs rows here
    # (rather than a fixed 5) means raising n_docs above 5 actually shows more articles.
    top_n_documents = articles_with_topics.sort_values(label, ascending=False).head(n_docs)
    # Vocabulary positions of this topic's highest-weighted terms, largest first
    top_term_positions = np.argsort(lda.components_[i])[::-1][:n_terms]
    terms = {
        'topic': i,
        'mean proportion': np.mean(topic_memberships[label]),
        'docs': '<br>'.join(top_n_documents['hyperlink'].to_list()),
        'terms': ', '.join(features[j] for j in top_term_positions),
    }
    top_documents.append(terms)
# .style returns a Styler, which renders the embedded HTML links in the notebook
pd.DataFrame(top_documents).reset_index(drop=True).style


# Getting topics over time

I might also want to see how coverage changes over time. For instance, topic 7 is mostly related to abortion and Roe v. Wade. Maybe I want to see how coverage of that issue has increased or decreased over time, or identify where it peaks. Here's one way I could do that:

In [None]:
# converting the date to a date_time object type
articles_with_topics['pubdate'] = pd.to_datetime(articles_with_topics['date'])

# grouping by month and getting the average coverage for each topic by month/source
monthly_topic_coverage =articles_with_topics.groupby([ pd.Grouper(key='pubdate', freq='M'),'source']).mean(numeric_only=True)

Here's what the data looks like now:

In [None]:
# Show the first five month/source rows
monthly_topic_coverage.head(n=5)

Now I can make a line plot with publication month on the x-axis and average topic proportion on the y-axis:

In [None]:
# One line per source: average weight of topic 7 in each publication month
fig, ax = plt.subplots(figsize=(14, 6))
sns.lineplot(data=monthly_topic_coverage, x='pubdate', y='topic 7', hue='source')
ax.set_title('Abortion coverage by source');

# Plotting Documents by topic

Finally, I might want to visualize the entire corpus in a scatter plot. I'll use `TSNE` to reduce the dimensionality of my document-topic distribution from 15 columns down to just 2, then I can use these as the x and y coordinates in a scatter plot. I'll also color-code the results by topic:

In [None]:
from sklearn.manifold import TSNE

# Keep the first 15 columns of the topic weights as the input to t-SNE
topics = topic_memberships.iloc[:, :15]
# Fixed random_state so the layout is reproducible between runs
tsne = TSNE(random_state=999, perplexity=30, early_exaggeration=120)
# Project each document down to two coordinates, named x and y for plotting
embedding = pd.DataFrame(tsne.fit_transform(topics), columns=['x', 'y'])
# Column position of the largest weight in each row, i.e. each document's dominant topic
embedding['max_topic'] = topics.to_numpy().argmax(axis=1)

In [None]:
# Combine the articles (with topic weights) and their 2-D coordinates into one table
topic_information = pd.concat((articles_with_topics, embedding), axis='columns')


Now I'll make the scatter plot with Bokeh:

In [None]:
# Bokeh pieces for the interactive scatter plot built in the cells below
from bokeh.io import output_notebook
from bokeh.plotting import figure, show,  output_file, save
from bokeh.models import  ColorBar, LinearColorMapper, CrosshairTool, Span, BasicTicker
from bokeh.transform import transform
import bokeh.palettes
# Tell Bokeh to display its output inside the notebook
output_notebook()

In [None]:
# Lookup table: topic number (converted back to an integer) -> keyword label
topic_ids = topics_labeled['topic'].astype(int)
topic_keywords = topics_labeled['keywords']
topic_dictionary = dict(zip(topic_ids, topic_keywords))

# Give every document the keyword label of its dominant topic
topic_information['topic_labels'] = topic_information['max_topic'].map(topic_dictionary)

In [None]:

# Take a 15-color palette and pair each color with a topic number (0-14)
topic_colors = bokeh.palettes.d3['Category20b'][15]
color_dictionary = dict(enumerate(topic_colors))
# Give every document the color of its dominant topic
topic_information['color'] = topic_information['max_topic'].map(color_dictionary)


In [None]:
# Customized HTML tooltip. The parts with an @colname will be filled in with data from my data frame.
# Columns referenced here: headline, date, topic_labels and source.
# NOTE(review): ($x, $y) are presumably Bokeh's special fields for the cursor position -- confirm in the Bokeh hover docs.
TOOLTIPS = """
    <div style="width:400px;">
        <div>
        </div>
        <div>
            <span style="font-size: 17px; font-weight: bold;">@headline</span>
                <div>
                    <span>@date</span>
                </div>
            <br>
        </div>
        
        <div>
            <span style="font-size: 12px; color: #966;"><strong>Topic:</strong> @topic_labels</span>
        </div>
        <div>
            <span style="font-size: 12px"><strong>Source:</strong> @source</span>
        </div>
        <div>
            <span style="font-size: 15px;">Location</span>
            <span style="font-size: 10px; color: #696;">($x, $y)</span>
        </div>
    </div>
"""

# Axis limits: the extent of the embedding, padded by one unit on every side
x_limits = (min(topic_information.x) - 1, max(topic_information.x) + 1)
y_limits = (min(topic_information.y) - 1, max(topic_information.y) + 1)

p = figure(
    title="Fox and CNN articles",
    tooltips=TOOLTIPS,
    x_range=x_limits,
    y_range=y_limits,
    width=2000,
    height=900,
    x_axis_label="Dim 1",
    y_axis_label="Dim 2",
    toolbar_location='above',
)

# Add the points one topic at a time, in ascending topic order, so that every topic
# gets its own legend entry that can be toggled independently
for i in sorted(topic_information['max_topic'].unique()):
    data = topic_information.loc[topic_information['max_topic'] == i]
    p.scatter(
        x='x',
        y='y',
        source=data,
        legend_label=topic_dictionary[i],
        marker='circle',
        color='color',  # read from the 'color' column of the data
        line_color='black',
        alpha=.8,
        size=12,
    )

# Legend styling; clicking a legend entry hides that topic's points
p.legend.click_policy = "hide"
p.legend.location = "top_right"
p.legend.label_text_font_size = '20pt'
p.legend.label_standoff = 30
# Move the legend out of the plotting area, to the right of the figure
p.add_layout(p.legend[0], 'right')

show(p)