In [11]:
import pandas as pd
import numpy as np
import sys
import umap
import spacy
import json
import datetime
import zipfile
import scipy.stats
import scattertext as st
from sklearn.decomposition import PCA
from gensim.models import word2vec
import re
from glob import glob
from scipy.stats import rankdata
from IPython.display import IFrame
# Import from the public IPython.display module; importing `display` from
# IPython.core.display is deprecated and fails on recent IPython releases.
from IPython.display import display, HTML
# Widen the classic-notebook container so the embedded explorer fits on screen.
display(HTML("<style>.container { width:98% !important; }</style>"))
import matplotlib.pyplot as plt

In [2]:
# Open the article archive read-only; later cells read its members through `zf`.
zf = zipfile.ZipFile(file='2_May.zip', mode='r')

In [3]:
# Load every JSON article in the archive into one DataFrame, keeping only the
# fields used downstream.
fields = ['content', 'link', 'published', 'author', 'title', 'source']
data = []
for member in zf.filelist:  # entries are ZipInfo objects, not plain file names
    # Directory entries carry no JSON payload; skip them instead of crashing.
    if member.filename.endswith('/'):
        continue
    # Close each member handle promptly rather than leaking one per article.
    with zf.open(member) as member_file:
        article = json.load(member_file)
    # A missing field raises KeyError, as before, so malformed articles are
    # surfaced rather than silently dropped.
    data.append({field: article[field] for field in fields})
df = pd.DataFrame(data)

In [4]:
# Parse each article body with scattertext's whitespace-based parser and keep
# the parsed document alongside the raw text in a new 'parsed' column.
df['parsed'] = df['content'].apply(
    st.whitespace_nlp_with_sentences
)

In [5]:
# Build a scattertext corpus from the pre-parsed documents, using the news
# source as the category, then reduce it to its unigram corpus.
corpus = (
    st.CorpusFromParsedDocuments(df, category_col='source', parsed_col='parsed')
    .build()
    .get_unigram_corpus()
)

In [7]:
# Derive a corpus whose metadata features come from the categories (news sources).
# NOTE(review): presumably each publisher becomes one metadata feature; later
# cells read them back via get_metadata() and pass use_non_text_features=True.
publisher_metadata = corpus.use_categories_as_metadata()

In [27]:
# Frequency table from the corpus as a float32 matrix. The '' argument is
# presumably the column-label suffix (none) — TODO confirm against scattertext.
term_counts = publisher_metadata.get_term_freq_df('').values.astype('f')

# Centre and scale each row independently.
row_mean = term_counts.mean(axis=1, keepdims=True)
row_var = term_counts.var(axis=1, keepdims=True)
# A constant row has zero variance; dividing by it would give NaN and make PCA
# raise. Its centred values are already all zero, so a divisor of 1 leaves them at 0.
row_var[row_var == 0] = 1.0
# NOTE(review): this divides by the variance, not the standard deviation, so it
# is not a conventional z-score — confirm that is intended.
embeddings = (term_counts - row_mean) / row_var

# PCA treats each column of the table as one sample, hence the transpose;
# keep the first 20 principal components.
pca_proj = PCA(n_components=20).fit_transform(embeddings.T)


In [29]:
# Pair each metadata label with its first two principal components; the
# explorer looks coordinates up by the 'term' index.
projection = pd.DataFrame({
    'term': publisher_metadata.get_metadata(),
    'x': pca_proj.T[0],
    'y': pca_proj.T[1],
}).set_index('term')

# Render the interactive explorer on the metadata (non-text) features, with
# 'BBC' as the focal category.
html = st.produce_pca_explorer(publisher_metadata,
                               category='BBC',
                               show_axes=False,
                               use_non_text_features=True,
                               show_top_terms=False,
                               projection=projection,
                               max_docs_per_category=100)

file_name = 'publisher_projection_0_1.html'
# Write with an explicit UTF-8 encoding and close the handle deterministically:
# the articles contain non-ASCII text, which can fail under a platform default
# encoding, and an unclosed handle may not be flushed before the IFrame loads.
with open(file_name, 'w', encoding='utf-8') as html_file:
    html_file.write(html)

# Last expression of the cell: embeds the saved page in the notebook output.
IFrame(src=file_name, width=1400, height=900)

In [37]:
# For every category, print its name followed by the ten terms with the
# highest scaled F-score for that category.
tdf = publisher_metadata.get_term_freq_df()
for category in publisher_metadata.get_categories():
    print(category)
    # The 'score' column is overwritten on each pass, so after the loop it
    # holds the scores of the last category only.
    tdf['score'] = publisher_metadata.get_scaled_f_scores(category)
    ranked = tdf.sort_values(by='score', ascending=False)
    top_terms = ranked.index[:10]
    print(list(top_terms))

Activist Post
['catherine', 'holistic', 'amazon', 'helmets', 'smart', 'soros', 'nsa', 'books', 'com', 'meters']
Addicting Info
['â', 'jackson', 'twitter', 'yates', 'fired', 'healthcare', 'donald', 'flynn', 'dickerson', 'trump']
Alternative Media Syndicate
['pipeline', 'dakota', 'spill', 'merck', 'ancient', 'honey', 'tribe', 'environmental', 'oil', 'spills']
AP
['ap', 'photo', 'associated', 'arabia', 'saudi', 'king', 'leaders', 'riyadh', 'quantico', 'trip']
BBC
['bbc', 'league', 'uk', 'season', 'ms', 'players', 'final', 'mr', 'says', 'macron']
Bipartisan Report
['tweet', 'apparently', 'however', 'yates', 'twitter', 'flynn', 'page', 'fact', 'ties', 'regarding']
Breitbart
['breitbart', 'texas', 'follow', 'israel', 'immigration', 'jerusalem', 'reported', 'county', 'twitter', 'percent']
Business Insider
['xi', 'transcript', 'murray', 'zuma', 'hett', 'china', 'trade', 'macron', 'rules', 'cruz']
CBS News
['cbs', 'p', 'clapper', 'dickerson', 'yates', 'mr', 'm', 'sen', 'john', 'reports']
CNBC
[

['examiner', 'healthcare', 'r', 'washington', 'sen', 'percent', 'gop', 'senate', 'obamacare', 'secrets']
Yahoo News
['yahoo', 'photo', 'duterte', 'sen', 'tuesday', 'press', 'r', 'comey', 'spicer', 'wednesday']
Young Conservatives
['check', 'holt', 'voter', 'testify', 'id', 'memorial', 'devos', 'breaking', 'looks', 'convention']
BuzzFeed
['espn', 'buzzfeed', 'russo', 'perez', 'jacobs', 'gianforte', 'heritage', 'letters', 'sessions', 'passion']
NODISINFO
['arch', 'staged', 'wounds', 'purported', 'hoax', 'zionist', 'wound', 'scam', 'moulage', 'imagery']
Prntly
['yahya', 'tania', 'nordic', 'arabic', 'abu', 'cerantonio', 'denmark', 'caliphate', 'jihadist', 'jihadists']
Slate
['stephens', 'ahca', 'heritage', 'medicaid', 'mensch', 'caucus', 'macarthur', 'upton', 'cuts', 'cbo']
The Burrard Street Journal
['neighbour', 'bsj', 'terrific', '45th', 'canada', 'shaking', 'goofball', 'goaded', 'undervaluing', 'amigo']
The Borowitz Report
['borowitz', 'slamming', 'temp', 'dorrinson', 'spies', 'alert',