In [7]:
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF as NMF_sklearn
import pickle
from nmf import NMF
import pymongo
from pymongo import MongoClient
import pickle
def build_text_vectorizer(contents, use_tfidf=True, use_stemmer=False, max_features=None):
    '''
    Build and return a **callable** for transforming text documents to vectors,
    as well as a vocabulary to map document-vector indices to words from the
    corpus. The vectorizer will be trained from the text documents in the
    `contents` argument. If `use_tfidf` is True, then the vectorizer will use
    the Tf-Idf algorithm, otherwise a Bag-of-Words vectorizer will be used.
    The text will be tokenized by words, and each word will be stemmed iff
    `use_stemmer` is True. If `max_features` is not None, then the vocabulary
    will be limited to the `max_features` most common words in the corpus.

    Returns a `(vectorizer, vocabulary)` pair: `vectorizer(X)` yields a dense
    array (the sparse transform result converted with `.toarray()`), and
    `vocabulary` is a numpy array of the fitted feature names.
    '''

    Vectorizer = TfidfVectorizer if use_tfidf else CountVectorizer
    tokenizer = RegexpTokenizer(r"[\w']+")
    stem = PorterStemmer().stem if use_stemmer else (lambda x: x)
    stop_set = set(stopwords.words('english'))

    # Closure over the tokenizer et al. Stop words are filtered *before*
    # stemming, so the comparison is made against the unstemmed token.
    def tokenize(text):
        tokens = tokenizer.tokenize(text)
        return [stem(token) for token in tokens if token not in stop_set]

    vectorizer_model = Vectorizer(tokenizer=tokenize, max_features=max_features)
    vectorizer_model.fit(contents)

    # `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and fall back only on older installations.
    if hasattr(vectorizer_model, 'get_feature_names_out'):
        feature_names = vectorizer_model.get_feature_names_out()
    else:
        feature_names = vectorizer_model.get_feature_names()
    # Going through a list keeps the string dtype the same on both code paths.
    vocabulary = np.array(list(feature_names))

    # Closure over the vectorizer_model's transform method.
    def vectorizer(X):
        return vectorizer_model.transform(X).toarray()

    return vectorizer, vocabulary


def softmax(v, temperature=1.0):
    '''
    A heuristic to convert arbitrary positive values into probabilities.
    See: https://en.wikipedia.org/wiki/Softmax_function

    `v` is any array-like of numbers; `temperature` divides the values before
    exponentiation (smaller temperatures sharpen the distribution). Returns an
    array of the same shape whose entries sum to 1. An empty input yields an
    empty array.
    '''
    scaled = np.asarray(v, dtype=float) / temperature
    if scaled.size == 0:
        return scaled
    # Shifting by the maximum does not change the result mathematically, but
    # it keeps np.exp from overflowing to inf (and the ratio from becoming
    # NaN) when the temperature is small, e.g. the 0.01 used elsewhere here.
    expv = np.exp(scaled - np.max(scaled))
    return expv / np.sum(expv)


def hand_label_topics(H, vocabulary, n_top_words=20):
    '''
    Print the most influential words of each latent topic, and prompt the user
    to label each topic. The user should use their humanness to figure out what
    each latent topic is capturing.

    `H` is iterated row by row (one row per topic); `vocabulary` must support
    indexing with an integer array (e.g. a numpy array of words).
    `n_top_words` is how many of the highest-weighted words are shown per
    topic. Returns the list of labels typed by the user, one per row of `H`.
    '''
    hand_labels = []
    for i, row in enumerate(H):
        # Indices of the largest weights in this row, highest first.
        top_indices = np.argsort(row)[::-1][:n_top_words]
        print('topic', i)
        print('-->', ' '.join(vocabulary[top_indices]))
        label = input('please label this topic: ')
        hand_labels.append(label)
        print()
    return hand_labels


def analyze_article(article_index, contents, web_urls, W, hand_labels):
    '''
    Score a single article against the latent topics.

    The row `W[article_index]` is turned into probabilities with `softmax`
    at temperature 0.01, and the position of the largest probability among
    the entries paired with `hand_labels` is selected (ties keep the earliest
    position; position 0 is returned if no probability exceeds zero).

    `contents` and `web_urls` are accepted but not used by the current body.

    Returns `(top_cat, probs)`: the selected topic position and the full
    probability array.
    '''
    probs = softmax(W[article_index], temperature=0.01)

    # TODO (from the original notes): assign all of these to the correct bin
    # and then tfidf them.
    best_position, best_prob = 0, 0
    for position, (prob, _label) in enumerate(zip(probs, hand_labels)):
        if prob > best_prob:
            best_position, best_prob = position, prob
    return best_position, probs


In [7]:
df1 = pd.read_csv('data/all-the-news/articles1.csv')

In [8]:
df1.columns

Index(['Unnamed: 0', 'id', 'title', 'publication', 'author', 'date', 'year',
       'month', 'url', 'content'],
      dtype='object')

In [3]:
# Load the three CSV shards from data/all-the-news/.
g_df1 = pd.read_csv('data/all-the-news/articles1.csv')
g_df2 = pd.read_csv('data/all-the-news/articles2.csv')
g_df3 = pd.read_csv('data/all-the-news/articles3.csv')

# Plain array views of each shard.
g_arr1 = g_df1.values
g_arr2 = g_df2.values
g_arr3 = g_df3.values

# DataFrame.append was removed in pandas 2.0. pd.concat with ignore_index=True
# stacks the shards and renumbers the rows 0..n-1 in one step, which is what
# the previous append + reset_index(drop=True) pair produced.
# errors='ignore' keeps the cell re-runnable if the column is already absent.
g_df_reset = pd.concat([g_df1, g_df2, g_df3], ignore_index=True).drop(
    columns='Unnamed: 0', errors='ignore'
)
# Both names refer to the same frame, as before.
g_df_full = g_df_reset

In [45]:
# Stack the three per-file arrays row-wise (axis 0 is np.concatenate's default).
joined_array = np.concatenate([g_arr1, g_arr2, g_arr3])

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood..."
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t..."
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ..."


In [89]:
# errors='ignore' makes this cell safe to re-run: without it, dropping a
# column that has already been removed raises KeyError. The drop stays
# in-place so every name bound to this frame sees the same columns.
g_df_reset.drop(columns='Unnamed: 0', inplace=True, errors='ignore')
g_df_reset.head()

Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...
1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood..."
2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t..."
4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ..."


In [95]:
g_df_reset.shape

(142570, 9)

In [47]:
joined_array[0]

array([0, 17283,
       'House Republicans Fret About Winning Their Health Care Suit - The New York Times',
       'New York Times', 'Carl Hulse', '2016-12-31', 2016.0, 12.0, nan,
       'WASHINGTON  —   Congressional Republicans have a new fear when it comes to their    health care lawsuit against the Obama administration: They might win. The incoming Trump administration could choose to no longer defend the executive branch against the suit, which challenges the administration’s authority to spend billions of dollars on health insurance subsidies for   and   Americans, handing House Republicans a big victory on    issues. But a sudden loss of the disputed subsidies could conceivably cause the health care program to implode, leaving millions of people without access to health insurance before Republicans have prepared a replacement. That could lead to chaos in the insurance market and spur a political backlash just as Republicans gain full control of the government. To stave off that 

In [44]:
g_arr2.shape

(49999, 10)

In [16]:
g_probs_for_clusters

array([0.02576601, 0.61020692, 0.05870463, 0.22437147, 0.02064656,
       0.02707373, 0.03323069])

In [21]:
# Map each cluster position to its probability, then look up position 1.
dict(enumerate(g_probs_for_clusters))[1]

SyntaxError: invalid syntax (<ipython-input-21-83b7215324b4>, line 1)

In [99]:
g_df_full.shape

(142570, 9)

'Guardian'

In [None]:
bane.update

In [4]:
##############################################################################
#fix
#g_df1 = g_df_full #pd.read_csv('data/all-the-news/articles1.csv')


reconstruction error: 358.7721134427164
topic 0
--> says like people one women time get think even way know life would first new years really us world going

topic 1
--> trump donald president republican campaign house cruz said republicans white election gop would presidential party obama nominee ryan news administration

topic 2
--> u russia said syria obama military united president russian korea north syrian government islamic state iran isis security states war

topic 3
--> percent company billion said 1 u million market companies year 2 tax new bank growth would 5 0 health investors

topic 4
--> clinton sanders hillary democratic campaign voters state presidential bernie party emails election email fbi obama candidate democrats vote nominee comey

topic 5
--> police said officers court officer told shooting man department city law authorities shot killed gun according county video arrested investigation

topic 6
--> mr ms said mrs united j new would _____ f york b party states fo

KeyboardInterrupt: 

In [11]:
# Create the client and collection handle once. The previous version built a
# new MongoClient on every iteration of the loop.
cl = MongoClient()
g_coll = cl["cluster_all"]["seven_clusters"]

for i in range(len(g_rand_articles)):
    g_clus, g_probs_for_clusters = analyze_article(i, g_contents, g_web_urls, g_W, g_hand_labels)

    data = {"_id": i, "content": str(g_contents[i]), "highest_cluster": g_clus}
    # Use a distinct loop variable here: the previous inner loop reused `i`
    # and so overwrote the article index inside the outer loop body.
    for cluster_idx, cluster_prob in enumerate(g_probs_for_clusters):
        data[str(cluster_idx)] = cluster_prob
    # NOTE(review): insert_one raises on a duplicate `_id`, so re-running this
    # cell against a populated collection will fail on the first article.
    g_coll.insert_one(data)


(142570, 10)

In [58]:
g_coll

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'cluster_all'), 'seven_clusters')

In [96]:
#g_df_full.drop(columns=['Unnamed:'])

In [18]:
g_probs_for_clusters[1]

0.0931962467000906

In [31]:
len(g_contents)

142570

In [36]:
len(g_contents)

142570

In [48]:
list('AB')

['A', 'B']

In [None]:
df 

In [None]:
def 

In [60]:
test_df = pd.read_csv('data/all-the-news/articles3.csv')

Index(['Unnamed: 0', 'id', 'title', 'publication', 'author', 'date', 'year',
       'month', 'url', 'content'],
      dtype='object')

In [62]:
test_X = g_vectorizer(test_df.content)

In [63]:
test_X.shape

(42571, 5000)

In [75]:
def merge_files(f1name, f2name, f3name):
    '''
    Print the number of lines in each of the three named files, in order.

    Despite the name, nothing is merged or returned: each file is opened,
    read fully with `readlines()`, and its line count is printed.
    '''
    for filename in (f1name, f2name, f3name):
        with open(filename) as handle:
            line_count = len(handle.readlines())
            print(line_count)
# you may also want to remove whitespace characters like `\n` at the end of each line


In [76]:
merge_files('data/all-the-news/articles1.csv','data/all-the-news/articles2.csv','data/all-the-news/articles3.csv')

50008
79477
55718


In [77]:
50008+79477+55718

185203

In [78]:
df_test = pd.DataFrame([[1,2],[3,4]])

In [81]:
df_test2 = df_test

In [84]:
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same
# row-wise stacking and keeps the original index labels.
pd.concat([df_test, df_test2])

Unnamed: 0,0,1
0,1,2
1,3,4
0,1,2
1,3,4


In [40]:
h = pd.Series('election')

In [41]:
k=g_vectorizer(h)

In [42]:
test_result=g_nmf.transform(k)

In [43]:
len(test_result[0])

5

In [44]:
test_result

array([[0.        , 0.00819077, 0.        , 0.        , 0.01120118]])

In [33]:
test_result[0]

array([0., 0., 0., 0., 0.])

In [5]:
g_W.shape

(142570, 7)

In [16]:


def train_model(heldout_name, train_data, n_components=6):
    '''
    Fit a Tf-Idf vectorizer and a scikit-learn NMF topic model on the
    `content` column of `train_data`.

    `heldout_name` is accepted for the caller's bookkeeping but is not used
    by the body. `n_components` is the number of latent topics (default 6,
    the value that was previously hard-coded).

    Side effects: seeds numpy's global RNG, prints the reconstruction error,
    and calls `hand_label_topics`, which prints each topic's top words and
    prompts for a label via `input()` (the labels are not returned).

    Returns `(vectorizer, vocabulary, nmf)`.
    '''
    contents = train_data.content

    # Build our text-to-vector vectorizer, then vectorize our corpus.
    vectorizer, vocabulary = build_text_vectorizer(contents,
                                                   use_tfidf=True,
                                                   use_stemmer=False,
                                                   max_features=5000)
    X = vectorizer(contents)

    # We'd like to see consistent results, so set the seed.
    np.random.seed(12345)

    # The previous version also built an unused index list from the global
    # `g_df_full`, which made this function fail with NameError unless that
    # notebook variable happened to exist. It has been removed.

    # `alpha=0.0` (no regularization, the default) is no longer passed: the
    # parameter was deprecated in scikit-learn 1.0 and removed in 1.2.
    nmf = NMF_sklearn(n_components=n_components, max_iter=100, random_state=12345)
    nmf.fit(X)
    H = nmf.components_
    print('reconstruction error:', nmf.reconstruction_err_)
    hand_label_topics(H, vocabulary)
    return vectorizer, vocabulary, nmf
#pickle.dump(H,open('nyt_clusters_H.p','wb'))
#pickle.dump(W,open('nyt_clusters_W.p','wb'))

def read_in_raw_data(paths=('data/all-the-news/articles1.csv',
                            'data/all-the-news/articles2.csv',
                            'data/all-the-news/articles3.csv')):
    '''
    Read the CSV files in `paths` and return them stacked into one DataFrame.

    Rows are renumbered 0..n-1 and the `Unnamed: 0` column is dropped when
    present. `paths` defaults to the three files under data/all-the-news/
    that were previously hard-coded.
    '''
    frames = [pd.read_csv(path) for path in paths]
    # DataFrame.append was removed in pandas 2.0; concat with
    # ignore_index=True replaces the old append + reset_index(drop=True).
    combined = pd.concat(frames, ignore_index=True)
    return combined.drop(columns='Unnamed: 0', errors='ignore')
    
def get_test_train(heldout, data):
    '''
    Split `data` on its `publication` column.

    Returns `(train, test)`: `train` holds the rows whose publication is not
    `heldout`, `test` holds the rows whose publication equals `heldout`.
    '''
    is_heldout = data['publication'] == heldout
    train_rows = data[~is_heldout]
    test_rows = data[is_heldout]
    return train_rows, test_rows
    
def evaluate_model(vectorizer, vocabulary, nmf, heldout_outlet, test_data, outlets):
    '''
    Unfinished stub: splits `test_data` into the rows that do not belong to
    `heldout_outlet` and the rows that do, then stops. Returns None.

    `vectorizer`, `vocabulary`, `nmf` and `outlets` are not used yet.
    '''
    # The original body referenced the undefined names `heldout` and `data`
    # and raised NameError on every call; use the parameters instead.
    # NOTE(review): assumes `test_data` is the frame to split - confirm.
    train_data, test_data = get_test_train(heldout_outlet, test_data)
    # TODO: score `nmf` / `vectorizer` on the held-out rows.
    
    
#maybe remove wapo
def kfolds():
    '''
    Leave-one-outlet-out training loop.

    Reads the full dataset with `read_in_raw_data`, then for each outlet in
    `leanings_dict` removes that outlet's rows with `get_test_train` and
    trains a model on the remainder with `train_model`.

    Returns a dict mapping each held-out outlet name to the
    `(vectorizer, vocabulary, nmf)` tuple trained without it.
    '''
    # presumably 1 and 0 encode each outlet's political leaning - TODO confirm
    # which value means what. The duplicate 'National Review' key is removed.
    leanings_dict = {'Fox News': 1, 'National Review': 1, 'New York Post': 1, 'Breitbart': 1,
                     'Buzzfeed News': 0, 'Vox': 0, 'Atlantic': 0, 'Washington Post': 0, 'CNN': 0}
    # The previous `outlets` literal had an unbalanced parenthesis (a syntax
    # error); derive the list from the dict so the two cannot drift apart.
    outlets = list(leanings_dict)

    data = read_in_raw_data()

    models = {}
    for outlet in outlets:
        # NOTE(review): the training split keeps every other publication in
        # `data`, including ones not listed in `leanings_dict` - confirm
        # whether training should be restricted to the listed outlets.
        train_data, _heldout_data = get_test_train(outlet, data)
        models[outlet] = train_model(outlet, train_data)
    return models
        
        
        
    

In [6]:
g_df_full['publication'].value_counts()

Breitbart              23781
New York Post          17493
NPR                    11992
CNN                    11488
Washington Post        11114
Reuters                10710
Guardian                8681
New York Times          7803
Atlantic                7179
Business Insider        6757
National Review         6203
Talking Points Memo     5214
Vox                     4947
Buzzfeed News           4854
Fox News                4354
Name: publication, dtype: int64

In [None]:
g_df_full[(g_df_full['publication'] != heldout)]

In [14]:
(g_df_full['publication'] != 'Buzzfeed News').value_counts()

True     137716
False      4854
Name: publication, dtype: int64