# In [4]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from sklearn.cross_validation import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF as NMF_sklearn
import pickle
from nmf import NMF
import pymongo
from pymongo import MongoClient
import pickle
def build_text_vectorizer(contents, use_tfidf=True, use_stemmer=False, max_features=None):
    '''
    Build and return a **callable** for transforming text documents to vectors,
    as well as a vocabulary to map document-vector indices to words from the
    corpus.

    Parameters:
      contents     -- iterable of text documents the vectorizer is fitted on.
      use_tfidf    -- if True use a TfidfVectorizer, otherwise a
                      CountVectorizer (Bag-of-Words).
      use_stemmer  -- if True, every kept token is stemmed with the Porter
                      stemmer; otherwise tokens are used unchanged.
      max_features -- passed straight to the vectorizer; when not None it
                      limits the vocabulary to that many most common words.

    Returns a tuple `(vectorizer, vocabulary)`:
      vectorizer -- function taking an iterable of documents and returning a
                    dense 2-D numpy array (one row per document).
      vocabulary -- numpy array of feature names; position i is the word
                    behind column i of the vectors.
    '''
    Vectorizer = TfidfVectorizer if use_tfidf else CountVectorizer
    tokenizer = RegexpTokenizer(r"[\w']+")
    stem = PorterStemmer().stem if use_stemmer else (lambda x: x)
    stop_set = set(stopwords.words('english'))

    # Closure over the tokenizer et al. Stop words are dropped *before*
    # stemming, so the comparison is made against the unstemmed token.
    def tokenize(text):
        return [stem(token)
                for token in tokenizer.tokenize(text)
                if token not in stop_set]

    vectorizer_model = Vectorizer(tokenizer=tokenize, max_features=max_features)
    vectorizer_model.fit(contents)

    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); support both so the function works on either
    # side of that change.
    if hasattr(vectorizer_model, 'get_feature_names_out'):
        feature_names = vectorizer_model.get_feature_names_out()
    else:
        feature_names = vectorizer_model.get_feature_names()
    # Going through list() yields a string-dtype array whichever accessor
    # was used.
    vocabulary = np.array(list(feature_names))

    # Closure over the vectorizer_model's transform method.
    def vectorizer(X):
        return vectorizer_model.transform(X).toarray()

    return vectorizer, vocabulary


def softmax(v, temperature=1.0):
    '''
    A heuristic to convert arbitrary values into probabilities.
    See: https://en.wikipedia.org/wiki/Softmax_function

    Parameters:
      v           -- array-like of scores (a numpy array or a plain sequence).
      temperature -- divisor applied to the scores before exponentiation;
                     values below 1 sharpen the distribution, values above 1
                     flatten it.

    Returns a numpy array of the same shape as `v` whose entries sum to 1.
    An empty input yields an empty array.
    '''
    scaled = np.asarray(v) / temperature
    if scaled.size == 0:
        # Nothing to normalise (and np.max would raise on an empty array).
        return scaled
    # Subtracting the maximum leaves the result mathematically unchanged but
    # keeps np.exp from overflowing to inf (which would turn the output into
    # NaNs) when scores are large or the temperature is small.
    expv = np.exp(scaled - np.max(scaled))
    return expv / np.sum(expv)


def hand_label_topics(H, vocabulary, n_top_words=20, ask=None):
    '''
    Print the most influential words of each latent topic, and prompt the user
    to label each topic. The user should use their humanness to figure out what
    each latent topic is capturing.

    Parameters:
      H           -- 2-D array-like with one row of word weights per topic.
      vocabulary  -- numpy array of words; index j is the word whose weight
                     is stored in column j of `H`.
      n_top_words -- how many of the highest-weighted words to show per topic.
      ask         -- callable taking the prompt string and returning the
                     label; defaults to the builtin `input`. Supplying a
                     different callable allows non-interactive use.

    Returns the list of labels, one per row of `H`, in row order.
    '''
    if ask is None:
        ask = input

    hand_labels = []
    for i, row in enumerate(H):
        # argsort is ascending, so reverse it to get the heaviest words first.
        top_words = np.argsort(row)[::-1][:n_top_words]
        print('topic', i)
        print('-->', ' '.join(vocabulary[top_words]))
        hand_labels.append(ask('please label this topic: '))
        print()
    return hand_labels


def analyze_article(article_index, contents, web_urls, W, hand_labels):
    '''
    Pick the dominant topic of a single article.

    Row `article_index` of `W` is turned into probabilities with `softmax`
    (temperature 0.01). The probabilities are then walked in lockstep with
    `hand_labels`, and the position of the first strictly largest one is
    kept (0 if none is greater than zero).

    `contents` and `web_urls` are accepted but not used; the printing of the
    article and of the per-topic percentages is currently disabled.

    Returns a tuple `(top_cat, probs)`: the index of the winning topic and
    the full probability vector.
    '''
    probs = softmax(W[article_index], temperature=0.01)

    # TODO (from the original author): assign all of these to the correct
    # bin and then tf-idf them.
    top_cat = 0
    top_prob = 0
    for position, (prob, _label) in enumerate(zip(probs, hand_labels)):
        if prob > top_prob:
            top_prob, top_cat = prob, position
    return top_cat, probs


# In [9]:


def build_nmf(heldout_name, train_data):
    '''
    Fit a Tf-Idf vectorizer and a 6-topic scikit-learn NMF model on the
    training articles.

    Parameters:
      heldout_name -- name of the outlet held out of `train_data`. Currently
                      unused; kept for the interface (it was meant to name
                      cache files, but that caching is disabled).
      train_data   -- object with a `content` attribute holding the article
                      texts (a DataFrame with a `content` column in this
                      file's callers).

    Prints the NMF reconstruction error.

    Returns a tuple `(vectorizer, vocabulary, nmf)`: the text-to-vector
    callable and vocabulary from `build_text_vectorizer`, and the fitted NMF
    model.
    '''
    contents = train_data.content

    # Build our text-to-vector vectorizer, then vectorize our corpus.
    vectorizer, vocabulary = build_text_vectorizer(contents,
                                                   use_tfidf=True,
                                                   use_stemmer=False,
                                                   max_features=5000)
    X = vectorizer(contents)

    # We'd like to see consistent results, so set the seed. The NMF model has
    # its own random_state; the global seed is kept because it also affects
    # any later code drawing from numpy's global RNG.
    np.random.seed(12345)

    # `alpha=0.0` used to be passed here; it was the default, and the
    # parameter was removed in scikit-learn 1.2, so it is simply omitted.
    nmf_model = NMF_sklearn(n_components=6, max_iter=100, random_state=12345)
    nmf_model.fit(X)
    print('reconstruction error:', nmf_model.reconstruction_err_)

    # NOTE(review): pickling the results under 'cache/<heldout_name> ...' was
    # sketched but left disabled; the vectorizer is a nested function, which
    # pickle presumably cannot serialise as-is.
    return vectorizer, vocabulary, nmf_model
#pickle.dump(H,open('nyt_clusters_H.p','wb'))
#pickle.dump(W,open('nyt_clusters_W.p','wb'))

def train_random_forest(cleaned_nmf_compenents, raw_training_data, cleaned_train_y,trees_in_forest = 100):
    '''
    Fit a random forest classifier on the given features and labels.

    Parameters:
      cleaned_nmf_compenents -- feature matrix used as the training input.
      raw_training_data      -- accepted but not used.
      cleaned_train_y        -- target labels, one per training row.
      trees_in_forest        -- number of trees (`n_estimators`).

    Prints the number of trees, then returns the fitted
    RandomForestClassifier (created with `oob_score=True`).
    '''
    print('trees in this forest: ', trees_in_forest)
    forest = RandomForestClassifier(n_estimators=trees_in_forest, oob_score=True)
    forest.fit(cleaned_nmf_compenents, cleaned_train_y)
    return forest

def read_in_raw_data():
    '''
    Load the three "all-the-news" CSV files and return them as one DataFrame.

    The files are read from `data/all-the-news/articles{1,2,3}.csv` (relative
    to the current working directory), stacked in that order with a fresh
    0..n-1 index, and the `Unnamed: 0` column is dropped.

    Raises whatever `pandas.read_csv` raises for a missing or unreadable
    file, and KeyError if the `Unnamed: 0` column is absent.
    '''
    csv_paths = (
        'data/all-the-news/articles1.csv',
        'data/all-the-news/articles2.csv',
        'data/all-the-news/articles3.csv',
    )
    frames = [pd.read_csv(path) for path in csv_paths]

    # DataFrame.append was removed in pandas 2.0; concat does the same job
    # and ignore_index=True replaces the separate reset_index(drop=True).
    combined = pd.concat(frames, ignore_index=True)
    return combined.drop('Unnamed: 0', axis=1)
    
def get_test_train(heldout,data):
    '''
    Split `data` by publication into a training and a held-out part.

    Returns a tuple `(train, test)`: `train` holds the rows whose
    `publication` differs from `heldout`, `test` the rows whose
    `publication` equals it. Original row indices are preserved.
    '''
    is_heldout = data['publication'] == heldout
    train_rows = data[~is_heldout]
    heldout_rows = data[is_heldout]
    return train_rows, heldout_rows
    
def predictions():
    '''
    Placeholder with no behaviour yet; always returns None.
    '''
    return None

    
def evaluate_model(vectorizer, vocabulary, nmf, heldout_outlet, test_data,outlets):
    '''
    Unfinished evaluation helper: it only splits a dataset by publication
    and returns None.

    None of the parameters is read; `test_data` is merely rebound to the
    held-out part of the split.

    NOTE(review): `heldout` and `data` are neither parameters nor locals, so
    they are looked up as module-level names and the call raises NameError
    unless such globals exist (they may in the original notebook session).
    Presumably `heldout_outlet` and a dataset argument were intended --
    confirm before relying on this function.
    '''
    train_data, test_data = get_test_train(heldout,data)
    
    
#maybe remove wapo
def kfolds():
    '''
    Leave-one-outlet-out loop: for every outlet, hold its articles out and
    fit the vectorizer + NMF model on the remaining articles.

    The raw data is loaded once with `read_in_raw_data`. The fitted models
    are currently discarded -- no classifier is trained or scored here --
    so the only visible effect is what `build_nmf` prints. Returns None.
    '''
    # Each outlet appears exactly once, so every fold is fitted once
    # ('National Review' used to be listed twice, repeating that fold).
    outlets = ['Fox News', 'National Review', 'New York Post', 'Breitbart',
               'Buzzfeed News', 'Vox', 'Atlantic', 'Washington Post', 'CNN']

    data = read_in_raw_data()

    for heldout_outlet in outlets:
        train_data, _heldout_rows = get_test_train(heldout_outlet, data)
        # model specific below
        build_nmf(heldout_outlet, train_data)
        
        
        
def remove_weak_outlets(valid_outlets,leanings,train_set,labels):
    '''
    Keep only the training rows whose outlet is in `valid_outlets`.

    Parameters:
      valid_outlets -- container of outlet names to keep.
      leanings      -- mapping from outlet name to its leaning label.
      train_set     -- sequence of feature rows, aligned with `labels`.
      labels        -- sequence of outlet names, one per row of `train_set`.

    `train_set` and `labels` must have the same length (checked with an
    assert). A kept outlet that is missing from `leanings` raises KeyError.

    Returns a tuple `(rows, outlets, leaning_labels)`: a DataFrame of the
    kept feature rows, a Series of their outlet names and a Series of the
    corresponding leanings, all in the original order.
    '''
    assert(len(train_set)==len(labels))

    kept = [(outlet, features)
            for outlet, features in zip(labels, train_set)
            if outlet in valid_outlets]
    kept_rows = [features for _outlet, features in kept]
    kept_outlets = [outlet for outlet, _features in kept]
    kept_leanings = [leanings[outlet] for outlet in kept_outlets]

    return pd.DataFrame(kept_rows), pd.Series(kept_outlets), pd.Series(kept_leanings)
    #for label in labels:
       # if label in valid_outlets

        
def test():
    '''
    Leave-one-outlet-out evaluation of the NMF + random forest pipeline.

    For every outlet in turn: hold its articles out, fit the vectorizer and
    NMF model on the rest, train a random forest on the NMF components of
    the remaining known outlets, and print the forest's accuracy on the
    held-out articles (all of which carry that outlet's leaning as label).
    Outlets with no articles in the data are skipped. Returns None.
    '''
    outlets = ['Fox News', 'National Review', 'New York Post', 'Breitbart',
               'Buzzfeed News', 'Vox', 'Atlantic', 'Washington Post', 'CNN']
    leanings_dict = {'Fox News': 1, 'National Review': 1, 'New York Post': 1, 'Breitbart': 1,
                     'Buzzfeed News': 0, 'Vox': 0, 'Atlantic': 0, 'Washington Post': 0, 'CNN': 0}

    data = read_in_raw_data()
    for heldout_outlet in outlets:
        train_data, test_data = get_test_train(heldout_outlet, data)
        if len(test_data) == 0:
            # Nothing to score against; avoids fitting a model for nothing.
            print('No articles for: ', heldout_outlet)
            continue

        # model specific below
        vectorizer, vocabulary, nmf = build_nmf(heldout_outlet, train_data)
        components = nmf.transform(vectorizer(train_data['content']))
        cleaned_train_data, cleaned_train_df, cleaned_train_y = remove_weak_outlets(
            outlets, leanings_dict, components, train_data['publication'])

        # The classifier has to be trained before it can be scored; this
        # step was missing, leaving `rand_forest` undefined.
        rand_forest = train_random_forest(cleaned_train_data, train_data, cleaned_train_y)

        test_X = nmf.transform(vectorizer(test_data['content']))
        # Every held-out row belongs to `heldout_outlet`, so they all share
        # that outlet's leaning.
        pub_type = leanings_dict[heldout_outlet]
        test_y = pd.Series([pub_type] * len(test_data))
        print('Score for: ', heldout_outlet)
        print(rand_forest.score(test_X, test_y))
    
def test_grid_search(trees=(100,)):
    '''
    Leave-one-outlet-out evaluation, repeated for several forest sizes.

    Parameters:
      trees -- iterable of tree counts to try for each held-out outlet;
               defaults to the single value 100.

    For every outlet in turn: hold its articles out, fit the vectorizer and
    NMF model on the rest, then for each tree count train a random forest on
    the NMF components of the remaining known outlets and print its accuracy
    on the held-out articles. Outlets with no articles in the data are
    skipped. Returns None.
    '''
    outlets = ['Fox News', 'National Review', 'New York Post', 'Breitbart',
               'Buzzfeed News', 'Vox', 'Atlantic', 'Washington Post', 'CNN']
    leanings_dict = {'Fox News': 1, 'National Review': 1, 'New York Post': 1, 'Breitbart': 1,
                     'Buzzfeed News': 0, 'Vox': 0, 'Atlantic': 0, 'Washington Post': 0, 'CNN': 0}

    data = read_in_raw_data()
    for heldout_outlet in outlets:
        train_data, test_data = get_test_train(heldout_outlet, data)
        if len(test_data) == 0:
            # Nothing to score against; avoids fitting a model for nothing.
            print('No articles for: ', heldout_outlet)
            continue

        # model specific below
        vectorizer, vocabulary, nmf = build_nmf(heldout_outlet, train_data)
        components = nmf.transform(vectorizer(train_data['content']))
        cleaned_train_data, cleaned_train_df, cleaned_train_y = remove_weak_outlets(
            outlets, leanings_dict, components, train_data['publication'])

        test_X = nmf.transform(vectorizer(test_data['content']))
        # Every held-out row belongs to `heldout_outlet`, so they all share
        # that outlet's leaning. The label is used as-is: negating it (as was
        # done before) scores against the opposite class.
        pub_type = leanings_dict[heldout_outlet]
        test_y = pd.Series([pub_type] * len(test_data))

        for tree_count in trees:
            # The raw training frame is passed explicitly; the bare name `_`
            # is only defined inside interactive shells.
            rand_forest = train_random_forest(cleaned_train_data, train_data, cleaned_train_y,
                                              trees_in_forest=tree_count)
            print('Score for: ', heldout_outlet, 'with ',tree_count, ' trees')

            print(rand_forest.score(test_X, test_y))
    
    


