### idea is:

### given:
story ID (AOL)

### return: 
clusters (in format we discussed)


In [1]:
import spacy
parser = spacy.load('en')

In [2]:
import requests
import json

def return_articles(story_id):
    """
        for now don't use this, api isn't always working. use pickled object instead
        stories api (returns 100 articles)
    """
    url = "api.aol..."
    articles = requests.get(url)
    articles = articles.json()
    return articles["articles"]


In [3]:
#temporarily just return pickled dump that we have been using
import pickle

def return_articles_temp():
    """
        load pickle temporarily
    """
    articles = pickle.load(open("gun_control.p", "rb"))
    return articles["articles"]


In [4]:
def delete_repeating_titles(articles):
    """
        delete articles that have the same title
        could we make this into an article feature?
    """
    non_repeating = []
    titles = set()
    for art in articles:
        title = art["title"]
        if title not in titles:
            non_repeating.append(art)
            titles.add(title)
    return non_repeating

In [5]:
def parse_articles(articles, parser):
    """
        parse article content using spacy, add field in article dict called "parsed_content"
        articles: list of article jsons from aol api
        parser: spacy.load('en')
    """
    idx_count = 0
    for art in articles:
        articles[idx_count]["parsed_content"] = parser(art["content"])
        idx_count += 1
    return articles

In [6]:
def create_sentence_objects(articles, story_id):
    """
        given articles, return list of sentence objects
    """
    sent_objects = []
    sent_count = 0
    for art in articles:
        beg_sent = sent_count
        art["beg_sent"] = beg_sent
        parsed_article = art['parsed_content']
        article_id = art["id"]
        for sent in parsed_article.sents:
            sent_obj = {}
            sent_obj["spacy_sent"] = sent
            sent_obj["sent_id"] = sent_count
            sent_obj["article_id"] = article_id
            sent_obj["story_id"] = story_id
            sent_obj["feature_vector"] = None
            sent_obj["cluster_num"] = None
            sent_objects.append(sent_obj)
            sent_count += 1
        art["end_sent"] = sent_count - 1
    return sent_objects
    

In [37]:
#sentence level features

# https://stackoverflow.com/questions/12488722/counting-bigrams-pair-of-two-words-in-a-file-using-python

from sklearn.feature_extraction import DictVectorizer
from scipy.sparse import hstack
from collections import Counter
import textacy
import textacy_keyterms as keyterms

import argparse
import httplib2
from googleapiclient import discovery
from oauth2client.client import GoogleCredentials


def get_google_service():
    """
        to initialize googlecredentials
    """
    credentials = GoogleCredentials.get_application_default()
    scoped_credentials = credentials.create_scoped(
        ['https://www.googleapis.com/auth/cloud-platform'])
    http = httplib2.Http()
    scoped_credentials.authorize(http)
    return discovery.build('language', 'v1beta1', http=http)


def get_sentiment(sent):
    """
        use google cloud nlp to get sentiment
    """
    body = {'document': {
                'type': 'PLAIN_TEXT',
                'content': sent.orth_
                }
            }
    service = get_google_service()
    request = service.documents().analyzeSentiment(body=body)
    response = request.execute()
    return [response["documentSentiment"]["magnitude"], response["documentSentiment"]["polarity"]] 

def tokenize_sent(sentence, extra_stop):
    """
        tokenize spacy sent for bow features
    """
    cleaned_sent = []
    for tok in sentence["spacy_sent"]:
        if not tok.is_punct and not tok.is_stop and tok.orth_ not in extra_stop and\
           not tok.is_space:
            cleaned_sent.append(tok.lemma_)
    return cleaned_sent

def get_all_keywords(articles):
    """
        return a set of keywords in all articles
        -get keywords one articlea at a time-
    """
    keywords = set()
    for art in articles:
        keywords.update(get_article_keywords(art["parsed_content"]))        
    return keywords


def get_article_keywords(article):
    """
        sgrank alg. to get keywords from article
    """
    ranked_results = keyterms.sgrank(article)
    return [tok[0].lower() for tok in ranked_results]


def make_keyword_dict(sent, keywords):
    """
        lemmatize the sentence so keywords that have changed tense can be found
        add keywords to sentence object
    """
    kwd_dict = {}
    sent["keywords"] = {}
    for kwd in keywords:
        if kwd in ' '.join([tok.lemma_ for tok in sent["spacy_sent"]]).lower():
            if kwd not in kwd_dict:
                kwd_dict[kwd] = 1
                sent["keywords"][kwd] = 1
            else:
                kwd_dict[kwd] += 1
                sent["keywords"][kwd] += 1
    return kwd_dict
        
    
def add_sentence_level_features(sentence_objects, articles):
    """
        add feature vector to sentence object. features:
        - bow
        - keywords
    """
    extra_stop = ["n't", "'s", "'m"]
    vect_bow = DictVectorizer()
    
    #BOW features
    X_bow = vect_bow.fit_transform(Counter(tokenize_sent(sent, extra_stop)) for sent in sentence_objects)
    
    #keyword features
    vect_keywords = DictVectorizer()
    keywords = get_all_keywords(articles)
    X_keyword = vect_keywords.fit_transform(make_keyword_dict(sent, keywords) for sent in sentence_objects)
    
    #sentence sentiment feature (<magnitude>, <polarity>)
    X_sentiment = [get_sentiment(sent["spacy_sent"]) for sent in sentence_objects]
    
    #concatenate
    vects = hstack([X_bow, X_keyword, X_sentiment])
    
    #add vector to sentence_object
    num = 0
    for vect in vects.toarray():
        sentence_objects[num]["feature_vector"] = vect
        num += 1
    return vects
    
  

In [38]:
#cluster sentence objects after dimensionality reduction
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances
from collections import defaultdict


def dimensionality_reduction(X, r):
    """
        reduce dimensions using SVD
    """
    U, s, V = np.linalg.svd(X.toarray())
    return rank_r_approx(U, s, r)

def rank_r_approx(U, s, r):
    """
        return dimensionally reduced X, with rank r
    """
    sigma = np.zeros((r, r), float)
    np.fill_diagonal(sigma, s[:r])
    return np.dot(U[:, :r], sigma)

def merge_kwd_counts(keywords):
    """
        given list of dicts with keywords, merges them into one dict
    """
    kwds_aggregation = defaultdict(list)
    for d in keywords:
        for key, value in d.iteritems():
            if key in kwds_aggregation:
                kwds_aggregation[key] += value
            else:
                kwds_aggregation[key] = value
    return kwds_aggregation

def cluster_sentence_vectors(sentence_objects, X, r=100, reduce_dim=True, N_CLUSTERS=5):
    """
        given vector results and number of clusters return cluster objects
        reduce_dim is True on default
    """
    kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=0)
        
    if reduce_dim:
        X_reduced = dimensionality_reduction(X, r)
        cluster_assignments = kmeans.fit_predict(X_reduced)
        centroids = kmeans.cluster_centers_
        cluster_dict = {x : {"vector": centroids[x], "sentences": [], "reduced": True}\
                             for x in xrange(len(centroids))}
    else:
        cluster_assignments = kmeans.fit_predict(X)
        centroids = kmeans.cluster_centers_
        cluster_dict = {x : {"vector": centroids[x], "sentences": [], "reduced": False}\
                             for x in xrange(len(centroids))}
 
    temp_cluster_keywords = {x: {} for x in xrange(len(centroids))}
    count = 0
    for sent in sentence_objects:
        cluster_num = cluster_assignments[count]
        sent["cluster_num"] = cluster_num
        if reduce_dim:
            reduced_vect = X_reduced[count]
            sent["reduced_feature_vector"] = reduced_vect
            dist_to_centroid = euclidean_distances(centroids[cluster_num], sent["reduced_feature_vector"])[0][0]
        else:
            dist_to_centroid = euclidean_distances(centroids[cluster_num], sent["feature_vector"])[0][0]
            
        sent["dist_to_centroid"] = dist_to_centroid
        #add to cluster object
        cluster_dict[cluster_num]["sentences"].append({sent["sent_id"]: dist_to_centroid })
        # merge keyword dictionaries together
        temp_cluster_keywords[cluster_num] = merge_kwd_counts([temp_cluster_keywords[cluster_num], sent["keywords"]]) 
        count += 1
    
    NUM_KEYWORDS = 10
    for cluster_num in temp_cluster_keywords:
        clustered = sorted(temp_cluster_keywords[cluster_num].items(), key=lambda x: x[1])[0:NUM_KEYWORDS]
        cluster_dict[cluster_num]["keywords"] = [x[0] for x in clustered]
        
    return cluster_dict

#### testing things
- try to do everything using spacy

In [39]:
articles = return_articles_temp()
articles = delete_repeating_titles(articles)
articles = parse_articles(articles, parser)

In [40]:
#create sentence objects
sentence_objects = create_sentence_objects(articles, "###")

In [44]:
X_temp = add_sentence_level_features(sentence_objects[0:10], articles)

In [None]:
X = add_sentence_level_features(sentence_objects, articles)

In [48]:
X_temp.toarray()

array([[ 0. ,  0. ,  0. , ...,  1. ,  0. ,  1. ],
       [ 0. ,  0. ,  0. , ...,  0. ,  0.8,  1. ],
       [ 0. ,  0. ,  0. , ...,  0. ,  0.2, -1. ],
       ..., 
       [ 0. ,  0. ,  0. , ...,  0. ,  0.8, -1. ],
       [ 0. ,  0. ,  0. , ...,  0. ,  0.7, -1. ],
       [ 0. ,  1. ,  0. , ...,  0. ,  0. ,  1. ]])

In [65]:
cluster_dict = cluster_sentence_vectors(sentence_objects, X)



In [69]:
for num in cluster_dict:
    print cluster_dict[num]["keywords"]

[u'illegal alien', u'shooting tragedy', u'umbrella body', u'legal authority', u'specific threshold', u'private citizen', u'vice president joe', u'bold agenda', u'recent year available', u'key point']
[u'licensed dealer', u'remaining loophole', u'online sale forum', u'brief break', u'presidential candidate', u'thing their own rhetoric', u'gun paraphernalia', u'executive action fact', u'everybody \u2019s gun', u'legal authority']
[u'office', u'certain social security administration data', u'existing law', u'safety law', u'mental health funding\u2013all', u'republican congressional colleague', u'mental health program', u'market \u2019', u'private handgun sale', u'amendment right']
[u'shop', u'licensed dealer', u'major executive', u'general loretta', u'focus', u'washington examiner', u'everybody \u2019s gun', u'carolina \u2019s delegate', u'commonsense step', u'\u2019s new proposal']
[u'shop', u'licensed dealer', u'\u2019s speech', u'tough gun measure', u'dangerous level', u'new federal', 

In [84]:
sentence_objects[10]["spacy_sent"].orth_

u'And Obama is directing federal agencies to research smart gun technology to reduce accidental shootings and asking Congress for $500 million for mental health care.'

In [72]:
#keywords
# cluster_dict[0].keys()

In [71]:
# cluster_dict[0]["sentences"]

In [None]:
#function to average articles within sentence

In [70]:
# len(cluster_dict[4]["sentences"])

#### other/alternative things

In [None]:
#the clustering takes forever
from sklearn.cluster import AgglomerativeClustering
N_CLUSTERS = 5
model = AgglomerativeClustering(n_clusters=N_CLUSTERS,
                                linkage="average", affinity="cosine")
model.fit(X.toarray())

In [None]:
#if you don't like spacy sent detector, you can always use this
import nltk.data
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
sentences_nltk = sent_detector.tokenize(articles[0]["content"].strip())

In [None]:
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

#to strip unicode of punctuation:
import unicodedata
import sys
tbl = {}
for i in xrange(sys.maxunicode):
    char = unichr(i)
    if unicodedata.category(char).startswith('P'):
        tbl[i] = None

In [None]:
from nltk.corpus import stopwords
nltk_stopwords = stopwords.words('english')

def clean_text(sent, stop_words=True):
    """
    takes: sentence
    returns: list of words
        removes punctuation from unicode sentence, lowercase and tokenizes.
        #     article_text = article_text.encode('ascii', errors = 'ignore')
    """
    sent = sent.lower()
    tokenized = []
    for word in sent.split():
        word = word.translate(tbl)
        if wnl.lemmatize(word):
            if stop_words:
                if word not in nltk_stopwords:
                    tokenized.append(wnl.lemmatize(word))   
    return tokenized