In [166]:
import collections
import json
import pathlib
from pprint import pprint
import string

import gensim as gs
import numpy as np
import pandas as pd

from nltk import tokenize
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
def expand_contractions(text):
    """Expand common English contractions in ``text``.

    Rules, applied in this order and matched case-insensitively:

    * ``can't``      -> ``cannot``
    * ``won't``      -> ``will not``
    * ``(...)n't``   -> ``(...) not``    (shouldn't, hasn't, wouldn't)
    * ``(...)'d``    -> ``(...) would``  (I'd, he'd, we'd)
    * ``(...)'ve``   -> ``(...) have``   (should've)

    Both the straight (') and the typographic (\u2019) apostrophe are recognised.

    Known limitations: ``'d`` is always read as "would" (never "had"), and
    irregular forms other than ``can't``/``won't`` (e.g. ``ain't``, ``shan't``)
    fall through to the generic ``n't`` rule.

    Parameters
    ----------
    text : str
        Text to process. Empty or ``None`` input is returned unchanged.

    Returns
    -------
    str
        ``text`` with the contractions above expanded.
    """
    import re  # function-scope import: ``re`` is not in the notebook's import cell

    if not text:
        return text

    apostrophe = "['\u2019]"
    # Irregular forms come first so the generic n't rule cannot mangle them
    # (it would turn "can't" into "ca not" and "won't" into "wo not").
    # The captured first letter preserves the original capitalisation.
    rules = (
        (rf"\b(c)an{apostrophe}t\b", r"\1annot"),
        (rf"\b(w)on{apostrophe}t\b", r"\1ill not"),
        (rf"n{apostrophe}t\b", " not"),
        (rf"{apostrophe}d\b", " would"),
        (rf"{apostrophe}ve\b", " have"),
    )
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
    return text

In [3]:
def summarize(text, top, *, vectorizer=TfidfVectorizer, stop_words=None, metric=cosine_similarity):
    """Return the ``top`` sentences of ``text`` that score highest against the whole text.

    The text is split into sentences with ``tokenize.sent_tokenize``. The
    sentences plus the full text are vectorized together, and every sentence
    vector is scored against the full-text vector with ``metric``.

    Parameters
    ----------
    text : str
        Document to summarize.
    top : int or None
        Upper bound on the number of sentences returned (used as a slice bound).
    vectorizer : callable, keyword-only
        Called as ``vectorizer(stop_words=stop_words)``; the result must offer
        ``fit_transform(list_of_str)`` and return something row-indexable.
    stop_words : keyword-only
        Forwarded unchanged to ``vectorizer``.
    metric : callable, keyword-only
        Called as ``metric(sentence_vectors, text_vector)``; expected to yield
        one score per sentence.

    Returns
    -------
    list of str
        Sentences ordered from highest to lowest score (ties fall back to
        descending sentence text). Empty when ``text`` contains no sentences.
    """
    sentences = tokenize.sent_tokenize(text)
    if not sentences:
        # Nothing to rank; without this guard the vectorizer is asked to fit
        # a corpus consisting of one empty document.
        return []

    # The full text is appended as the last row so it shares the sentences' vocabulary.
    vectors = vectorizer(stop_words=stop_words).fit_transform(sentences + [text])
    raw_scores = metric(vectors[:-1], vectors[-1])

    # Flatten to plain floats: sorting tuples that hold numpy arrays only works
    # by accident when every score row has exactly one element.
    scores = np.asarray(raw_scores, dtype=float).ravel().tolist()

    ranked = sorted(zip(scores, sentences), reverse=True)
    return [sentence for _, sentence in ranked[:top]]

In [18]:
def id2token(dctnry, bow):
    """Swap the integer ids in a bag-of-words for their token strings.

    Parameters
    ----------
    dctnry : object exposing a ``token2id`` mapping (token -> id)
    bow : iterable of 2-item pairs whose first item is a token id

    Returns
    -------
    list of tuple
        ``(token, second_item)`` for every pair in ``bow``, in the same order.

    Raises
    ------
    KeyError
        If an id in ``bow`` is not present in ``dctnry.token2id``.
    """
    # Invert token -> id into id -> token once, up front.
    tokens_by_id = {}
    for token, token_id in dctnry.token2id.items():
        tokens_by_id[token_id] = token

    labelled = []
    for token_id, value in bow:
        labelled.append((tokens_by_id[token_id], value))
    return labelled

In [4]:
# Load the article dump for one year from <cwd>/data/jsons/<year>/<year>.json.
year = str(2010)
jsons = pathlib.Path.cwd() / 'data' / 'jsons'
json_year = jsons / year / f'{year}.json'

# Name the encoding explicitly: without it, open() falls back to the
# platform's locale encoding, which can mis-decode non-ASCII article text.
with open(json_year, encoding='utf-8') as fp:
    # The file's top-level object is keyed by the year string.
    dict_year = json.load(fp)[year]

# Work with the first article of that year.
article = dict_year[0]

In [5]:
# Pull the article body and its reference summary (surrounding whitespace
# trimmed from the summary), then display the summary as the cell output.
story, summary = article['story'], article['summary'].strip()
summary

"Life begins to return to normal in Kyrgyzstan after two days of violent protests led the country's president to flee the capital, Bishkek. But President Kurmanbek Bakiyev has not resigned, and the self-declared interim government warns there could be more violence."

In [187]:
# Word-level tokenizer and the English stop-word set used by the filter below.
tokenizer = tokenize.word_tokenize
stop_words = {*stopwords.words('english')}
# Punctuation is deliberately not added to the stop words:
# stop_words.update(string.punctuation)

text = story

def seive(word):
    """Tell whether ``word`` should be kept as a token.

    A word is rejected when it is in the module-level ``stop_words`` set,
    when it is two characters or shorter, or when it begins with an
    apostrophe; otherwise it is kept.

    Returns
    -------
    bool
    """
    if word in stop_words:
        return False
    if len(word) <= 2:
        return False
    return not word.startswith("'")

# Summary: lower-case, split into sentences, then keep only the words that
# pass `seive` -- one token list per sentence.
summary_sentences = tokenize.sent_tokenize(summary.lower())
summary_tokens = [list(filter(seive, tokenizer(sentence)))
                  for sentence in summary_sentences]

# Story: lower-case, split on blank lines into paragraphs, then filter the
# same way -- one token list per paragraph.
story_paragraphs = story.lower().split('\n\n')
story_tokens = [list(filter(seive, tokenizer(paragraph)))
                for paragraph in story_paragraphs]

# NOTE: the TF-IDF model only gives useful weights when the corpus has more
# than one document, which is why the story is kept as separate paragraphs.
# (Presumably the idf term collapses when every word occurs in the only
# document -- TODO confirm against the gensim TfidfModel docs.)
# The single-document variant that was tried and abandoned:
# story_paragraphs = story.lower().replace('\n\n', ' ')
# story_tokens = [[word for word in tokenizer(story_paragraphs) if word not in stop_words]]

# Vocabulary comes from the story only; both story paragraphs and summary
# sentences are then encoded as bags of words against that vocabulary.
story_dctnry = gs.corpora.Dictionary(story_tokens)
corpus = list(map(story_dctnry.doc2bow, story_tokens))
summary_bows = list(map(story_dctnry.doc2bow, summary_tokens))

# Fit TF-IDF on the story paragraphs and apply it to both collections.
tfidf = gs.models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
summary_tfidf = tfidf[summary_bows]

In [188]:
# Initialize a two-topic LSI transformation on the TF-IDF-weighted story
# paragraphs, then project those same paragraphs through it.
lsi = gs.models.LsiModel(
    corpus=corpus_tfidf,
    id2word=story_dctnry,
    num_topics=2,
)
corpus_lsi = lsi[corpus_tfidf]

In [189]:
# Show each fitted topic as a (topic id, weighted-term expression) pair.
lsi.print_topics()

[(0,
  '0.236*"said" + 0.213*"feel" + 0.187*"trying" + 0.165*"bishkek" + 0.165*"beginning" + 0.150*"kerimbayeva" + 0.149*"president" + 0.149*"saryganbayev" + 0.146*"worst" + 0.146*"optimistic"'),
 (1,
  '-0.285*"feel" + 0.222*"among" + 0.222*"friday" + 0.222*"main" + 0.220*"square" + -0.203*"trying" + -0.196*"said" + -0.196*"confident" + -0.196*"calm" + -0.179*"optimistic"')]

In [175]:
# Inspect the first summary sentence's bag of words with ids replaced by tokens.
id2token(story_dctnry, summary_bows[0])

[('bishkek', 1),
 ('capital', 1),
 ('country', 1),
 ('days', 1),
 ('flee', 1),
 ('kyrgyzstan', 1),
 ('led', 1),
 ('life', 1),
 ('normal', 1),
 ('president', 1),
 ('protests', 1),
 ('return', 1),
 ('two', 1),
 ('violent', 1)]

In [176]:
# Inspect the second summary sentence's bag of words with ids replaced by tokens.
id2token(story_dctnry, summary_bows[1])

[('president', 1),
 ('bakiyev', 1),
 ('could', 1),
 ('government', 1),
 ('interim', 1),
 ('kurmanbek', 1),
 ('self-declared', 1),
 ('violence', 1),
 ('warns', 1)]