## Imports

In [1]:
import nltk
import Liu

In [2]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer # For sentiment analysis
import pickle # For loaded dataset from pickle file
import tqdm # Progress bar
from collections import Counter # Handy addon
from pprint import pprint # Useful to print JSON objects
import numpy as np

In [3]:
# This loads the file that you want, might take several seconds (up to a minute)

with open("news_sentiment.pickle", "rb") as f:
    articles = pickle.load(f, encoding='bytes')
print(len(articles), "articles were loaded")
print("Example article:")
pprint(articles[1040])


57767 articles were loaded
Example article:
{b'news_topic': b'ISIS War',
 'introductions': [{'person': 'Bashar al-Assad',
                    'text': 'President',
                    'wdid': 'Q44329'},
                   {'person': 'Emile Hokayem', 'text': 'in Foreign Policy'},
                   {'person': 'Ahrar al Sham',
                    'text': 'the most important groups',
                    'wdid': 'Q860943'},
                   {'person': 'Vladimir Putin',
                    'text': 'Russian President',
                    'wdid': 'Q7747'},
                   {'person': 'Barack Obama',
                    'text': 'U.S. President',
                    'wdid': 'Q76'},
                   {'person': 'Osama Abu Zeid',
                    'text': 'a senior adviser to the moderate Free Syrian '
                            'Army'},
                   {'person': 'Op-Ed',
                    'text': 'for The Washington Post',
                    'wdid': 'Q2602337'},
                  

In [4]:
# separate articles from the two stories
ISIS_articles = []
Brexit_articles = []
for a in articles:
    if a[b"news_topic"] == b'ISIS War':
        ISIS_articles.append(a)
    else:
        Brexit_articles.append(a)
        
print(len(ISIS_articles), " articles from ISIS War and ", len(Brexit_articles), "articles from Brexit were loaded")

39206  articles from ISIS War and  18561 articles from Brexit were loaded


In [5]:
# get only articles from one story, you can change this
articles = ISIS_articles

## Extract introductions, and obtain their sentiment

In [6]:
analyzer = SentimentIntensityAnalyzer()

total_introductions = []
for a in articles:
    for intro in a.get('introductions', []):
        intro['source'] = a['source']
        total_introductions.append(intro)

for intro in tqdm.tqdm_notebook(total_introductions):
    intro['sentiment'] = Liu.demo_liu_hu_lexicon(intro['text'])

A Jupyter Widget




In [16]:
len(total_introductions)

214880

In [22]:
import pickle

with open('introductions_liu', 'wb') as fp:
    pickle.dump(total_introductions, fp)

In [7]:
# Example some sentiment for some of the introductions

subsample = np.random.choice(total_introductions, 100)
for intro in subsample:

    if intro['sentiment'] != 0:
        print("---------------")
        print("Entity mentionned:", intro['person'])
        print(intro['text'])
        print("Sentiment:", intro['sentiment'])

---------------
Entity mentionned: Adolf Hitler
World War II German dictator
Sentiment: -1
---------------
Entity mentionned: Mazraat al-Qubair
which would be the fourth such mass killing of civilians in Syria in the last two weeks
Sentiment: -1
---------------
Entity mentionned: Fethullah Gulen
his arch enemy
Sentiment: -1
---------------
Entity mentionned: Marie Colvin
fearless , committed , essential
Sentiment: 1
---------------
Entity mentionned: Jassim al-Assadi
who lost two friends in the bombing
Sentiment: -1
---------------
Entity mentionned: Jay Garner
former Assistant Vice Chief of Staff of the Army
Sentiment: -1
---------------
Entity mentionned: Kerry
who like Hagel
Sentiment: 1
---------------
Entity mentionned: Anthony Shadid
Less than a week ago , a New York Times correspondent , , , apparently from an asthma attack , while on a clandestine trip inside northern Syria .
Sentiment: -1


## Build a 2-dimensional object containing sentiment per entity, per source

In [8]:
ent_source_sent = {}

for intro in total_introductions:
    p = intro['person']
    s = intro['source']
    if p not in ent_source_sent:
        ent_source_sent[p] = {}
    if s not in ent_source_sent[p]:
        ent_source_sent[p][s] = []
    ent_source_sent[p][s].append(intro['sentiment'])

In [9]:
# An example of how one entity (a city) is described by different sources

print(ent_source_sent)
print(np.mean(ent_source_sent['Macron']['nytimes.com']).astype(int))

0


In [10]:
# We get rid of entities that don't contain enough data

entities_kept = []

for entity in ent_source_sent.keys():
    sentiments = ent_source_sent[entity]
    total_size = sum([len(sentiments[source]) for source in sentiments.keys()])
    if total_size >= 3:
        entities_kept.append(entity)
        
print("We will keep a total of", len(entities_kept), " / ", len(ent_source_sent.keys()) ,"in our dataset")


sources = set([])
for entity in entities_kept:
    sources|= set(ent_source_sent[entity].keys())
sources = list(sources)

print("We have ", len(sources), "sources: ", sources)

We will keep a total of 7852  /  25128 in our dataset
We have  22 sources:  ['aljazeera.com', 'cnn.com', 'rt.com', 'washingtonpost.com', 'allafrica.com', 'latimes.com', 'middleeasteye.net', 'bbc.co.uk', 'theguardian.com', 'france24.com', 'nytimes.com', 'foxnews.com', 'bloomberg.com', 'aa.com.tr', 'telegraph.co.uk', 'reuters.com', 'chinadaily.com.cn', 'wikinews.org', 'independent.co.uk', 'techcrunch.com', 'businessinsider.in', 'ap.org']


## We create the array we will use in our sparse model

In [11]:
# Parameters: changing these affects the results you get
Pos_neg_ratio = 2.0
overall_ratio = 0.15
pos_threshold = 0.15
neg_threshold = -0.15

N = len(entities_kept)
M = len(sources)
A = np.zeros((N, M))

sentiment_counts = Counter()

source2j = {source: j for j, source in enumerate(sources)}

for i, entity in enumerate(entities_kept):
    for source in ent_source_sent[entity].keys():
        sent_array = np.array(ent_source_sent[entity][source])
        N_pos = float(len(np.where(sent_array > pos_threshold)[0]))
        N_neg = float(len(np.where(sent_array < neg_threshold)[0]))
        T = float(len(sent_array))
        aggregate_sentiment = 0
        if N_pos > Pos_neg_ratio*N_neg and N_pos > overall_ratio*T:
            aggregate_sentiment = 1
        elif N_neg > Pos_neg_ratio*N_pos and N_neg > overall_ratio*T:
            aggregate_sentiment = -1
        j = source2j[source]
        
        A[i,j] = np.mean(ent_source_sent[entity][source]).astype(int)
        
        sentiment_counts[aggregate_sentiment] += 1

print ("We allocated some sentiment in this matrix, the repartition is:", sentiment_counts)

We allocated some sentiment in this matrix, the repartition is: Counter({0: 20154, -1: 2844, 1: 2383})


## Model source similarity

In [12]:
# Write code that uses this matrix (entities, sources) to compute
# source similarity visible in bias of the way they describe entities

In [13]:
from sklearn.preprocessing import StandardScaler
A_std = StandardScaler().fit_transform(A)

In [14]:
from sklearn.covariance import GraphLasso
graph_lasso = GraphLasso(alpha=0.12)
graph_lasso.fit(A_std)
np.mean(graph_lasso.get_precision() > 0)

for (i, j) in zip(*np.where(abs(graph_lasso.get_precision()) > 0)):
    if i > j:
        print(sources[i], sources[j])

  - np.dot(covariance_[indices != idx, idx], coefs)))
  * coefs)
  * coefs)
  gap = np.sum(emp_cov * precision_)
  - np.abs(np.diag(precision_)).sum())
  log_likelihood_ = - np.sum(emp_cov * precision) + fast_logdet(precision)
  sign, logdet = _umath_linalg.slogdet(a, signature=signature)
  - np.abs(np.diag(precision_)).sum())
  if np.abs(d_gap) < tol:


FloatingPointError: Non SPD result: the system is too ill-conditioned for this solver. The system is too ill-conditioned for this solver

## Workspace

In [None]:
# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [None]:
from Visualizer import plot_network 
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (10,10)

In [None]:
for i in np.arange(0.1,0.2, 0.01):
    l1_lambda = i
    print(l1_lambda)
    graph_lasso = GraphLasso(alpha=l1_lambda)
    graph_lasso.fit(A_std)
    plot_network(graph_lasso.get_precision(), sources)

In [None]:
from GraphicalLasso import GraphicalLasso as GL
from GramLassoSolver import GramLassoSolver as GramLasso
from sklearn import linear_model
def my_graphical_lasso(data, l1_lambda):
    gl = GL(l1_solver_f=GramLasso)
    S = np.cov(data.T)
    return gl.fit(S, l1_lambda)

## Our Glasso: Lambda sweeping

In [None]:
for i in np.arange(0.1,0.2, 0.01):
    l1_lambda = i
    print(l1_lambda)
    plot_network(my_graphical_lasso(A_std, l1_lambda), sources)

## Our Glasso: best by inspection

In [None]:
plot_network(my_graphical_lasso(A_std, l1_lambda=0.12), sources)

## Neighborhood method

In [None]:
from NeighborhoodGraphSelection import NGraphSelection
from sklearn.linear_model import Lasso

In [None]:
n_sel = NGraphSelection(Lasso)
g = n_sel.fit(A_std, 0.1, 'AND')

In [None]:
plot_network(g, sources)

## Neighborhood method w/own Lasso implementation (not correct)

In [None]:
from Lasso import Lasso2
n_sel = NGraphSelection(Lasso2)
g = n_sel.fit(A_std, 0.1, 'AND')

In [None]:
plot_network(g, sources)