In [None]:
import pandas as pd
import numpy as np
import scipy as sp
import os
import matplotlib.pyplot as plt
%matplotlib notebook
import seaborn as sns
sns.set_context("notebook")
import plotly.plotly as py
from scipy.stats import norm
from bokeh.plotting import figure, show
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from scipy.stats import norm
from scipy.cluster.hierarchy import dendrogram

# Relative path of the directory holding the CSV exports read by this notebook.
DATA_DIR = 'data'
MIN_CONSIDER = 2000  # rows whose count_mentions is not strictly above this value are dropped from the inner/outer views
MIN_CI = 10000  # a confidence interval is only displayed when count_mentions is strictly above this value (with fewer datapoints we do not rely on the normality of the mean)

In [None]:
def _read_view(file_name):
    """Read one of the view files stored in DATA_DIR into a dataframe."""
    return pd.read_csv(os.path.join(DATA_DIR, file_name))


def _enough_mentions(view):
    """Return the rows of `view` having strictly more than MIN_CONSIDER mentions."""
    return view[view.count_mentions > MIN_CONSIDER]


# Views that get regrouped later on are loaded as they are
# (3 rows of the country to country view have no mention_country, hence the dropna).
country_to_country_view = _read_view('country_to_country_view.csv').dropna()
country_to_type_view = _read_view('country_to_country_type_view.csv')
media_to_country_view = _read_view('media_to_country_view.csv.zip')

# The inner and outer views are not regrouped afterwards, so the rows with few
# mentions can be discarded right away.
country_inner_view = _enough_mentions(_read_view('country_inner_view.csv'))
country_outer_view = _enough_mentions(_read_view('country_outer_view.csv'))
country_inner_type_view = _enough_mentions(_read_view('country_inner_type_view.csv'))
country_outer_type_view = _enough_mentions(_read_view('country_outer_type_view.csv'))

In [None]:
# Mapping between country names and the country codes used as plotly locations.
country_codes = pd.read_csv(os.path.join(DATA_DIR, 'mapping_country_codes.csv'))
# "Unnamed: 0" is presumably a leftover index column of the export; it is not needed.
country_codes = country_codes.drop("Unnamed: 0", axis=1)
country_codes.columns = ["country_name", "country_ISO"]

In [None]:
# taken from https://github.com/scikit-learn/scikit-learn/blob/70cf4a676caa2d2dad2e3f6e4478d64bcb0506f7/examples/cluster/plot_hierarchical_clustering_dendrogram.py
def plot_dendrogram(model, **kwargs):
    """
    Plot the dendrogram of a fitted hierarchical clustering model.

    Builds a scipy linkage matrix from the merge tree stored in `model.children_`
    and hands it to `scipy.cluster.hierarchy.dendrogram`.

    :param model: fitted model exposing `children_` (one row per merge, holding the
        indices of the two merged nodes). If the model also exposes `distances_`,
        these are used as merge heights; otherwise the merge order (0, 1, 2, ...)
        is used, since the real distances are not available.
    :param kwargs: forwarded unchanged to `scipy.cluster.hierarchy.dendrogram`
    :return: the dictionary returned by `scipy.cluster.hierarchy.dendrogram`
    """
    children = np.asarray(model.children_)
    n_merges = children.shape[0]
    # a full merge tree over n samples contains n - 1 merges
    n_samples = len(model.labels_) if hasattr(model, "labels_") else n_merges + 1

    # Number of original observations below each merge: an index smaller than
    # n_samples is a leaf, any other index refers to the merge `index - n_samples`.
    counts = np.zeros(n_merges)
    for merge_idx, pair in enumerate(children):
        size = 0
        for child in pair:
            if child < n_samples:
                size += 1
            else:
                size += counts[child - n_samples]
        counts[merge_idx] = size

    distances = getattr(model, "distances_", None)
    if distances is None:
        # no distance information: use a uniform spacing for plotting
        distances = np.arange(n_merges)

    # Create linkage matrix and then plot the dendrogram
    linkage_matrix = np.column_stack([children, distances, counts]).astype(float)
    return dendrogram(linkage_matrix, **kwargs)

In [None]:
def add_CI_to_DF(df, mean_col_name="avg_tone", std_col_name="std_tone", count_col_name="count_mentions", std_mean_col_name=None, alpha=0.05):
    r"""
    Add the bounds of a normal confidence interval of a mean to a dataframe.

    By the central limit theorem, the mean of n samples of a R.V. is approximately
    distributed as N(\mu, \sigma^2/n). The bounds are computed assuming this holds
    even though n is finite, and are stored in two new columns of df named
    `<mean_col_name>_low_CI` and `<mean_col_name>_high_CI` (df is modified in place).

    :param df: the dataframe
    :param mean_col_name: the name of the column of df containing the mean of the data
    :param std_col_name: the name of the column of df containing the std of the data
    :param count_col_name: the name of the column of df containing the count of the data
    :param std_mean_col_name: the name of the column of df containing the std of the
        mean (if this is set, std_col_name and count_col_name are not used)
    :param alpha: the parameter alpha corresponding to the CI (ex: 0.05 for 95% CI)
    :return: df itself, so that the call can be chained
    :raises ValueError: if alpha is not strictly between 0 and 1
    """
    if not 0 < alpha < 1:
        raise ValueError(f"alpha must be strictly between 0 and 1, got {alpha}")

    # quantiles of the standard normal enclosing a probability mass of 1 - alpha
    low_quantile, high_quantile = norm.interval(1 - alpha)
    if std_mean_col_name is None:
        mean_std = df[std_col_name] / (df[count_col_name] ** 0.5)
    else:
        mean_std = df[std_mean_col_name]
    df[mean_col_name + "_low_CI"] = low_quantile * mean_std + df[mean_col_name]
    df[mean_col_name + "_high_CI"] = high_quantile * mean_std + df[mean_col_name]
    return df

In [None]:
def export_to_plotly(df, data_col_name, file_name, country_codes=country_codes, 
                     country_col_name="country"):
    """
    generates a plotly choropleth graph from the given dataframe

    df must contain, besides the data column, the columns `<data_col_name>_low_CI`,
    `<data_col_name>_high_CI` and `count_mentions`, which are used to build the
    hover text of each country.

    :param df: a dataframe
    :param data_col_name: the name of the column containing the data
    :param file_name: name of the file
    :param country_codes: the df that contains the mapping from country names to country codes
    :param country_col_name: the name of the column of df that contains the country names
    """
    output = df.merge(country_codes, left_on=country_col_name, right_on="country_name",
                      suffixes=('', ''))

    def _hover_text(row):
        """Hover text of one country: name, CI (only above MIN_CI datapoints), count."""
        # use the configured column instead of a hard-coded `country` attribute
        text = f"{row[country_col_name]} <br>"
        if row.count_mentions > MIN_CI:
            text += (f"95% CI: [{row[data_col_name + '_low_CI']:0.3f}, "
                     f"{row[data_col_name + '_high_CI']: 0.3f}] <br>")
        return text + f"{row.count_mentions: .0E} datapoints"

    output["text"] = output.apply(_hover_text, axis=1)
    data = [ dict(
            type = 'choropleth',
            locations = output.country_ISO,
            z = np.round(output[data_col_name], 3),
            text = output.text,
            autocolorscale = True,
            reversescale = True,
            colorbar = dict(
                autotick = False,
                title = 'TODO'),
          ) ]

    layout = dict(
        title = 'TODO',
        geo = dict(
            showframe = False,
            showcoastlines = True,
            projection = dict(
                # NOTE(review): plotly documents the projection names in lower case
                # ('mercator'); confirm this value is honoured
                type = 'Mercator'
            )
        )
    )

    fig = dict(data=data, layout=layout)
    # validate=False: the figure dictionary is sent without client-side validation
    py.iplot(fig, validate=False, filename=file_name)

#### Countries ranking
First, we simply save the outer view of each country. We then try to take the score of a country into account to weight the tone it uses when talking about other countries. Unfortunately, most of the countries' scores are negative, which means that a country referenced in a negative way by a lot of other countries would be positively impacted (e.g. USA). A solution to this problem is to shift the avg_tones (uniformly, while modifying them as little as possible) so that all the countries get a positive score, and to rank the countries on that data.

In [None]:
# Adds the avg_tone_low_CI / avg_tone_high_CI columns to country_outer_view (in place),
# using the default avg_tone / std_tone / count_mentions columns.
add_CI_to_DF(country_outer_view)
# export_to_plotly(country_outer_view, "avg_tone", "outer_view_base_plot")  # uncomment to export to plotly

In [None]:
# decreasing the weight of a country with bad reputation to compute the reputation of another
# country

# Offset added to every avg_tone. The value of 7 was chosen (by trial and error) so that it is
# as small as possible (the bigger this value, the less the tones differ and the more "uniform"
# the data becomes) and that there are no countries with a negative score (being criticized by
# these countries would mean we are good and initially, nearly all countries were negative).
TONE_OFFSET = 7

c_to_c_transition = country_to_country_view[["actor_country", "mention_country", "avg_tone",
                                              "count_mentions", "avg_weighted_tone"]]
# other columns cannot be split from country to country

# a country talking about itself is not part of the graph
c_to_c_transition = c_to_c_transition[c_to_c_transition.actor_country !=
                                      c_to_c_transition.mention_country]

# get the count of articles for each actor_country and weight avg_tone by this,
# i.e. avg_tone = (avg_tone + TONE_OFFSET) * count / (sum count for actor_country)
# (avg_weighted_tone is carried along but left untouched)
counts = c_to_c_transition.groupby("actor_country")["count_mentions"].agg("sum")
# after the merge, count_mentions_x is the count of the pair and count_mentions_y the
# total of the actor_country
c_to_c_transition = c_to_c_transition.merge(counts.to_frame(), left_on="actor_country",
                                            right_on="actor_country")
c_to_c_transition["avg_tone"] = ((c_to_c_transition.avg_tone + TONE_OFFSET)
                                 * c_to_c_transition.count_mentions_x
                                 / c_to_c_transition.count_mentions_y)
c_to_c_transition = c_to_c_transition.drop(["count_mentions_x", "count_mentions_y"], axis=1)


# putting the country to country transitions in the form of a np matrix (graph transition
# matrix); instead of using the power method, we compute the eigenvalues/eigenvectors with
# the numpy "eig" function
index_to_country = np.unique(country_to_country_view.actor_country.values)
country_to_index = {country: index for index, country in enumerate(index_to_country)}

transition_matrix = np.zeros((index_to_country.shape[0], index_to_country.shape[0]))
for row in c_to_c_transition.itertuples(index=False):
    if row.mention_country in country_to_index:  # mention countries that never appear as
        # actor_country have no row in the matrix and are not taken into account
        transition_matrix[country_to_index[row.actor_country],
                          country_to_index[row.mention_country]] = row.avg_tone

eig_val, eig_vec = np.linalg.eig(transition_matrix)

# get the biggest real eigenvalue associated with a real eigenvector
eig_vec = eig_vec[:, np.isreal(eig_val)]
eig_val = eig_val[np.isreal(eig_val)].real
real_eig_vec = np.sum(np.iscomplex(eig_vec), 0) == 0  # real eigenvectors
eig_val = eig_val[real_eig_vec]
eig_vec = eig_vec[:, real_eig_vec].real

max_eig_val_idx = np.argmax(eig_val)
eig_vec = eig_vec[:, max_eig_val_idx]
eig_val = eig_val[max_eig_val_idx]

# An eigenvector is only defined up to a scalar factor, so the solver may return the
# dominant eigenvector with all its entries negated. Orient it so that its entries sum
# to a positive value; otherwise sorting the scores in descending order would produce
# the reversed ranking.
if eig_vec.sum() < 0:
    eig_vec = -eig_vec

country_view = pd.DataFrame(data=eig_vec, index=index_to_country, columns=["avg_tone_graph"])

In [None]:
# Compare both rankings. Only the biggest countries are ranked: the ones with more
# than 1e6 mentions in the outer view.
most_mentioned = country_outer_view[country_outer_view.count_mentions > 1e6]
country_view = country_view.merge(most_mentioned, left_index=True, right_on="country")
country_view = country_view[["country", "avg_tone", "avg_tone_graph"]]

# After each sort the position of a row (starting at 1) is its rank.
country_view = country_view.sort_values(by="avg_tone", ascending=False).reset_index(drop=True)
country_view["normal_ranking"] = country_view.index + 1
country_view = country_view.sort_values(by="avg_tone_graph", ascending=False).reset_index(drop=True)
country_view["graph_ranking"] = country_view.index + 1

# positive value: the country is ranked better by the graph method
country_view["places_won_with_graph"] = country_view.normal_ranking - country_view.graph_ranking
country_view = country_view[
    ["country", "normal_ranking", "graph_ranking", "places_won_with_graph"]
].sort_values(by="places_won_with_graph", ascending=False)

In [None]:
# country_view is sorted by places_won_with_graph (descending): these are the 10
# countries that gain the most places with the graph ranking.
country_view.head(10)

In [None]:
# The 10 countries with the smallest places_won_with_graph, i.e. the ones that drop
# the most with the graph ranking when the value is negative.
country_view.tail(10)

#### Inner - Outer View
Computes, for each country, the difference between its inner view and its outer view.

In [None]:
cols_to_keep = ["country", "avg_tone", "std_tone", "count_mentions"]

# One row per country present in both views; the _x columns come from the inner view
# and the _y columns from the outer view.
diff_in_out = pd.merge(country_inner_view[cols_to_keep], country_outer_view[cols_to_keep],
                       on="country")
diff_in_out["difference"] = diff_in_out.avg_tone_x - diff_in_out.avg_tone_y
# the number of datapoints of the difference is taken as the smallest of the inner
# and outer counts
diff_in_out["count_mentions"] = np.minimum(diff_in_out.count_mentions_x,
                                           diff_in_out.count_mentions_y)

# Confidence intervals, assuming the inner and outer means are independent:
# std_mean(inner-outer) = sqrt(std_mean(inner)^2 + std_mean(outer)^2)
inner_var_of_mean = diff_in_out.std_tone_x**2 / diff_in_out.count_mentions_x
outer_var_of_mean = diff_in_out.std_tone_y**2 / diff_in_out.count_mentions_y
diff_in_out["std_diff"] = (inner_var_of_mean + outer_var_of_mean) ** 0.5
add_CI_to_DF(diff_in_out, mean_col_name="difference", std_mean_col_name="std_diff")

In [None]:
# export_to_plotly(diff_in_out, "difference", "difference_plot")  # uncomment to export to plotly

#### Analyzing the influence of actor types

In [None]:
# Actor type codes kept for the analysis, with a human readable description of each.
actor_types = ["COP", "EDU", "GOV", "JUD", "LEG", "MED", "MNC", "MIL"]
actor_types_description = {"COP": "Police forces", "GOV": "Government", "EDU": "Education",
                           "JUD": "Judiciary", "LEG": "Legislature", "MED": "Media",
                           "MNC": "Multinational corporation", "MIL": "Military"}

# discard the rows of the outer type view whose actor type is not in the list
is_known_type = country_outer_type_view.actor_type.isin(actor_types)
country_outer_type_view = country_outer_type_view[is_known_type]

In [None]:
# Outer type view restricted to the actor types of actor_types (the whole frame is displayed).
country_outer_type_view

In [None]:

# Actor type analysed in this section. It has to be defined here: the cell is executed
# before the clustering section, where the same value is assigned again.
selected_type = "GOV"

c_to_t_selected_type = country_to_type_view[country_to_type_view.actor_type == selected_type]

# True for the rows where a country is mentioned by another country (outer view),
# False where actor_country and mention_country are the same (inner view)
select_out = c_to_t_selected_type.actor_country != c_to_t_selected_type.mention_country

# Outer view: for each actor_country, mean of avg_tone over the other countries,
# weighted by count_mentions.
outer_view_type = c_to_t_selected_type[select_out].copy()
outer_view_type["sum_mentions"] = outer_view_type.groupby("actor_country").count_mentions.transform("sum")
weighted_tone = outer_view_type.avg_tone * outer_view_type.count_mentions / outer_view_type.sum_mentions
# the result is an unnamed Series indexed by actor_country (the next cell turns it into
# a dataframe and names its column)
outer_view_type = weighted_tone.groupby(outer_view_type.actor_country).sum()

inner_view_type = c_to_t_selected_type[~select_out]

In [None]:
# Keep the inner views backed by more than 1000 events and put the inner and the outer
# tone of each country side by side (avg_tone_in / avg_tone_out).
inner_view_type_rel = inner_view_type[inner_view_type["count_events"] > 1000]
outer_view_type_rel = pd.DataFrame(outer_view_type, columns=["avg_tone"])
inner_outer_view = (
    inner_view_type_rel[["actor_country", "avg_tone"]]
    .merge(outer_view_type_rel, left_on="actor_country", right_index=True,
           suffixes=("_in", "_out"))
    .reset_index()
)

# Scatter plot of the outer tone against the inner tone, one point per country.
fig, ax = plt.subplots(figsize=(16,5))
ax.scatter(x='avg_tone_in', y='avg_tone_out', data=inner_outer_view)
ax.set_title("inner vs outer view", fontsize=15, fontweight='bold')
ax.set_xlabel('inner_view')
ax.set_ylabel('outer_view')
ax.grid(True)

# label each point with the name of its country
for name, tone_in, tone_out in zip(inner_outer_view.actor_country,
                                   inner_outer_view.avg_tone_in,
                                   inner_outer_view.avg_tone_out):
    ax.annotate(name, (tone_in, tone_out))

#### clustering with or without actor_types

In [None]:
selected_type = "GOV"

# the 40 countries with the most mentions in the outer view
by_mentions = country_outer_view.sort_values(by="count_mentions", ascending=False)
big_countries = list(by_mentions.head(40).country)

# All actor types: keep the pairs where both countries are big countries.
# (To use a single actor type instead, build to_cluster from country_to_type_view with
# the same two conditions plus actor_type == selected_type.)
between_big_countries = (country_to_country_view.actor_country.isin(big_countries)
                         & country_to_country_view.mention_country.isin(big_countries))
to_cluster = country_to_country_view[between_big_countries].copy()

# proportion of the mentions of a mention_country that are about the actor_country
mentions_of_mention_country = to_cluster.groupby("mention_country").count_mentions.transform("sum")
to_cluster["prop_mentions"] = to_cluster["count_mentions"] / mentions_of_mention_country

to_cluster = to_cluster[["actor_country", "mention_country", "avg_tone", "prop_mentions"]]
# countries appearing both as actor_country and as mention_country
countries = np.intersect1d(to_cluster.actor_country.unique(),
                           to_cluster.mention_country.unique())
# index by pair of countries so that access is fast
to_cluster = to_cluster.set_index(["actor_country", "mention_country"])

In [None]:
# opinions[j, i] is the avg_tone of the pair (actor_country=countries[i],
# mention_country=countries[j]): each row describes one country by the way it sees the
# other ones. If there are no mentions for a pair, the value is 0.
# The matrix is built in a single pass (one lookup per pair with a swallowed KeyError
# is slow, and `.loc[a, b]` on a MultiIndex frame is ambiguous between row and column keys).
tone_by_pair = to_cluster["avg_tone"].unstack("mention_country")  # rows: actor_country
tone_by_pair = tone_by_pair.reindex(index=countries, columns=countries)
opinions = np.ascontiguousarray(tone_by_pair.fillna(0).values.T)

In [None]:
# drop the columns of the opinion matrix whose entries sum to 0
column_has_signal = opinions.sum(axis=0) != 0
opinions = opinions[:, column_has_signal]

# average-linkage agglomerative clustering of the rows (l1 distance, 13 clusters)
model = AgglomerativeClustering(n_clusters=13, affinity="l1", linkage="average")
clusters = model.fit_predict(opinions)
plot_dendrogram(model, labels=model.labels_)

# list the countries of each cluster
for cluster_id in range(clusters.max() + 1):
    print(countries[clusters == cluster_id])

#### use the different types of actors of a country to compute several ranks and compare these between countries with confidence intervals.

In [None]:
actor_types = ["COP", "EDU", "GOV", "JUD", "LEG", "MED", "MNC", "MIL"]


def _big_country_rows(view, restrict_actor_types=False):
    """
    Return a copy of the rows of `view` that belong to one of the big_countries and
    have more than MIN_CI mentions (the threshold above which the CI is trusted).

    :param view: a dataframe with the columns `country` and `count_mentions` (and
        `actor_type` if restrict_actor_types is set)
    :param restrict_actor_types: if True, only keep the rows whose actor_type is in
        actor_types
    """
    keep = view.country.isin(big_countries) & (view.count_mentions > MIN_CI)
    if restrict_actor_types:
        keep = keep & view.actor_type.isin(actor_types)
    return view[keep].copy()


big_countries_outer_view = _big_country_rows(country_outer_view)
add_CI_to_DF(big_countries_outer_view)

big_countries_inner_view = _big_country_rows(country_inner_view)
# the CI has to be added to the filtered copy, not to the full country_inner_view
add_CI_to_DF(big_countries_inner_view)

big_countries_outer_type_view = _big_country_rows(country_outer_type_view,
                                                  restrict_actor_types=True)
add_CI_to_DF(big_countries_outer_type_view)

big_countries_inner_type_view = _big_country_rows(country_inner_type_view,
                                                  restrict_actor_types=True)
add_CI_to_DF(big_countries_inner_type_view)

In [None]:
# Outer-view tone of the big countries with its confidence bounds, highest avg_tone first.
big_countries_outer_view[["country", "avg_tone", "avg_tone_low_CI", "avg_tone_high_CI"]].sort_values("avg_tone", ascending=False)

In [None]:
# Inner-view tone of the "COP" actor type for the big countries, with its confidence
# bounds, highest avg_tone first.
(
    big_countries_inner_type_view
    .loc[big_countries_inner_type_view.actor_type == "COP",
         ["country", "avg_tone", "avg_tone_low_CI", "avg_tone_high_CI"]]
    .sort_values("avg_tone", ascending=False)
)

#### Inner media variance
We sort the sources of each country by importance (increasing number of news items) and progressively group consecutive sources, so that the groups have the same count of mentions but contain news sources of very different sizes.

In [None]:
def get_source_importance(ser, nbr_clutsers):
    """
    given a series containing the number of mentions of the news sources of a country,
    sorted by number of mentions,
    return a label (0 to nbr_clutsers-1) for each of these sources so that each label
    covers about the same number of mentions and that a smaller label contains smaller
    sources

    Label k is given to the sources whose cumulated number of mentions lies in
    ]k * total / nbr_clutsers, (k + 1) * total / nbr_clutsers].

    :param ser: the series of the numbers of mentions, in increasing order
    :param nbr_clutsers: the number of labels (at least 1)
    :return: a series of integer labels with the same index (and name) as ser
    :raises ValueError: if nbr_clutsers is smaller than 1
    """
    if nbr_clutsers < 1:
        raise ValueError("nbr_clutsers must be at least 1")

    cumsum = ser.cumsum()
    if cumsum.empty:
        return cumsum.astype(int)  # nothing to label

    total_nbr_mentions = cumsum.iloc[-1]
    # lower bound for each cluster
    offsets = np.arange(nbr_clutsers) * total_nbr_mentions / nbr_clutsers
    # (number of lower bounds strictly below the cumulated count) - 1
    labels = np.searchsorted(offsets, cumsum.values) - 1
    # a cumulated count of 0 would fall below the first bound: keep it in label 0
    labels = np.maximum(labels, 0)
    return pd.Series(labels, index=cumsum.index, name=cumsum.name)

In [None]:
# total number of mentions of each source, sorted by increasing size inside each country
source_cluster = media_to_country_view.groupby(["source_country",
                                                "source_name"]).agg({"count_mentions": "sum"})
source_cluster = source_cluster.sort_values(["source_country", "count_mentions"])

# Split the sources of each country in 2 clusters (0: small sources, 1: big sources).
# `transform` returns the labels aligned on the original index whatever the pandas
# version (the index returned by `apply` depends on the `group_keys` handling).
source_cluster["cluster"] = source_cluster.groupby("source_country").count_mentions.transform(
    lambda x: get_source_importance(x, 2))
source_cluster = source_cluster.reset_index()[["source_country", "source_name", "cluster"]]

# Merge on the country as well as on the name: a source_name present in several
# source_country groups would otherwise duplicate rows and inflate the counts.
news_to_country = media_to_country_view.merge(source_cluster,
                                              on=["source_country", "source_name"])
# NOTE(review): avg_tone is aggregated with an unweighted mean of the per-source averages
news_to_country = news_to_country.groupby(["actor_country", "source_country",
            "cluster"]).agg({"avg_tone": "mean", "count_mentions": "sum"})

In [None]:
# Mean avg_tone and total count_mentions per (actor_country, source_country, cluster);
# the whole frame is displayed.
news_to_country