In [1]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import euclidean
from fastdtw import fastdtw
from math import sqrt
import pickle
import operator

from bokeh.core.properties import value
from bokeh.io import show, output_notebook
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure, show
import math
from bokeh.transform import dodge
import bokeh
output_notebook()
from bokeh.models import PrintfTickFormatter

### Preprocessing data

In [2]:
#Create the dataframe with all the words from "file_name" from 2007 to 2018
def joining_tables(file_name):
    arts = list(set(pd.read_csv('data/pageviews/Time_Pageviews.csv', index_col='Articles').index))
    df_prev = pd.read_csv('data/pageviews/previous_pageviews_en.csv', index_col=0)
    df_prev.columns = pd.to_datetime(df_prev.columns, format='%Y-%m')
    df_prev = df_prev.sort_index(axis=1)

    df_act = pd.read_csv('data/pageviews/Time_Pageviews.csv', index_col='Articles')
    df_act = df_act[df_act.lang=='en'].drop(['lang', 'name', 'name_utf'], axis = 1)

    words = list(pd.read_table(file_name+'.txt', header=-1)[0])
    df_act = df_act.loc[words]

    df_act = df_act[np.isfinite(df_act[df_act.columns[-1]])]
    df_act = df_act.replace(np.nan, 0, regex=True)
    df_act.columns = pd.to_datetime(df_act.columns, format='%Y%m%d00')

    df_prev = df_prev.loc[words]
    df_prev = df_prev[np.isfinite(df_prev[df_prev.columns[-1]])]

    result =  pd.concat([df_prev[df_prev.columns[0:91]], df_act], axis=1)
    return result.replace(np.nan, 0, regex=True)

In [3]:
#Create the dataframe with all the words from "file_name" from 2015 to 2018 in a given language
def get_table_last_years(file_name, lang='en'):
    arts = list(set(pd.read_csv('data/pageviews/Time_Pageviews.csv', index_col='Articles').index))
    df_act = pd.read_csv('data/pageviews/Time_Pageviews.csv', index_col='Articles')
    df_act = df_act[df_act.lang==lang].drop(['lang', 'name', 'name_utf'], axis = 1)

    words = list(pd.read_table(file_name+'.txt', header=-1)[0])
    df_act = df_act.loc[words]

    df_act = df_act[np.isfinite(df_act[df_act.columns[-1]])]
    df_act = df_act.replace(np.nan, 0, regex=True)
    df_act.columns = pd.to_datetime(df_act.columns, format='%Y%m%d00')
    return df_act

### Dynamic Time Warping (DTW)

In [4]:
def DTWDistance(s1, s2, w):
    DTW={}
    
    w = max(w, abs(len(s1)-len(s2)))
    
    for i in range(-1,len(s1)):
        for j in range(-1,len(s2)):
            DTW[(i, j)] = float('inf')
    DTW[(-1, -1)] = 0
  
    for i in range(len(s1)):
        for j in range(max(0, i-w), min(len(s2), i+w)):
            dist= (s1[i]-s2[j])**2
            DTW[(i, j)] = dist + min(DTW[(i-1, j)],DTW[(i, j-1)], DTW[(i-1, j-1)])
    return sqrt(DTW[len(s1)-1, len(s2)-1])

### Comparing dataframes

In [5]:
# Given a dataframe df, and a row, find the closer element in the list
def compare_item_1_table(df, pos):
    minim = np.Inf
    row = 0
    for i in [x for x in range(len(df)) if x != pos]:
        dist = DTWDistance(df.iloc[pos], df.iloc[i], 3)
        if (dist<minim and dist!=0.0):
            minim = dist
            row = i
    return minim, row

In [6]:
#Find for an element of df1 (which is in the row "pos_df1") the "num_elem" closer elements from df2
def compare_item_1vsall(df1, df2, pos_df1, num_elem):
    minim = [[np.Inf,-1]]*num_elem
    item = df1.iloc[pos_df1]
    for i in range(len(df2)):
        dist = DTWDistance(item, df2.iloc[i], 3)
        if (dist<minim[num_elem-1][0]):
            for m in range(num_elem):
                if dist<minim[m][0]:
                    minim.insert(m, [dist, i])
                    del minim[-1]
                    break
    return minim

In [7]:
#Find for each element of df1, the "num_closer" closer elements from df2.
#Remember in general to pass as atribute normalized dataframes.
def dict_closer_art(df1, df2, num_closer=3):
    closer ={}

    for i in range(len(df1)):
        closer[i] = compare_item_1vsall(df1, df2, i, num_closer)
    return closer

def sort_dict_closer(dic):
    return sorted(dic.items(), key=operator.itemgetter(1))

In [8]:
#Given the list of words sorted (which is returned by the method sort_dict_closer) and the number of words that wants to be returned
#returns the number of words with distances different to zero
def get_closer_words(sort_closer,  num_show=10):
    i=0
    word_dist = []
    for elem in sort_closer:
        if (elem[1][0][0]!=0.0):
            word_dist.append(elem)
            i+=1
        if (num_show==i):
            return word_dist
    
    for elem in sort_closer:
        if (elem[1][1][0]!=0.0):
            word_dist.append(elem)
            i+=1
        if (num_show==i):
            return word_dist
    return word_dist

In [9]:
# Print the closest words
def print_words(cl):
    i=0
    for elem in cl:
        print(str(i)+'. Word: \"'+ df1.iloc[elem[0]].name+'\"')
        print('Related to: \"'+ df2.iloc[elem[1][0][1]].name 
              +"\",\""+ df2.iloc[elem[1][1][1]].name +"\",\""+ df2.iloc[elem[1][2][1]].name +"\"\n")
        i+=1

In [10]:
#Given the name of an article, and the dataframe, return the number of the row where it is
def get_position(df, article):
    return [i for i, x in enumerate(df.index==article) if x][0]

### z-score normalization

In [11]:
def norm(df):
    res = df.copy().T
    df_norm = pd.DataFrame(columns=res.columns, index=res.index)
    cols = list(res.columns)
    for col in cols:
        df_norm[col] = (res[col] - res[col].mean())/res[col].std(ddof=0)
    return df_norm.T

In [12]:
def norm_diff_tables(df1, df2):    
    df = pd.concat([df1, df2])
    df = norm(df)
    df1 = df.iloc[0:len(df1)]
    df2 = df.iloc[len(df1):]
    return df1, df2

### Visualization

In [13]:
# Plot to compare different articles in a given language
def plot_articles(df):
    color =["#e6194b","#3cb44b","#ffe119","#0082c8","#f58231","#911eb4","#46f0f0","#f032e6","#d2f53c","#fabebe","#008080","#e6beff","#aa6e28","#fffac8","#800000","#aaffc3","#808000","#ffd8b1","#000080","#808080","#FFFFFF","#000000","#1F77B4", "#B85A0D", '#878787', "#E377C2", '#66ccff', '#cc0066', '#cccc00', '#98DF8A', "#82853B","#92a8d1","#034f84","#f7cac9","#f7786b","#d5f4e6","#80ced6","#fefbd8","#618685","#ffef96","#50394c","#b2b2b2","#f4e1d2","#deeaee","#b1cbbb","#eea29a","#c94c4c","#3e4444","#82b74b","#405d27","#c1946a","#b9936c","#dac292",
"#e6e2d3","#c4b7a6","#6b5b95","#feb236","#d64161","#ff7b25","#eaece5","#b2c2bf","#c0ded9","#3b3a30","#c8c3cc","#563f46","#8ca3a3","#484f4f","#686256",
"#c1502e","#587e76","#a96e5b","#bccad6","#8d9db6","#667292","#f1e3dd","#cfe0e8","#b7d7e8","#87bdd8","#daebe8","#fbefcc","#f9ccac","#f4a688","#e0876a",
"#f9d5e5","#eeac99","#e06377","#c83349","#5b9aa0","#d6d4e0","#b8a9c9","#622569","#96ceb4","#ffeead","#ffcc5c","#ff6f69","#588c7e","#f2e394","#f2ae72","#d96459"]

    p = figure( x_axis_type='datetime',plot_width = 970, title="Number of users who viewed the article")

    for i, art in enumerate(list(df.index)):
        p.line(list(df.columns), list(df.loc[art, :]), color=color[i], legend=value(art), line_width=1.5)
        p.circle(list(df.columns), list(df.loc[art, :]), color=color[i], fill_alpha=0.2, legend=value(art), size=4)
        #p.vbar(x=dodge('articles', pos[i], range=p.x_range), top=art, width=0.17, source=source, 
            #color=color[i], legend=value(art))
    p.xaxis.major_label_orientation = math.pi/3
    p.grid.grid_line_alpha=1
    p.x_range.range_padding = 0.01
    p.legend.location = "top_left"
    p.legend.click_policy="hide"
    show(p)

### Saving data

In [14]:
def save_obj(obj, name ):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

### Loading dataset

In [16]:
def loading_data_distances(TOPIC=1, ALL_YEARS=False):
    '''
    Return the dataset with the distances from an article to another
    
    Options Availables:

    #TOPIC: 
    =0 : comparing tech with social
    =1 : comparing social with tech
    =2 : comparing social with social 

    #ALL_YEARS:
    =True : Do the trending analysis with all the years that are available (X 2017)
    =False : Do the trending analysis since July 2015    
    '''
    
    if (TOPIC==0):   
        #Preparing the data
        file_name1 = 'NGI_technology_keywords'
        file_name2 = 'NGI_social_keywords'
        path = 'data/dist/dist_tech_with_social'

    elif (TOPIC==1):
        #Preparing the data
        file_name1 = 'NGI_social_keywords'
        #file_name = 'Trending_keywords'
        file_name2 = 'NGI_technology_keywords'
        path = 'data/dist/dist_social_with_tech'

    else:
        #Preparing the data
        file_name1 = 'NGI_social_keywords'
        #file_name = 'Trending_keywords'
        file_name2 = 'NGI_social_keywords'
        path = 'data/dist/dist_social_with_social'

    #Loading the dataset from all the available years
    if ALL_YEARS:
        #Loading the data
        df1 = joining_tables('data/list_words_txt/'+file_name1)
        df2 = joining_tables('data/list_words_txt/'+file_name2)

    #Loading the dataset from the last years
    else:
        df1 = get_table_last_years('data/list_words_txt/'+file_name1)
        df2 = get_table_last_years('data/list_words_txt/'+file_name2)

    #normalizing the data
    df1_norm, df2_norm = norm_diff_tables(df1, df2)
    closer = load_obj(path)

    sorted_closer = sort_dict_closer(closer)
    cl = get_closer_words(sorted_closer, num_show=10)
    return df1_norm, df2_norm, df1, df2, cl

In [17]:
#Processing the dataset to print.
#num_closer: Number of closer articles to print (1<=num_closer<=10)
#pos_article: Position of the article to print
def loading_dataset_x_closer(num_closer=5, pos_article=1, plot_normalized=True):
    if plot_normalized:
        df_plot1 = df1_norm
        df_plot2 = df2_norm
    else:
        df_plot1 = df1
        df_plot2 = df2
    data_print = [df_plot1.iloc[cl[pos_article][0]]]
    for i in range(num_closer):
        data_print.append(df_plot2.iloc[cl[pos_article][1][i][1]])
    return pd.concat(data_print, axis=1).T.drop_duplicates(subset=None, keep='first', inplace=False)

## Creating distances

### Creating data. Social and tech

In [18]:
'''
#Preparing the data
file_name1 = 'NGI_social_keywords'
#file_name = 'Trending_keywords'
file_name2 = 'NGI_technology_keywords'


#-----------------------------------------------
##ALL THE YEARS
##Loading the data
#tech = joining_tables('data/list_words_txt/'+file_name1)
#social = joining_tables('data/list_words_txt/'+file_name2)



#-----------------------------------------------
#LAST YEARS
#Loading the data
tech = get_table_last_years('data/list_words_txt/'+file_name1)
social = get_table_last_years('data/list_words_txt/'+file_name2)


#normalizing the data
tech_norm, social_norm = norm_diff_tables(tech, social)


closer = dict_closer_art(tech_norm, social_norm, num_closer=10)
save_obj(closer, 'data/dist/dist_last_years_social_with_tech')
#dist_tech
'''


"\n#Preparing the data\nfile_name1 = 'NGI_social_keywords'\n#file_name = 'Trending_keywords'\nfile_name2 = 'NGI_technology_keywords'\n\n\n#-----------------------------------------------\n##ALL THE YEARS\n##Loading the data\n#tech = joining_tables('data/list_words_txt/'+file_name1)\n#social = joining_tables('data/list_words_txt/'+file_name2)\n\n\n\n#-----------------------------------------------\n#LAST YEARS\n#Loading the data\ntech = get_table_last_years('data/list_words_txt/'+file_name1)\nsocial = get_table_last_years('data/list_words_txt/'+file_name2)\n\n\n#normalizing the data\ntech_norm, social_norm = norm_diff_tables(tech, social)\n\n\ncloser = dict_closer_art(tech_norm, social_norm, num_closer=10)\nsave_obj(closer, 'data/dist/dist_last_years_social_with_tech')\n#dist_tech\n"

### Creating list of distances. Comparing social dataset with social dataset

#### For the last years

In [19]:
'''
#Preparing the data
file_name1 = 'NGI_social_keywords'
#file_name = 'Trending_keywords'
file_name2 = 'NGI_social_keywords'


##-----------------------------------------------
##ALL THE YEARS
##Loading the data
#tech = joining_tables('data/list_words_txt/'+file_name1)
#social = joining_tables('data/list_words_txt/'+file_name2)


#-----------------------------------------------
#LAST YEARS
#Loading the data
social1 = get_table_last_years('data/list_words_txt/'+file_name1)
social2 = get_table_last_years('data/list_words_txt/'+file_name2)


#normalizing the data
social_norm1, social_norm2 = norm_diff_tables(social1, social2)

closer = dict_closer_art(social_norm1, social_norm2, num_closer=10)
save_obj(closer, 'data/dist/dist_last_years_social_with_social')
#dist_tech
'''

"\n#Preparing the data\nfile_name1 = 'NGI_social_keywords'\n#file_name = 'Trending_keywords'\nfile_name2 = 'NGI_social_keywords'\n\n\n##-----------------------------------------------\n##ALL THE YEARS\n##Loading the data\n#tech = joining_tables('data/list_words_txt/'+file_name1)\n#social = joining_tables('data/list_words_txt/'+file_name2)\n\n\n#-----------------------------------------------\n#LAST YEARS\n#Loading the data\nsocial1 = get_table_last_years('data/list_words_txt/'+file_name1)\nsocial2 = get_table_last_years('data/list_words_txt/'+file_name2)\n\n\n#normalizing the data\nsocial_norm1, social_norm2 = norm_diff_tables(social1, social2)\n\ncloser = dict_closer_art(social_norm1, social_norm2, num_closer=10)\nsave_obj(closer, 'data/dist/dist_last_years_social_with_social')\n#dist_tech\n"

##### For all the years

In [20]:
'''
#Preparing the data
file_name1 = 'NGI_social_keywords'
#file_name = 'Trending_keywords'
file_name2 = 'NGI_social_keywords'


##-----------------------------------------------
##ALL THE YEARS
##Loading the data
#tech = joining_tables(file_name1)
#social = joining_tables(file_name2)


#-----------------------------------------------
#ALL THE YEARS
#Loading the data
social1 = joining_tables('data/list_words_txt/'+file_name1)
social2 = joining_tables('data/list_words_txt/'+file_name2)

#normalizing the data
social_norm1, social_norm2 = norm_diff_tables(social1, social2)

closer = dict_closer_art(social_norm1, social_norm2, num_closer=10)
save_obj(closer, 'data/dist/dist_social_with_social')
'''

"\n#Preparing the data\nfile_name1 = 'NGI_social_keywords'\n#file_name = 'Trending_keywords'\nfile_name2 = 'NGI_social_keywords'\n\n\n##-----------------------------------------------\n##ALL THE YEARS\n##Loading the data\n#tech = joining_tables(file_name1)\n#social = joining_tables(file_name2)\n\n\n#-----------------------------------------------\n#ALL THE YEARS\n#Loading the data\nsocial1 = joining_tables('data/list_words_txt/'+file_name1)\nsocial2 = joining_tables('data/list_words_txt/'+file_name2)\n\n#normalizing the data\nsocial_norm1, social_norm2 = norm_diff_tables(social1, social2)\n\ncloser = dict_closer_art(social_norm1, social_norm2, num_closer=10)\nsave_obj(closer, 'data/dist/dist_social_with_social')\n"

## Testing

In [21]:
'''
Options Availables:

#TOPIC: 
=0 : comparing tech with social
=1 : comparing social with tech
=2 : comparing social with social 

#ALL_YEARS:
=True : Do the trending analysis with all the years that are available (X 2017)
=False : Do the trending analysis since July 2015    
'''
TOPIC = 1

ALL_YEARS=False


#Position of the article to select
pos_article = 1

#umber of closer articles to print
num_closer = 5

#plot the articles normalized, or with the real number of pageviews
plot_normalized = True

df1_norm, df2_norm, df1, df2, cl = loading_data_distances(TOPIC=TOPIC, ALL_YEARS=ALL_YEARS)
plot_articles(loading_dataset_x_closer(num_closer, pos_article, plot_normalized))

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  


In [22]:
print_words(cl)

0. Word: "Network_transparency"
Related to: "IEEE_802.11","Speech_recognition","Software-defined_infrastructure"

1. Word: "Resilience_(network)"
Related to: "Speech_recognition","Linux_Foundation","IEEE_802.15"

2. Word: "Level_playing_field"
Related to: "IEEE_802.15","Speech_recognition","Internet_Research_Task_Force"

3. Word: "Hyperconnectivity"
Related to: "IEEE_802.15","Content_delivery_network","WiMAX"

4. Word: "Social_objects"
Related to: "IEEE_802.15","Content_delivery_network","Matroska"

5. Word: "Ubiquitous_computing"
Related to: "IEEE_802.11","Smart_grid","Speech_recognition"

6. Word: "Digital_Single_Market"
Related to: "Identity_management","Electronic_identification","Robust_Header_Compression"

7. Word: "Fake_news"
Related to: "CBOR","Robust_Header_Compression","Blockchain"

8. Word: "Robot_tax"
Related to: "CBOR","Blockchain","DOCSIS"

9. Word: "Global_education"
Related to: "Intelligent_environment","Data_Plane_Development_Kit","Cognitive_robotics"

