# From tables to graphs (draft version)

In [2]:
import pandas as pd
import numpy as np
import networkx as nx

# 1. Loading the data

In [4]:
comments = pd.read_csv('clean_data/comments_ml.csv',parse_dates=[-3]).drop('Unnamed: 0',axis=1)
votes = pd.read_csv('clean_data/votes_ml.csv',parse_dates=[-4]).drop('Unnamed: 0',axis=1)
inters = pd.read_csv('clean_data/interactions_ml.csv').drop('Unnamed: 0',axis=1)
target = pd.read_csv('clean_data/target_ml.csv').rename(columns={'votes_num':'target'})

# 2. Employee Features

## 2.1 Vote features

In [5]:

vote_feats = votes.groupby(['uid'])['vote'].agg({'votes_1': lambda x: len(x[x==1]),
                                                        'votes_2': lambda x: len(x[x==2]),
                                                        'votes_3': lambda x: len(x[x==3]),
                                                        'votes_4': lambda x: len(x[x==4]),
                                                        'votes_mean': lambda x: x.mean(),
                                                        'votes_std': lambda x: x.std(),
                                                       'votes_num': lambda x: len(x),
                                                       }).copy().fillna(0)

is deprecated and will be removed in a future version


## 2.2 Comments features

In [6]:
likes_feats = comments.groupby(['uid'])['likes'].agg({
                                          'likes_num': lambda x: len(x),
                                          'likes_mean': lambda x: x.mean(),
                                          'likes_std': lambda x: x.std(),
                                          'likes_sum': lambda x: x.sum(),
                                                       })
dislikes_feats = comments.groupby(['uid'])['dislikes'].agg({
                                          'dislikes_num': lambda x: len(x),
                                          'dislikes_mean': lambda x: x.mean(),
                                          'dislikes_std': lambda x: x.std(),
                                          'dislikes_sum': lambda x: x.sum(),
                                                       })
coms_feats = comments.dropna().groupby(['uid'])['comment'].agg({'com_num': lambda x: len(x),
                                                             'com_mean': lambda x: x.apply(len).mean(),
                                                             'com_std': lambda x: x.map(len).std(),
                                                             'com_sum': lambda x: x.map(len).sum()})

is deprecated and will be removed in a future version
is deprecated and will be removed in a future version
is deprecated and will be removed in a future version


## 2.3 Merge all the features

In [7]:
features = pd.merge(vote_feats,likes_feats,left_index=True,right_index=True,how='outer')
features = pd.merge(features,dislikes_feats,left_index=True,right_index=True,how='outer')
features = pd.merge(features,coms_feats,left_index=True,right_index=True,how='outer')
features = pd.merge(features,target,left_index=True,right_on='uid',how='right')#fill users with 0 comments
E_features = features.copy()
E_features.rename(columns=dict([(x,'E_'+str(x)) for x in E_features.columns]),inplace=True)

In [8]:
E_features.to_csv('features/e_feats.csv')

In [9]:
features.columns

Index(['votes_2', 'votes_num', 'votes_3', 'votes_1', 'votes_mean', 'votes_4',
       'votes_std', 'likes_num', 'likes_mean', 'likes_std', 'likes_sum',
       'dislikes_num', 'dislikes_std', 'dislikes_sum', 'dislikes_mean',
       'com_std', 'com_num', 'com_mean', 'com_sum', 'companyAlias', 'employee',
       'first_date', 'lastParticipationDate', 'stillExists', 'churn',
       'comp_short', 'uid'],
      dtype='object')

# Graph employee features

In [10]:
import networkx as nx
def link_all(users_index,comments,inter):
    """Returns a graph containing relationships of like/dislike among employees"""
    g = nx.DiGraph()
    #One node for every employee storing its employee and company ids
    for ix,x in users_index.iterrows():
        if not g.has_node(x['uid']):
            c,u = x['uid'].split('_')
            g.add_node(x['uid'],company=c,employee=u)
    #adding links to a graph based on likes/dislikes
    com_ids = comments['commentId'].unique()
    users = comments['uid'].unique().tolist()
    print("linking {} comments from {} users".format(len(com_ids),len(users)))
    for com in com_ids:
        #who frite current comment
        writer = comments[comments['commentId']==com]['uid'].values[0]
        #info about com
        df = inter[inter['commentId']==com]
        #people who disliked the comment
        haters = df[df['liked']==0]['uid'].values.copy().tolist()
        #people who like the comment
        likers = df[df['liked']==1]['uid'].values.copy().tolist()
        for i,u in enumerate(likers):
            if not g.has_edge(u,writer):
                g.add_edge(u,writer,int_sum=1,interactions=1,liked=1,disliked=0)
            else:
                g.edge[u][writer]['interactions'] += 1
                g.edge[u][writer]['int_sum'] += 1
                g.edge[u][writer]['liked'] += 1

        for i,u in enumerate(haters):
            if not g.has_edge(u,writer):
                g.add_edge(u,writer,int_sum=-1,interactions=1,liked=0,disliked=1)
            else:
                g.edge[u][writer]['interactions'] += 1
                g.edge[u][writer]['disliked'] += 1
                g.edge[u][writer]['int_sum'] -= 1
    return g     

def add_info_to_graph(features,G):
    """Adds the information contained in a DataFrame to a networkx graph"""
    for n in G.nodes_iter():
        fs = features.ix[n].to_dict()
        G.add_node(n,**fs)
    return G

def add_rel_likes_metrics(g):
    """Adds infomation to edges about the relative number of likes a given employee gave to another."""
    for src in g.nodes_iter():
        t_rel = 0
        neigh = nx.neighbors(g,src)
        f = 1 if len(neigh)==0 else len(neigh)
        for dst in neigh:
            rel = g.edge[src][dst]['liked']/g.edge[src][dst]['interactions']
            g.edge[src][dst]['rel_agree'] = rel
            t_rel += rel
        g.node[src]['mean_agree'] = t_rel/f

def calculate_metrics(g,metrics,weight=None):
    """Returns a DataFrame containing the metrics of a given graph.
    It will calculate the metrics both in the directed and  undirected version opf the graph"""
    graph_df = pd.DataFrame()
    nw =  '' if weight is None else weight+'_'
    for met in metrics:
        graph_df['G_'+nw+met.__name__] = pd.Series(met(g))
        graph_df['G_'+nw+'w_'+met.__name__] = pd.Series(met(g,weight=weight))
        graph_df['G_'+nw+met.__name__+'_u'] = pd.Series(met(g.to_undirected()))
        graph_df['G_'+nw+'w_'+met.__name__+'_u'] = pd.Series(met(g.to_undirected(),weight=weight))
    #Normalize metrics from 0 to 1
    graph_df = (graph_df-graph_df.min(axis=0))/(graph_df.max(axis=0)-graph_df.min(axis=0))
    return graph_df

from sklearn.decomposition import NMF
def get_clusters(g, weight=None,n_components=2,companyAlias=''):
    """Return a DataFrame containing the NMF clustering information of a given graph"""
    nw =  '' if weight is None else weight+'_'
    #Calculate on directed version of the graph
    model = NMF(n_components=n_components)
    X = nx.adjacency_matrix(g,weight=weight).todense()
    if (X < 0).any():
        X = np.abs(X)#NMF only accepts non-negative values
    communities = model.fit_transform(X)
    cd = pd.DataFrame(index=list(g.nodes_iter()),
                      columns=['G_'+nw+'NMF'+str(i)+'_d' for i in range(1,n_components+1)],
                      data=communities) 
    #Undirected version
    Xd = nx.adjacency_matrix(g.to_undirected(),weight=weight).todense()
    if (Xd < 0).any():
        Xd = np.abs(Xd)
    communities = model.fit_transform(Xd)
    clusters = cd.combine_first(pd.DataFrame(index=cd.index,
                                             columns=['G_'+nw+'NMF'+str(i)+'_u' for i in range(1,n_components+1)]
                                             ,data=communities))
    return clusters 

def calculate_graph_features(G,features):
    add_rel_likes_metrics(G)
    ng = add_info_to_graph(features=features,G=G.copy())
    
    met_funcs = [nx.degree, nx.betweenness_centrality]
    graph_met_rel = calculate_metrics(ng,met_funcs,weight='rel_agree')
    cluster_rel = get_clusters(ng,weight='rel_agree')
    agree_feats = pd.merge(graph_met_rel,cluster_rel,left_index=True,right_index=True,how='outer')
    graph_met_int = calculate_metrics(ng,met_funcs,weight='interactions')
    cluster_int = get_clusters(ng,weight='interactions')
    int_feats = pd.merge(graph_met_int,cluster_int,left_index=True,right_index=True,how='outer')
    graph_feats = pd.merge(int_feats,agree_feats,left_index=True,right_index=True,how='outer')
    #final_features = pd.merge(features,graph_feats,left_index=True,right_index=True,how='outer')
    ng = add_info_to_graph(features=graph_feats,G=ng.copy())
    
    return graph_feats,ng

In [9]:
G_raw = link_all(target,comments,inters)

linking 17919 comments from 900 users


In [16]:
EG_features,g_total = calculate_graph_features(G_raw.copy(),features.set_index('uid'))
EG_features.head()

Unnamed: 0,G_interactions_degree,G_interactions_w_degree,G_interactions_degree_u,G_interactions_w_degree_u,G_interactions_betweenness_centrality,G_interactions_w_betweenness_centrality,G_interactions_betweenness_centrality_u,G_interactions_w_betweenness_centrality_u,G_interactions_NMF1_d,G_interactions_NMF1_u,...,G_rel_agree_degree_u,G_rel_agree_w_degree_u,G_rel_agree_betweenness_centrality,G_rel_agree_w_betweenness_centrality,G_rel_agree_betweenness_centrality_u,G_rel_agree_w_betweenness_centrality_u,G_rel_agree_NMF1_d,G_rel_agree_NMF1_u,G_rel_agree_NMF2_d,G_rel_agree_NMF2_u
0_25,0.080952,0.052985,0.084615,0.066792,0.005056,0.010556,0.007113,0.006652,0.0,0.0,...,0.084615,0.098374,0.005056,1.389244e-07,0.007113,1.9729119999999997e-19,0.0,0.0,0.0,0.0
0_259,0.090476,0.05687,0.084615,0.06367,0.008131,0.031115,0.007113,0.001765,0.0,0.0,...,0.084615,0.085043,0.008131,3.989625e-07,0.007113,7.842933e-19,0.0,0.0,0.0,0.774894
0_271,0.02381,0.004945,0.023077,0.003745,0.0,0.002262,0.0,0.0,0.0,0.0,...,0.023077,0.021584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0_277,0.061905,0.012363,0.061538,0.016854,0.000742,0.02825,0.000343,0.010724,0.0,0.0,...,0.061538,0.074004,0.000742,7.124331e-09,0.000343,0.0,0.0,0.0,0.0,0.0
0_278,0.066667,0.02402,0.069231,0.026841,0.000816,0.002061,0.001114,0.001222,0.0,0.0,...,0.069231,0.080171,0.000816,0.0,0.001114,0.0,0.0,0.0,0.0,0.0


In [17]:
EG_features.to_csv('features/eg_feats.csv')

# 3. Company features

## 3.1 Comapany vote features

In [11]:
comp_vote_feats = votes.groupby(['comid'])['vote'].agg({'votes_1': lambda x: len(x[x==1]),
                                                        'votes_2': lambda x: len(x[x==2]),
                                                        'votes_3': lambda x: len(x[x==3]),
                                                        'votes_4': lambda x: len(x[x==4]),
                                                        'votes_mean': lambda x: x.mean(),
                                                        'votes_std': lambda x: x.std(),
                                                       'votes_num': lambda x: len(x),
                                                       }).copy()

is deprecated and will be removed in a future version


## 3.2 Comments features

In [12]:
comp_likes_feats = comments.groupby(['comid'])['likes'].agg({
                                          'likes_num': lambda x: len(x),
                                          'likes_mean': lambda x: x.mean(),
                                          'likes_std': lambda x: x.std(),
                                          'likes_sum': lambda x: x.sum(),
                                                       })
comp_dislikes_feats = comments.groupby(['comid'])['dislikes'].agg({
                                          'dislikes_num': lambda x: len(x),
                                          'dislikes_mean': lambda x: x.mean(),
                                          'dislikes_std': lambda x: x.std(),
                                          'dislikes_sum': lambda x: x.sum(),
                                                       })
comp_coms_feats = comments.dropna().groupby(['comid'])['comment'].agg({'com_num': lambda x: len(x),
                                                             'com_mean': lambda x: x.apply(len).mean(),
                                                             'com_std': lambda x: x.map(len).std(),
                                                             'com_sum': lambda x: x.map(len).sum()})

is deprecated and will be removed in a future version
is deprecated and will be removed in a future version
is deprecated and will be removed in a future version


## 3.3 Merging and aggregating company features

In [13]:
comp_features = pd.merge(comp_vote_feats,comp_likes_feats,left_index=True,right_index=True,how='outer')
comp_features = pd.merge(comp_features,comp_dislikes_feats,left_index=True,right_index=True,how='outer')
comp_features = pd.merge(comp_features,comp_coms_feats,left_index=True,right_index=True,how='outer')
#features = pd.merge(features,users_index.set_index('uid')[['churn']],left_index=True,right_index=True,how='left')
#features.set_index('uid'),'ui
comp_df = comp_features.fillna(1)
C_features_raw = comp_df.applymap(lambda x: 1 if x==0 else x)


## 3.4 Creating company wide features

In [14]:
C_features = pd.DataFrame(index=target.uid,columns=C_features_raw.columns)
for i in range(len(target.uid)):
    company = int(C_features.index.values[i].split('_')[0])
    C_features.iloc[i,:] = C_features_raw.ix[company].values

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix


In [15]:
C_features.to_csv('features/c_feats.csv')

# 4. Employee-Company features

In [16]:
features['comid'] = features['uid'].map(lambda x: x.split('_')[0] )
comments['comid'] = comments['uid'].map(lambda x: x.split('_')[0] )
inters['comid'] = inters['uid'].map(lambda x: x.split('_')[0] )
target['comid'] = target['uid'].map(lambda x: x.split('_')[0] )

In [17]:
rel_feats = features.set_index('uid')[C_features_raw.columns.values.tolist()+['comid']].copy()
companies = C_features_raw.index.values.tolist()
for comp in companies:
    rel_feats.loc[rel_feats['comid']==int(comp)] = rel_feats.loc[rel_feats['comid']==int(comp)]/comp_df.ix[comp]
#rel_feats.drop('comid',axis=1,inplace=True)

rel_feats.rename(columns=dict([(x,'CE_'+str(x)) for x in rel_feats.columns]),inplace=True)

CE_features = rel_feats.copy().fillna(0.)
CE_features.to_csv('features/ce_feats.csv')

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix


# 5. Employee-Company Graph features

In [18]:
def one_company_graph_data(comp,target,comments,inters,features):
    g_comps = {}
    feats = pd.DataFrame()
    comp = str(comp)
    ui = target[target['comid']==comp].copy()
    co = comments[comments['comid']==comp].copy()
    ints = inters[inters['comid']==comp].copy()
    g_c = link_all(ui,co,ints)
    if g_c.node!={}:
        g_comps[comp] = g_c
        _features,g_c = calculate_graph_features(g_c,features[features['CE_comid'].astype(int)==int(comp)].copy())
        feats = _features.copy()
    return feats,g_comps

rgdf,g_comps  = one_company_graph_data(companies[0],target,comments,inters,rel_feats)

for comp in companies[1:]:
    print("calculating comp : {}".format(comp))
    _rgdf,_g_comps  = one_company_graph_data(comp,target,comments,inters,rel_feats)
    g_comps.update(_g_comps)
    rgdf = pd.concat([rgdf,_rgdf])

linking 396 comments from 24 users


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix


calculating comp : 3
linking 80 comments from 4 users
calculating comp : 4
linking 3770 comments from 129 users
calculating comp : 5
linking 10 comments from 1 users
calculating comp : 6
linking 1862 comments from 109 users
calculating comp : 11
linking 3443 comments from 173 users
calculating comp : 12
linking 1124 comments from 43 users
calculating comp : 13
linking 553 comments from 27 users
calculating comp : 16
linking 1885 comments from 117 users
calculating comp : 17
linking 1010 comments from 55 users
calculating comp : 18
linking 1672 comments from 135 users
calculating comp : 19
linking 186 comments from 6 users
calculating comp : 20
linking 777 comments from 73 users
calculating comp : 21
linking 737 comments from 75 users
calculating comp : 22
linking 496 comments from 31 users
calculating comp : 23
linking 467 comments from 27 users
calculating comp : 24
linking 655 comments from 56 users
calculating comp : 25
linking 37 comments from 3 users
calculating comp : 26
linking 

In [31]:
CEG_features = rgdf.rename(columns=dict([(x,'CE'+str(x)) for x in rgdf.columns])).copy()
CEG_features.to_csv('features/ceg_feats.csv')

In [22]:
example_g = g_comps['12'].copy()

In [29]:
e_cols = ['E_likes_num','E_dislikes_sum','E_votes_mean','E_churn','E_likes_std']
for ix,row in E_features.iterrows():
    uid = row['E_uid']
    for col in e_cols:
        if uid in example_g.node.keys():
            example_g.node[uid][col] = row[col]

In [38]:
e_cols = ['G_interactions_NMF1_d','G_interactions_NMF2_d','G_interactions_w_betweenness_centrality',
          'G_rel_agree_NMF1_d','G_rel_agree_NMF2_d']
for ix,row in rgdf.iterrows():
    uid = ix
    for col in e_cols:
        if uid in example_g.node.keys():
            example_g.node[uid][col] = row[col]

In [39]:
example_g.node

{'12_1': {'E_churn': 0,
  'E_dislikes_sum': 59.0,
  'E_likes_num': 40.0,
  'E_likes_std': 3.4898203612573178,
  'E_votes_mean': 2.9019607843137254,
  'G_interactions_NMF1_d': 1.8875773547230237,
  'G_interactions_NMF2_d': 0.79073150266341763,
  'G_interactions_w_betweenness_centrality': 0.16912605107695311,
  'G_rel_agree_NMF1_d': 0.72659358943394292,
  'G_rel_agree_NMF2_d': 0.42776669275017121,
  'company': '12',
  'employee': '1',
  'mean_agree': 0.8112778343660697},
 '12_101': {'E_churn': 0,
  'E_dislikes_sum': 1.0,
  'E_likes_num': 2.0,
  'E_likes_std': 0.7071067811865476,
  'E_votes_mean': 3.6785714285714284,
  'G_interactions_NMF1_d': 0.35227397886313921,
  'G_interactions_NMF2_d': 0.065812584439866484,
  'G_interactions_w_betweenness_centrality': 0.088096511219743295,
  'G_rel_agree_NMF1_d': 0.62934202595582145,
  'G_rel_agree_NMF2_d': 0.0,
  'company': '12',
  'employee': '101',
  'mean_agree': 0.8111111111111111},
 '12_103': {'E_churn': 0,
  'E_dislikes_sum': 15.0,
  'E_likes_

In [41]:
import pickle
with open('G_comp.pck','wb') as f:
    pickle.dump(example_g,f)