In [1]:
import numpy as np
import pandas as pd
import networkx as nx
import pickle

In [2]:
def load_data(comments_path='data/anon_comments.csv',
              inters_path='data/commentInteractions.csv',
              lastpart_path='data/lastParticipationExists.csv',
              votes_path='data/votes.csv'):
    """Read the four raw CSV exports and return them as DataFrames.

    Parameters
    ----------
    comments_path, inters_path, lastpart_path, votes_path : str
        Locations of the comments, comment-interactions,
        last-participation and votes CSV files.

    Returns
    -------
    tuple
        ``(comments, inters, lastpart, votes)``.
    """
    # Comments: last column parsed as a date, the 'Unnamed: 0' column removed.
    raw_comments = pd.read_csv(comments_path, parse_dates=[-1])
    comments = raw_comments.drop('Unnamed: 0', axis=1)

    # Interactions: rows with any missing value and exact duplicates are discarded.
    raw_inters = pd.read_csv(inters_path)
    inters = raw_inters.dropna(how='any').drop_duplicates()

    # Last participation / votes: date column picked by position, duplicates removed.
    lastpart = pd.read_csv(lastpart_path, parse_dates=[3]).drop_duplicates()
    votes = pd.read_csv(votes_path, parse_dates=[2]).drop_duplicates()

    return comments, inters, lastpart, votes

In [3]:
def create_users_index(prediction_date,data_collection_ended,comments,votes,lastpart):
    """Build the index of employees that are active around ``prediction_date``.

    An employee (``companyAlias``, ``employee``) is kept when their earliest
    recorded activity (comment, vote or last-participation date) is on or
    before ``prediction_date`` and their ``lastParticipationDate`` is strictly
    after it.

    Parameters
    ----------
    prediction_date : str or datetime-like
        Cut-off date, converted with ``pd.to_datetime``.
    data_collection_ended : str or datetime-like
        Accepted for signature compatibility; not used by this function.
    comments, votes, lastpart : pandas.DataFrame
        Must contain ``companyAlias`` and ``employee`` plus, respectively,
        ``commentDate``, ``voteDate`` and
        ``lastParticipationDate``/``stillExists``.

    Returns
    -------
    pandas.DataFrame
        Indexed by (``companyAlias``, ``employee``) with columns
        ``first_date``, ``lastParticipationDate`` and ``stillExists``.
    """
    keys = ['companyAlias','employee']
    cutoff = pd.to_datetime(prediction_date)

    # earliest date per employee in each of the three sources
    min_date_comment = comments.groupby(keys)['commentDate'].min().to_frame()
    min_date_votes = votes.groupby(keys)['voteDate'].min().to_frame()
    min_date_part = lastpart.groupby(keys)['lastParticipationDate'].min().to_frame()

    # combine them; an employee missing from a source gets NaT in that column
    min_dates = pd.merge(min_date_comment,min_date_votes,right_index=True,left_index=True,how='outer')
    min_dates = pd.merge(min_dates,min_date_part,right_index=True,left_index=True,how='outer')

    # propagate known dates across each row so the row-wise minimum never sees NaT
    # (.ffill/.bfill replace the deprecated fillna(method=...))
    first_dates = min_dates.ffill(axis=1).bfill(axis=1).min(axis=1).to_frame('first_date')

    last_dates = lastpart.set_index(keys)[['lastParticipationDate','stillExists']].copy()
    dates = pd.merge(first_dates,last_dates,right_index=True,left_index=True,how='outer')

    # keep only users whose activity straddles the prediction date
    c_first = dates['first_date']<=cutoff
    c_last = dates['lastParticipationDate']>cutoff
    users_index = dates[c_first&c_last].copy()
    return users_index

In [4]:

    
def add_UIDs(users_index,votes,inters,comments,lastpart):
    """Attach ``uid`` and ``comid`` columns to every frame, in place.

    Each company alias found in the first level of ``users_index``'s index is
    mapped to an integer position; ``uid`` is ``"<company position>_<employee>"``
    and ``comid`` is the part of ``uid`` before the first underscore. An alias
    that is absent from that mapping raises ``KeyError``.

    Returns
    -------
    tuple
        ``(users_index, votes, inters, comments, lastpart, inv_comp)`` where
        ``inv_comp`` maps the integer position back to the company alias.
        The five frames are the same objects that were passed in.
    """
    company_names = np.unique(users_index.index.levels[0].values).tolist()
    alias_comp = {name: position for position, name in enumerate(company_names)}

    def make_uid(frame):
        # reset_index exposes companyAlias/employee as columns when they live in the index
        flat = frame.reset_index().copy()
        company_part = flat['companyAlias'].map(lambda alias: str(alias_comp[alias])).values
        employee_part = flat['employee'].map(str).values
        return company_part+'_'+employee_part

    # same order as the frames were originally tagged
    for frame in (inters, votes, comments, lastpart, users_index):
        frame['uid'] = make_uid(frame)
        frame['comid'] = frame['uid'].map(lambda uid: uid.split('_')[0]).values

    inv_comp = {position: name for name, position in alias_comp.items()}
    return users_index, votes, inters, comments, lastpart, inv_comp
    

    

In [5]:
def filter_by_uids(votes,comments,lastpart,inters, valid_employees):
    """Keep only the rows whose ``uid`` belongs to ``valid_employees``.

    Parameters
    ----------
    votes, comments, lastpart, inters : pandas.DataFrame
        Frames carrying a ``uid`` column.
    valid_employees : iterable
        Accepted uids (list, array, ...).

    Returns
    -------
    tuple
        ``(votes_clean, coms_clean, lastp_clean, inters_clean)`` - independent
        copies; note the order differs from the argument order.
    """
    accepted = list(valid_employees)

    def _keep_valid(frame):
        # Vectorised membership test. Unlike a row-wise apply it also returns a
        # proper (empty) boolean mask when the frame has no rows.
        return frame[frame['uid'].isin(accepted)].copy()

    votes_clean = _keep_valid(votes)
    coms_clean = _keep_valid(comments)
    lastp_clean = _keep_valid(lastpart)
    inters_clean = _keep_valid(inters)
    return votes_clean, coms_clean, lastp_clean, inters_clean

In [6]:
def filter_interactions(votes,
                        comments,
                        inters,
                        prediction_date,
                        data_collection_ended,
                        min_votes_before=5,
                        min_votes_after=2,
                        min_reactions=5):
    """Return the uids of employees with enough activity to be modelled.

    An employee qualifies when they

    * cast at least ``min_votes_before`` votes on or before ``prediction_date``,
    * cast at least ``min_votes_after`` votes after ``prediction_date`` and on or
      before ``data_collection_ended``, and
    * gave at least ``min_reactions`` likes plus dislikes in ``inters``
      (counted over the whole frame, not restricted by date).

    Parameters
    ----------
    votes : pandas.DataFrame
        Needs ``uid``, ``vote`` and ``voteDate``.
    comments : pandas.DataFrame
        Accepted for signature compatibility; not used.
    inters : pandas.DataFrame
        Needs ``uid``, ``liked`` and ``disliked``.
    prediction_date, data_collection_ended : str or datetime-like
    min_votes_before, min_votes_after, min_reactions : int
        Activity thresholds (defaults 5, 2 and 5).

    Returns
    -------
    list
        Qualifying uids, in the sorted order produced by the groupby.
    """
    cutoff = pd.to_datetime(prediction_date)
    window_end = pd.to_datetime(data_collection_ended)

    on_or_before_cutoff = votes['voteDate']<=cutoff
    in_target_window = (votes['voteDate']>cutoff)&(votes['voteDate']<=window_end)

    # .size() counts rows per uid; it replaces agg({name: func}), which raises
    # SpecificationError ("nested renamer") on pandas >= 1.0
    votes_before = votes[on_or_before_cutoff].groupby('uid')['vote'].size()
    votes_after = votes[in_target_window].groupby('uid')['vote'].size()

    # total likes + dislikes given by each employee
    num_likes = inters.groupby('uid')['liked'].sum().astype(int)
    num_dislikes = inters.groupby('uid')['disliked'].sum().astype(int)
    reactions = num_likes+num_dislikes

    # sets give O(1) membership for the final intersection
    enough_after = set(votes_after[votes_after>=min_votes_after].index)
    enough_reactions = set(reactions[reactions>=min_reactions].index)

    valid_employees = [uid for uid in votes_before[votes_before>=min_votes_before].index
                       if uid in enough_after and uid in enough_reactions]
    return valid_employees
    

In [7]:
def filter_training_set(votes_ml, comments_ml, inters_ml, prediction_date, data_collection_ended):
    """Split the cleaned data into training inputs and the binary target.

    Training inputs are the votes and comments dated on or before
    ``prediction_date`` and the interactions that refer to those comments.
    The target is computed from the votes cast after ``prediction_date`` and on
    or before ``data_collection_ended``: 1 when an employee cast more than two
    votes in that window, 0 otherwise. Employees without any vote in the window
    do not appear in the target.

    Returns
    -------
    tuple
        ``(votes_train, comments_ml, inters_ml, target_happy)`` where
        ``target_happy`` is a one-column DataFrame (``votes_num``) indexed by
        ``uid``.
    """
    cutoff = pd.to_datetime(prediction_date)
    window_end = pd.to_datetime(data_collection_ended)

    # votes training set
    votes_train = votes_ml[votes_ml['voteDate']<=cutoff]
    # comments training set
    comments_ml = comments_ml[comments_ml.commentDate<=cutoff].copy()
    valid_comments = comments_ml.commentId.unique()
    # only interactions belonging to comments posted during the observation period
    inters_ml = inters_ml[inters_ml['commentId'].isin(valid_comments)].copy()

    in_target_window = (votes_ml['voteDate']>cutoff)&(votes_ml['voteDate']<=window_end)
    # .size() replaces agg({name: func}), which newer pandas rejects
    votes_in_window = votes_ml[in_target_window].groupby('uid')['vote'].size()

    # 0 for at most two votes in the window, 1 otherwise
    target_happy = (votes_in_window>2).astype(int).to_frame('votes_num')
    return votes_train, comments_ml, inters_ml, target_happy

In [8]:
def extract_dataset(comments_path='data/anon_comments.csv',
                    inters_path='data/commentInteractions.csv',
                    lastpart_path='data/lastParticipationExists.csv',
                    votes_path='data/votes.csv',
                    prediction_date='6 Feb 2017',
                    data_collection_ended='19 Feb 2017',
                    suffix='',
                    folder='clean_data' ):
    """Run the full cleaning pipeline and save the resulting CSV files.

    Steps: load the raw CSVs, keep employees active around ``prediction_date``,
    tag every frame with ``uid``/``comid``, keep employees with enough votes and
    reactions, then split into training inputs and target.

    Four files are written into ``folder`` (created if missing):
    ``votes_ml_<suffix>.csv``, ``comments_ml_<suffix>.csv``,
    ``interactions_ml_<suffix>.csv`` and ``target_ml_<suffix>.csv``.

    Returns
    -------
    tuple
        ``(votes_train, comments_train, inters_train, target_happy)``.
    """
    import os  # local import: only needed to make sure the output folder exists

    print('Loading data...')
    comments,inters,lastpart,votes = load_data(comments_path=comments_path,
                                               inters_path=inters_path,
                                               lastpart_path=lastpart_path,
                                               votes_path=votes_path)
    print("Filtering by valid UID")
    users_index = create_users_index(prediction_date,
                                     data_collection_ended,
                                     comments,
                                     votes,
                                     lastpart)
    users_index, votes, inters, comments, lastpart, inv_comp = add_UIDs(users_index,
                                                                        votes,
                                                                        inters,
                                                                        comments,
                                                                        lastpart)

    # first pass: employees that are active around the prediction date
    valid_employees = users_index.uid.unique()
    votes, comments, lastpart, inters = filter_by_uids(votes,
                                                       comments,
                                                       lastpart,
                                                       inters,
                                                       valid_employees)
    print("Filtering interactions...")
    # second pass: employees with enough votes and likes/dislikes
    valid_employees = filter_interactions(votes,
                                          comments,
                                          inters,
                                          prediction_date,
                                          data_collection_ended)
    votes_ml, comments_ml, lastpart_ml, inters_ml = filter_by_uids(votes,
                                                                   comments,
                                                                   lastpart,
                                                                   inters,
                                                                   valid_employees)
    print("Preparing traing and test sets")
    votes_train, comments_train, inters_train, target_happy = filter_training_set(votes_ml,
                                                                                  comments_ml,
                                                                                  inters_ml,
                                                                                  prediction_date,
                                                                                  data_collection_ended)
    print("Saving data...")
    # to_csv does not create missing directories, so make sure the target exists
    if folder and not os.path.isdir(folder):
        os.makedirs(folder)

    votes_train.to_csv(folder+'/votes_ml_'+suffix+'.csv')
    comments_train.to_csv(folder+'/comments_ml_'+suffix+'.csv')
    inters_train.to_csv(folder+'/interactions_ml_'+suffix+'.csv')
    target_happy.to_csv(folder+'/target_ml_'+suffix+'.csv')

    print("Data is cleaned")
    return votes_train, comments_train, inters_train, target_happy

In [12]:
# Build the cleaned data set for the 9 Jan 2017 prediction date (target window
# ends 22 Jan 2017); the CSV files are saved with the 'jan17' suffix.
extract_dataset(prediction_date='9 Jan 2017',
                data_collection_ended='22 Jan 2017',suffix='jan17')

Loading data...


  exec(code_obj, self.user_global_ns, self.user_ns)


Filtering by valid UID
Filtering interactions...
Preparing traing and test sets
Saving data...
Data is cleaned


In [9]:
def employee_features(votes,comments,target):
    """Compute per-employee vote, like/dislike and comment-length statistics.

    Parameters
    ----------
    votes : pandas.DataFrame
        Needs ``uid`` and ``vote``.
    comments : pandas.DataFrame
        Needs ``uid``, ``likes``, ``dislikes`` and ``comment``. Comment-length
        statistics use only rows without any missing value (``dropna()``).
    target : pandas.DataFrame
        Needs a ``uid`` column; every row of ``target`` is kept in the result.

    Returns
    -------
    pandas.DataFrame
        One row per ``target`` row with the columns ``votes_1`` .. ``votes_4``
        (number of votes equal to 1..4), ``votes_mean``, ``votes_std``,
        ``votes_num``, ``likes_*``, ``dislikes_*`` and ``com_*``
        (``num``/``mean``/``std``/``sum``), followed by the columns of
        ``target``. Column names are not prefixed. Missing vote statistics are
        filled with 0; the other statistics stay NaN for employees without
        comments.
    """
    def _summary(grouped, prefix):
        # agg with a list of names is supported by every pandas version, unlike
        # the dict-renaming form, which raises SpecificationError on pandas >= 1.0
        stats = grouped.agg(['size','mean','std','sum'])
        stats.columns = [prefix+'_num',prefix+'_mean',prefix+'_std',prefix+'_sum']
        return stats

    vote_feats = votes.groupby('uid')['vote'].agg(['mean','std','size'])
    vote_feats.columns = ['votes_mean','votes_std','votes_num']
    for value in (1,2,3,4):
        # number of votes equal to `value` per employee
        vote_feats['votes_%d' % value] = (votes['vote']==value).astype(int).groupby(votes['uid']).sum()
    vote_feats = vote_feats[['votes_1','votes_2','votes_3','votes_4',
                             'votes_mean','votes_std','votes_num']].fillna(0)

    likes_feats = _summary(comments.groupby('uid')['likes'],'likes')
    dislikes_feats = _summary(comments.groupby('uid')['dislikes'],'dislikes')

    complete = comments.dropna()
    comment_length = complete['comment'].map(len).astype('int64')
    coms_feats = _summary(comment_length.groupby(complete['uid']),'com')

    features = pd.merge(vote_feats,likes_feats,left_index=True,right_index=True,how='outer')
    features = pd.merge(features,dislikes_feats,left_index=True,right_index=True,how='outer')
    features = pd.merge(features,coms_feats,left_index=True,right_index=True,how='outer')
    # right join: employees of `target` without any feature row are kept (as NaN)
    features = pd.merge(features,target,left_index=True,right_on='uid',how='right')
    return features

In [10]:
def link_all(users_index,comments,inter):
    """Returns a graph containing relationships of like/dislike among employees.

    Nodes are the ``uid`` values of ``users_index`` (with ``company`` and
    ``employee`` attributes taken from the two parts of the uid). For every
    comment an edge ``reactor -> writer`` is created or updated with the
    attributes ``interactions`` (number of reactions), ``liked``, ``disliked``
    and ``int_sum`` (likes minus dislikes). A reaction counts as a like when
    ``liked == 1`` and as a dislike when ``liked == 0``.

    Parameters
    ----------
    users_index : pandas.DataFrame
        Needs a ``uid`` column of the form ``"<company>_<employee>"``.
    comments : pandas.DataFrame
        Needs ``commentId`` and ``uid`` (the writer).
    inter : pandas.DataFrame
        Needs ``commentId``, ``uid`` (the reactor) and ``liked``.

    Returns
    -------
    networkx.DiGraph
    """
    g = nx.DiGraph()
    #One node for every employee storing its employee and company ids
    for uid in users_index['uid']:
        if not g.has_node(uid):
            c,u = uid.split('_')
            g.add_node(uid,company=c,employee=u)

    com_ids = comments['commentId'].unique()
    users = comments['uid'].unique().tolist()
    print("linking {} comments from {} users".format(len(com_ids),len(users)))

    # writer of each comment = uid on the first row carrying that commentId
    first_rows = comments.drop_duplicates('commentId')
    writer_of = dict(zip(first_rows['commentId'],first_rows['uid']))

    # one pass over the interactions instead of re-filtering the frame per comment
    likers_of, haters_of = {}, {}
    for com, reactor, liked in zip(inter['commentId'],inter['uid'],inter['liked']):
        if liked==1:
            likers_of.setdefault(com,[]).append(reactor)
        elif liked==0:
            haters_of.setdefault(com,[]).append(reactor)

    def _record(src,dst,is_like):
        if not g.has_edge(src,dst):
            g.add_edge(src,dst,
                       int_sum=1 if is_like else -1,
                       interactions=1,
                       liked=1 if is_like else 0,
                       disliked=0 if is_like else 1)
            return
        # g[src][dst] is the edge attribute dict on both networkx 1.x and 2.x+
        # (g.edge only exists on 1.x)
        data = g[src][dst]
        data['interactions'] += 1
        if is_like:
            data['liked'] += 1
            data['int_sum'] += 1
        else:
            data['disliked'] += 1
            data['int_sum'] -= 1

    #adding links to a graph based on likes/dislikes, likes first for each comment
    for com in com_ids:
        writer = writer_of[com]
        for reactor in likers_of.get(com,()):
            _record(reactor,writer,True)
        for reactor in haters_of.get(com,()):
            _record(reactor,writer,False)
    return g

def add_info_to_graph(features,G):
    """Adds the information contained in a DataFrame to a networkx graph.

    For every node ``n`` of ``G`` the row ``features.loc[n]`` is converted to a
    dict and stored as node attributes. A node that is missing from
    ``features``' index raises ``KeyError``.

    Returns the same graph object ``G`` (modified in place).
    """
    # list(G.nodes()) works on networkx 1.x and 2.x+ (nodes_iter is 1.x only) and
    # gives a snapshot that is safe to iterate while the nodes are being updated
    for n in list(G.nodes()):
        # .loc is the label-based replacement for the removed .ix indexer
        fs = features.loc[n].to_dict()
        G.add_node(n,**fs)
    return G

def add_rel_likes_metrics(g):
    """Adds infomation to edges about the relative number of likes a given employee gave to another.

    Every edge ``src -> dst`` gets ``rel_agree = liked / interactions``; every
    node gets ``mean_agree``, the mean ``rel_agree`` over its outgoing edges
    (0 when it has none). The graph is modified in place; nothing is returned.
    """
    for src in list(g.nodes()):
        # materialise the neighbours: on networkx 2.x+ this is an iterator without len()
        neigh = list(g.neighbors(src))
        total = 0.0
        for dst in neigh:
            data = g[src][dst]  # edge attribute dict on every networkx version
            # float() keeps this a true division on Python 2 as well
            rel = float(data['liked'])/data['interactions']
            data['rel_agree'] = rel
            total += rel
        # add_node on an existing node just updates its attributes
        g.add_node(src,mean_agree=total/max(len(neigh),1))

def calculate_metrics(g,metrics,weight=None):
    """Returns a DataFrame containing the metrics of a given graph.

    Each metric is evaluated four times: unweighted and weighted, on ``g`` and
    on its undirected version. Columns are named ``G_<weight>_<metric>``,
    ``G_<weight>_w_<metric>`` and the same with a ``_u`` suffix for the
    undirected graph (the ``<weight>_`` part is omitted when ``weight`` is
    None). Every column is min-max scaled to [0, 1]; a column whose values are
    all equal becomes NaN because its range is zero.

    Parameters
    ----------
    g : graph
        Must provide ``to_undirected()``.
    metrics : iterable of callables
        Called as ``met(graph)`` and ``met(graph, weight=weight)``; each must
        return a mapping or an iterable of ``(node, value)`` pairs.
    weight : str or None
        Edge attribute used for the weighted variants.
    """
    graph_df = pd.DataFrame()
    nw =  '' if weight is None else weight+'_'
    undirected = g.to_undirected()  # built once instead of twice per metric
    for met in metrics:
        name = met.__name__
        # dict(...) accepts plain dicts as well as (node, value) views such as the
        # DegreeView that networkx 2.x+ returns from nx.degree
        graph_df['G_'+nw+name] = pd.Series(dict(met(g)))
        graph_df['G_'+nw+'w_'+name] = pd.Series(dict(met(g,weight=weight)))
        graph_df['G_'+nw+name+'_u'] = pd.Series(dict(met(undirected)))
        graph_df['G_'+nw+'w_'+name+'_u'] = pd.Series(dict(met(undirected,weight=weight)))
    #Normalize metrics from 0 to 1
    lowest = graph_df.min(axis=0)
    graph_df = (graph_df-lowest)/(graph_df.max(axis=0)-lowest)
    return graph_df

from sklearn.decomposition import NMF
def get_clusters(g, weight=None,n_components=2,companyAlias=''):
    """Return a DataFrame containing the NMF clustering information of a given graph.

    NMF is fitted on the absolute value of the adjacency matrix of ``g`` and of
    its undirected version. The result has one row per node and the columns
    ``G_<weight>_NMF<i>_d`` (directed) and ``G_<weight>_NMF<i>_u`` (undirected)
    for ``i = 1..n_components``.

    Parameters
    ----------
    g : networkx graph
    weight : str or None
        Edge attribute used as matrix entry.
    n_components : int
        Number of NMF components.
    companyAlias : str
        Accepted for signature compatibility; not used.
    """
    nw =  '' if weight is None else weight+'_'
    nodes = list(g.nodes())  # works on networkx 1.x and 2.x+
    model = NMF(n_components=n_components)

    def _memberships(graph,suffix):
        # nodelist pins the row order so both matrices line up with `nodes`;
        # toarray() yields a plain ndarray (todense() gives np.matrix, which
        # recent scikit-learn rejects)
        X = nx.adjacency_matrix(graph,nodelist=nodes,weight=weight).toarray()
        X = np.abs(X)#NMF only accepts non-negative values
        return pd.DataFrame(index=nodes,
                            columns=['G_'+nw+'NMF'+str(i)+suffix for i in range(1,n_components+1)],
                            data=model.fit_transform(X))

    #Calculate on directed version of the graph
    cd = _memberships(g,'_d')
    #Undirected version
    cu = _memberships(g.to_undirected(),'_u')
    clusters = cd.combine_first(cu)
    return clusters

def calculate_graph_features(G,features):
    """Compute graph metrics and NMF memberships for two edge weightings.

    ``G`` first receives the relative-agreement attributes in place; a copy of
    it is then annotated with ``features``. Metrics (degree and betweenness
    centrality) and clusters are computed for the ``rel_agree`` weight and
    then for the ``interactions`` weight.

    Returns
    -------
    tuple
        ``(graph_feats, ng)``: the merged feature frame (``interactions``
        columns first) and a graph copy whose nodes also carry those features.
    """
    add_rel_likes_metrics(G)
    ng = add_info_to_graph(features=features,G=G.copy())
    met_funcs = [nx.degree, nx.betweenness_centrality]

    def _features_for(weight_name):
        # metrics and cluster memberships for one edge attribute, joined on node id
        metric_part = calculate_metrics(ng,met_funcs,weight=weight_name)
        cluster_part = get_clusters(ng,weight=weight_name)
        return pd.merge(metric_part,cluster_part,left_index=True,right_index=True,how='outer')

    # same evaluation order as before: agreement first, interaction counts second
    agree_feats = _features_for('rel_agree')
    int_feats = _features_for('interactions')

    graph_feats = pd.merge(int_feats,agree_feats,left_index=True,right_index=True,how='outer')
    annotated = add_info_to_graph(features=graph_feats,G=ng.copy())
    return graph_feats,annotated

In [11]:
def emp_graph_features(target, comments, inters, e_features):
    """Build the like/dislike graph over all employees and compute its features.

    ``e_features`` must have a ``uid`` column; it is used as the index of the
    node attributes. Returns ``(EG_features, g_total)`` as produced by
    ``calculate_graph_features``.
    """
    full_graph = link_all(target,comments,inters)
    features_by_uid = e_features.set_index('uid')
    EG_features,g_total = calculate_graph_features(full_graph.copy(),features_by_uid)
    return EG_features,g_total

In [12]:
def company_features(votes, comments,target):
    """Compute per-company statistics and broadcast them to every employee.

    The statistics mirror the per-employee ones (``votes_1`` .. ``votes_4``,
    ``votes_mean``/``std``/``num``, ``likes_*``, ``dislikes_*``, ``com_*``) but
    are grouped by ``comid``. Missing values are replaced by 1 and zeros are
    replaced by 1 as well.

    Parameters
    ----------
    votes : pandas.DataFrame
        Needs ``comid`` and ``vote``.
    comments : pandas.DataFrame
        Needs ``comid``, ``likes``, ``dislikes`` and ``comment``.
    target : pandas.DataFrame
        Needs ``uid`` of the form ``"<comid>_<employee>"``.

    Returns
    -------
    tuple
        ``(C_features, C_features_raw)``: the statistics repeated for every uid
        of ``target`` (index named ``uid``) and the per-company table itself.
        A uid whose company has no statistics raises ``KeyError``.
    """
    def _summary(grouped, prefix):
        # a list of function names avoids the dict-renaming agg that pandas >= 1.0 rejects
        stats = grouped.agg(['size','mean','std','sum'])
        stats.columns = [prefix+'_num',prefix+'_mean',prefix+'_std',prefix+'_sum']
        return stats

    comp_vote_feats = votes.groupby('comid')['vote'].agg(['mean','std','size'])
    comp_vote_feats.columns = ['votes_mean','votes_std','votes_num']
    for value in (1,2,3,4):
        comp_vote_feats['votes_%d' % value] = (votes['vote']==value).astype(int).groupby(votes['comid']).sum()
    comp_vote_feats = comp_vote_feats[['votes_1','votes_2','votes_3','votes_4',
                                       'votes_mean','votes_std','votes_num']]

    comp_likes_feats = _summary(comments.groupby('comid')['likes'],'likes')
    comp_dislikes_feats = _summary(comments.groupby('comid')['dislikes'],'dislikes')

    complete = comments.dropna()
    comment_length = complete['comment'].map(len).astype('int64')
    comp_coms_feats = _summary(comment_length.groupby(complete['comid']),'com')

    comp_features = pd.merge(comp_vote_feats,comp_likes_feats,left_index=True,right_index=True,how='outer')
    comp_features = pd.merge(comp_features,comp_dislikes_feats,left_index=True,right_index=True,how='outer')
    comp_features = pd.merge(comp_features,comp_coms_feats,left_index=True,right_index=True,how='outer')

    comp_df = comp_features.fillna(1)
    # zeros become 1 as well
    C_features_raw = comp_df.mask(comp_df==0,1)

    # Look companies up through a normalised string key so the broadcast works
    # whether comid is stored as int (read back from CSV) or as str.
    keyed = C_features_raw.copy()
    keyed.index = [str(int(k)) for k in keyed.index]
    wanted = [str(int(uid.split('_')[0])) for uid in target['uid']]
    C_features = keyed.loc[wanted].copy()
    C_features.index = pd.Index(target['uid'].values,name='uid')
    return C_features, C_features_raw
    
def calculate_ce_features(C_features_raw, comments, inters, target, features):
    """Express every employee feature relative to the employee's company.

    Each feature named in ``C_features_raw.columns`` is divided by the value of
    the same feature for the employee's company. Employees whose company is not
    in ``C_features_raw`` are left undivided. Missing results are filled with
    0 and every column, including ``comid``, gets a ``CE_`` prefix.

    Side effect: a string ``comid`` column (the part of ``uid`` before the
    first underscore) is written into ``comments``, ``inters`` and ``target``.

    Parameters
    ----------
    C_features_raw : pandas.DataFrame
        Per-company statistics indexed by company id.
    comments, inters, target : pandas.DataFrame
        Need a ``uid`` column; modified in place as described above.
    features : pandas.DataFrame
        Per-employee features with ``uid``, ``comid`` and the columns of
        ``C_features_raw``.

    Returns
    -------
    tuple
        ``(CE_features, companies)``: the relative features indexed by ``uid``
        and the list of company ids from ``C_features_raw``.
    """
    comments['comid'] = comments['uid'].map(lambda x: x.split('_')[0] )
    inters['comid'] = inters['uid'].map(lambda x: x.split('_')[0] )
    target['comid'] = target['uid'].map(lambda x: x.split('_')[0] )

    feature_cols = C_features_raw.columns.values.tolist()
    by_uid = features.set_index('uid')
    # float dtype so the ratios can be written back without dtype clashes
    rel_feats = by_uid[feature_cols].astype(float)
    rel_feats['comid'] = by_uid['comid']

    companies = C_features_raw.index.values.tolist()
    # comid is a string in `features` while the company index may hold ints;
    # comparing the two directly never matches, so both sides are normalised
    company_key = rel_feats['comid'].map(lambda v: str(int(v)))
    for comp in companies:
        rows = (company_key==str(int(comp))).values
        if not rows.any():
            continue
        company_row = C_features_raw.loc[comp,feature_cols].astype(float)
        # divide the feature columns only, so comid survives the normalisation
        scaled = rel_feats.loc[rows,feature_cols]/company_row
        rel_feats.loc[rows,feature_cols] = scaled.values

    rel_feats = rel_feats.rename(columns=dict([(x,'CE_'+str(x)) for x in rel_feats.columns]))

    CE_features = rel_feats.fillna(0.)
    return CE_features,companies

In [13]:
def one_company_graph_data(comp,target,comments,inters,features):
    """Build the like/dislike graph of a single company and compute its features.

    Rows of ``target``, ``comments`` and ``inters`` are selected by comparing
    their ``comid`` column with ``str(comp)``; rows of ``features`` are
    selected through ``CE_comid`` converted to int.

    Returns
    -------
    tuple
        ``(feats, g_comps)``: the graph feature frame (empty when the company
        graph has no nodes) and a dict ``{str(comp): graph}`` holding the
        company graph as it was before the feature frame was attached (empty
        when there are no nodes).
    """
    g_comps = {}
    feats = pd.DataFrame()
    comp = str(comp)
    ui = target[target['comid']==comp].copy()
    co = comments[comments['comid']==comp].copy()
    ints = inters[inters['comid']==comp].copy()
    g_c = link_all(ui,co,ints)
    # number_of_nodes() exists on every networkx version (g.node was removed in 2.4)
    if g_c.number_of_nodes()>0:
        g_comps[comp] = g_c
        company_rows = features[features['CE_comid'].astype(int)==int(comp)].copy()
        _features,g_c = calculate_graph_features(g_c,company_rows)
        feats = _features.copy()
    return feats,g_comps

def ce_graph_features(companies,target,comments,inters,ce_feats):
    """Compute graph features company by company and stack them.

    Every company in ``companies`` is processed with
    ``one_company_graph_data``; the resulting frames are concatenated and each
    column name gets a ``CE`` prefix. An empty ``companies`` list yields an
    empty DataFrame. A progress line is printed for every company.
    """
    per_company = []
    for comp in companies:
        print("calculating comp : {}".format(comp))
        company_feats,_graphs = one_company_graph_data(comp,target,comments,inters,ce_feats)
        # companies without any node return an empty frame; nothing to stack
        if not company_feats.empty:
            per_company.append(company_feats)

    # a single concat instead of growing the frame inside the loop
    rgdf = pd.concat(per_company) if per_company else pd.DataFrame()
    CEG_features = rgdf.rename(columns=dict([(x,'CE'+str(x)) for x in rgdf.columns])).copy()
    return CEG_features

In [14]:
def calculate_features(comments_path='clean_data/comments_ml.csv',
                       inters_path='clean_data/interactions_ml.csv',
                       target_path='clean_data/target_ml.csv',
                       votes_path='clean_data/votes_ml.csv'):
    """Load a cleaned data set and compute all five feature families.

    Parameters
    ----------
    comments_path, inters_path, target_path, votes_path : str
        CSV files in the layout written by ``extract_dataset``.

    Returns
    -------
    tuple
        ``(e_features, eg_features, c_features, ce_features, ceg_features)``:
        employee, employee-graph, company, company-relative and
        company-relative-graph features.
    """
    # Date columns are selected by name rather than by negative position, so the
    # parsing no longer depends on the exact column order of the saved files.
    comments = pd.read_csv(comments_path,parse_dates=['commentDate']).drop('Unnamed: 0',axis=1)
    votes = pd.read_csv(votes_path,parse_dates=['voteDate']).drop('Unnamed: 0',axis=1)
    inters = pd.read_csv(inters_path).drop('Unnamed: 0',axis=1)
    target = pd.read_csv(target_path).rename(columns={'votes_num':'target'})

    print("Calculating employee features")
    features = employee_features(votes,comments=comments,target=target)
    features['comid'] = features['uid'].map(lambda x: x.split('_')[0] )
    # E_-prefixed copy for export; the unprefixed frame feeds the later steps
    e_features = features.rename(columns=dict([(x,'E_'+str(x)) for x in features.columns]))

    print("Calculating employee graph features")
    eg_features,g_total = emp_graph_features(target, comments, inters, features)
    print("Calculating company features")
    c_features, c_features_raw = company_features(votes, comments, target)
    print("Calculating company-employee features")
    # also writes a string comid column into comments, inters and target,
    # which the per-company graph step below relies on
    ce_features,companies = calculate_ce_features(c_features_raw, comments, inters, target, features)
    print("Calculating company-employee graph features")
    ceg_features = ce_graph_features(companies,target,comments,inters,ce_features)
    return e_features,eg_features,c_features,ce_features,ceg_features

In [50]:
# Runs with the default clean_data/*_ml.csv paths (no suffix in the file names);
# `feats` is the 5-tuple (e, eg, c, ce, ceg) of feature frames.
feats = calculate_features()

Calculating employee features
Calculating employee graph features
Calculating company features
Calculating company-employee features
Calculating company-employee graph features
linking 213 comments from 10 users
calculating comp : 4
linking 2762 comments from 87 users
calculating comp : 6
linking 1760 comments from 78 users
calculating comp : 11
linking 3062 comments from 130 users
calculating comp : 12
linking 905 comments from 27 users
calculating comp : 13
linking 474 comments from 14 users
calculating comp : 16
linking 1691 comments from 99 users
calculating comp : 17
linking 789 comments from 39 users
calculating comp : 18
linking 1224 comments from 86 users
calculating comp : 19
linking 233 comments from 7 users
calculating comp : 20
linking 608 comments from 36 users
calculating comp : 21
linking 765 comments from 56 users
calculating comp : 22
linking 487 comments from 23 users
calculating comp : 23
linking 488 comments from 23 users
calculating comp : 24
linking 339 comments f

NameError: name 'eg_features' is not defined

In [17]:
def process_dataset(prediction_date='9 Jan 2017',
                    data_collection_ended='22 Jan 2017',
                    suffix='jan17',
                    folder='clean_data'
                   ):
    """Clean the raw data for one prediction window and save its features.

    ``extract_dataset`` writes the cleaned CSVs into ``folder``; the features
    computed from them are written to ``features/<family>_features<suffix>.csv``
    for the families ``e``, ``eg``, ``c``, ``ce`` and ``ceg``. The ``features``
    directory is created when it does not exist. Nothing is returned.
    """
    import os  # local import: only needed to make sure the output folder exists

    extract_dataset(prediction_date=prediction_date,
                    data_collection_ended=data_collection_ended,
                    suffix=suffix,
                    folder=folder
                   )
    e_features, eg_features,c_features,ce_features,ceg_features = calculate_features(comments_path=folder+'/comments_ml_'+suffix+'.csv',
                                                                                   inters_path=folder+'/interactions_ml_'+suffix+'.csv',
                                                                                   target_path=folder+'/target_ml_'+suffix+'.csv',
                                                                                   votes_path=folder+'/votes_ml_'+suffix+'.csv')

    # to_csv does not create missing directories
    if not os.path.isdir('features'):
        os.makedirs('features')

    e_features.to_csv('features/e_features'+suffix+'.csv')
    eg_features.to_csv('features/eg_features'+suffix+'.csv')
    c_features.to_csv('features/c_features'+suffix+'.csv')
    ce_features.to_csv('features/ce_features'+suffix+'.csv')
    ceg_features.to_csv('features/ceg_features'+suffix+'.csv')

In [18]:
# Parallel lists, one entry per prediction window: prediction date, end of the
# target window, and the file suffix used for that window's outputs.
starts = ['14 Nov 2016','5 Dec 2016','19 Dec 2016']
ends = ['4 Dec 2016','18 Dec 2016','1 Jan 2017']
sufixes = ['nov14','dec5','dec19']

In [20]:
# Clean the data and compute the features for every configured window.
for s,e,suf in zip(starts,ends,sufixes):
    process_dataset(prediction_date=s, data_collection_ended=e,suffix=suf)

Loading data...


  from ipykernel import kernelapp as app


Filtering by valid UID
Filtering interactions...
Preparing traing and test sets
Saving data...
Data is cleaned
Calculating employee features
Calculating employee graph features
linking 15628 comments from 1013 users
Calculating company features
Calculating company-employee features
Calculating company-employee graph features
linking 366 comments from 17 users
calculating comp : 1
linking 488 comments from 11 users
calculating comp : 3
linking 97 comments from 5 users
calculating comp : 4
linking 3168 comments from 102 users
calculating comp : 6
linking 1444 comments from 101 users
calculating comp : 8
linking 331 comments from 21 users
calculating comp : 11
linking 2580 comments from 155 users
calculating comp : 12
linking 931 comments from 35 users
calculating comp : 13
linking 370 comments from 16 users
calculating comp : 16
linking 1399 comments from 100 users
calculating comp : 17
linking 824 comments from 53 users
calculating comp : 18
linking 993 comments from 83 users
calculatin

In [85]:
# Load the saved target of the first window for a quick look.
df = pd.read_csv('clean_data/target_ml_'+sufixes[0]+'.csv')

In [86]:
# Displays the whole frame: one row per uid with its 0/1 votes_num target.
df

Unnamed: 0,uid,votes_num
0,0_18,1
1,0_20,1
2,0_259,1
3,0_27,1
4,0_31,1
5,0_32,1
6,0_33,0
7,0_37,1
8,0_38,1
9,0_39,1


In [90]:
def merge_datas(sufixes=('',)):
    """Stack the per-window feature files and join them into one table.

    For every suffix the five feature CSVs in ``features/`` and the target CSV
    in ``clean_data/`` are read. The uids of the first suffix are kept as they
    are; the uids of every later suffix get ``'_' + suffix`` appended so the
    same employee in different windows stays distinguishable.

    Parameters
    ----------
    sufixes : sequence of str
        File suffixes, one per window; must not be empty.

    Returns
    -------
    tuple
        ``(e_features, eg_features, c_features, ce_features, ceg_features,
        features, target)`` where ``features`` is the inner join of the five
        feature frames on ``uid``. ``e_features`` keeps whatever other columns
        its CSV contained; only ``E_uid`` is renamed to ``uid``.

    Raises
    ------
    ValueError
        If ``sufixes`` is empty.
    """
    sufixes = list(sufixes)
    if not sufixes:
        raise ValueError("merge_datas needs at least one suffix")

    def _stack(stem, uid_col):
        # read `stem + suffix + '.csv'` for every suffix and concatenate once
        pieces = []
        for position, suf in enumerate(sufixes):
            piece = pd.read_csv(stem+suf+'.csv')
            if position>0:
                # only later windows are tagged; the first keeps its raw uids
                piece[uid_col] = piece[uid_col]+'_'+suf
            pieces.append(piece)
        return pieces[0] if len(pieces)==1 else pd.concat(pieces)

    # (file stem, name of the column holding the uid in that file)
    e_features = _stack('features/e_features','E_uid')
    eg_features = _stack('features/eg_features','Unnamed: 0')
    c_features = _stack('features/c_features','uid')
    ce_features = _stack('features/ce_features','uid')
    ceg_features = _stack('features/ceg_features','Unnamed: 0')
    target = _stack('clean_data/target_ml_','uid')

    # give every frame a common 'uid' key
    e_features = e_features.rename(columns={'E_uid':'uid'})
    eg_features = eg_features.rename(columns={'Unnamed: 0':'uid'})
    ceg_features = ceg_features.rename(columns={'Unnamed: 0':'uid'})

    features = pd.merge(e_features,eg_features,left_on='uid',right_on='uid')
    features = pd.merge(features,c_features,left_on='uid',right_on='uid')
    features = pd.merge(features,ce_features,left_on='uid',right_on='uid')
    features = pd.merge(features,ceg_features,left_on='uid',right_on='uid')
    return e_features,eg_features,c_features,ce_features,ceg_features,features,target

In [91]:
# Stack and join the feature files of all windows listed in `sufixes`.
datas = merge_datas(sufixes)

In [93]:
# Unpack the 7-tuple returned by merge_datas.
e_features,eg_features,c_features,ce_features,ceg_features,features,target = datas


In [80]:
# Save the stacked frames under the 'all' suffix.
# NOTE(review): this cell's execution count (80) is lower than that of the cells
# defining these frames (91/93) - confirm it still works on Restart & Run All.
suffix = 'all'
e_features.to_csv('features/e_features'+suffix+'.csv')
eg_features.to_csv('features/eg_features'+suffix+'.csv')
c_features.to_csv('features/c_features'+suffix+'.csv')
ce_features.to_csv('features/ce_features'+suffix+'.csv')
ceg_features.to_csv('features/ceg_features'+suffix+'.csv')
features.to_csv('features/all_features'+suffix+'.csv')

In [95]:
# Save the stacked target; relies on `suffix` set in an earlier cell.
target.to_csv('features/target'+suffix+'.csv')