In [1]:
import pandas as pd
import os
from datetime import datetime
import sys
import csv
from db import *
from time_series import *
import awarp_wrapper.awarpWrapper as awarp

In [2]:
# Load the raw tweet export; the python parsing engine is requested explicitly.
df_bots = pd.read_csv(
    "data/egypt_uae.csv",
    encoding="utf-8",
    engine="python",
)

In [3]:
# Inspect which columns the loaded frame provides.
df_bots.columns

Index(['tweetid', 'userid', 'user_display_name', 'user_screen_name',
       'user_reported_location', 'user_profile_description',
       'user_profile_url', 'follower_count', 'following_count',
       'account_creation_date', 'account_language', 'tweet_language',
       'tweet_text', 'tweet_time', 'tweet_client_name', 'in_reply_to_userid',
       'in_reply_to_tweetid', 'quoted_tweet_tweetid', 'is_retweet',
       'retweet_userid', 'retweet_tweetid', 'latitude', 'longitude',
       'quote_count', 'reply_count', 'like_count', 'retweet_count', 'hashtags',
       'urls', 'user_mentions', 'poll_choices'],
      dtype='object')

In [4]:
# Keep only the author id and the tweet timestamp. The explicit .copy() makes
# the result an independent frame, so adding columns to it afterwards does not
# raise pandas' SettingWithCopyWarning.
df_bots = df_bots[["userid", "tweet_time"]].copy()

In [5]:
# Parse the textual timestamps in one vectorized call instead of a per-row
# strptime. .assign returns a new frame, so no column is set on a slice.
df_bots = df_bots.assign(
    tweet_time_dt=pd.to_datetime(df_bots["tweet_time"], format="%Y-%m-%d %H:%M")
)

In [6]:
# The raw string column is no longer needed once the parsed column exists.
df_bots = df_bots.drop(columns=["tweet_time"])

In [7]:
# Preview the first rows of the reduced frame (userid + parsed timestamp).
df_bots.head()

Unnamed: 0,userid,tweet_time_dt
0,wU7Q3gj92ggQlrKirvuJdGNlB2HqnVgl99JwO5+TOo8=,2017-09-29 18:51:00
1,1VQsJCV6aNMcy0IEPqvcEcbbDcDFWoAl2kfBu3kWdjM=,2016-05-24 11:59:00
2,SIhFG6lNEyDStYbaIYDDeFVG31CKqxwHfJnCwz1iZ78=,2017-09-15 23:22:00
3,2bks1u579bOMvZioiY6cP0eCA0TpbgQeoAu+aNEGNk=,2019-04-30 14:45:00
4,Fz55SPZku0DdGUoR+XI1UlR8ZAYFeVlVMom6BYYfF5I=,2019-04-09 09:47:00


In [8]:
# Encoder bounded by the earliest and latest tweet in the dataset.
TS = TimeSeries(base_time=df_bots.tweet_time_dt.min(),
                end_time=df_bots.tweet_time_dt.max())

# One row per user holding the list of that user's tweet timestamps.
df_activity = (df_bots.groupby('userid')['tweet_time_dt']
                      .apply(list)
                      .reset_index(name='activity'))

# Keep only users that have at least one tweet.
df_activity = df_activity[df_activity["activity"].apply(len) > 0]

# Encode each activity list and keep just the author id and its encoded series.
# .assign/.drop/.rename each return a new frame, so nothing is written onto a
# filtered slice (which would trigger SettingWithCopyWarning).
df_ts = (df_activity
         .assign(ts=df_activity["activity"].apply(TS.get_encoded_ts))
         .drop(columns=["activity"])
         .rename(columns={"userid": "author_id"}))
#df_ts.to_csv("data/df_bot_ts.csv", index=False, encoding='utf-8')

In [9]:
# Preview the encoded time series, one row per author.
df_ts.head()

Unnamed: 0,author_id,ts
0,+AycM83ak1Mbd9PdCRqAh42ITNj9aplL3CG4D++UcCA=,"[-1, -224588459, 2, -9719, 1, -539, 1, -333899..."
1,+EBSBz71UQdapCg8eqzlmxpyKtVdVzPQEFf1EhvwI=,"[-1, -228660119, 1, -59, 2, -119, 2, -119, 1, ..."
2,+VB52zOk7+u6zBYfAtoLr2t2TbiYke6vTaYYWlODlWg=,"[-1, -139896839, 1, -59, 1, -179, 1, -59, 1, -..."
3,+jNRKZt+OCf1thm+0sH3q1dCjjmQleMHoYWtf0pSbs=,"[-1, -228838259, 1, -33839, 1, -59, 1, -359, 1..."
4,+nRcxZyipmB3MNy25+AH0RyGJCZ+sEKDFayPeIZE=,"[-1, -211957499, 1, -7739, 1, -233159, 1, -144..."


In [10]:
def compute_awarp(df_author_ts):
    '''
    Compute pairwise awarp distances between the authors' time series.

    The encoded series are written to ``awarp_wrapper/ts.csv`` (one
    comma-separated series per line, in row order). The Python wrapper
    around the C++ implementation is then run; it reads that file and
    writes its result to disk. The result file ``awarp_wrapper/out.txt``
    is read back and its author columns are replaced with the real
    author ids.

    Parameters
    ----------
    df_author_ts : pandas.DataFrame
        Frame with an 'author_id' column and a 'ts' column. Each 'ts'
        value is an iterable of numbers already encoded for awarp.

    Returns
    -------
    pandas.DataFrame
        Columns 'author_id_x', 'author_id_y' and 'd', where 'd' is the
        distance between the two authors. Rows are labelled in the order
        (0,1), (0,2), ..., (1,2), ... of the input rows; this assumes the
        wrapper emits one row per pair in that same order.
        An empty frame with those columns is returned, and the wrapper is
        not run, when fewer than two authors are given.

    Raises
    ------
    ValueError
        If the number of rows produced by the wrapper differs from the
        number of author pairs.
    '''
    columns = ["author_id_x", "author_id_y", "d"]
    author_ids = list(df_author_ts.author_id)
    n_authors = len(author_ids)

    # No pair can be formed, so there is nothing to compute.
    if n_authors < 2:
        return pd.DataFrame(columns=columns)

    # One line per series. Joining keeps an empty series as an empty line
    # instead of eating the previous line's newline, and avoids quadratic
    # string concatenation.
    lines = [",".join(str(value) for value in series)
             for series in df_author_ts.ts]
    with open("awarp_wrapper/ts.csv", "w") as ts_file:
        ts_file.writelines(line + "\n" for line in lines)

    # call the wrapper
    awarp.DTW().run()

    # read the output file
    df_out = pd.read_csv('awarp_wrapper/out.txt', sep=" ", header=None,
                         names=columns)

    # update the author columns with the right authors:
    # every unordered pair (i < j), in input row order
    pairs = [(author_ids[i], author_ids[j])
             for i in range(n_authors)
             for j in range(i + 1, n_authors)]
    if len(df_out) != len(pairs):
        raise ValueError(
            "awarp output has {} rows but {} author pairs were expected"
            .format(len(df_out), len(pairs)))

    df_out["author_id_x"] = [left for left, _ in pairs]
    df_out["author_id_y"] = [right for _, right in pairs]

    return df_out
    

In [13]:
%%time
# Timed trial run on the first 20 rows of df_ts only.
df_out = compute_awarp(df_ts.iloc[:20])

CPU times: user 730 ms, sys: 53.9 ms, total: 784 ms
Wall time: 849 ms


In [14]:
# Show the pairwise distances returned by compute_awarp.
df_out

Unnamed: 0,author_id_x,author_id_y,d
0,+AycM83ak1Mbd9PdCRqAh42ITNj9aplL3CG4D++UcCA=,+EBSBz71UQdapCg8eqzlmxpyKtVdVzPQEFf1EhvwI=,22.95650
1,+AycM83ak1Mbd9PdCRqAh42ITNj9aplL3CG4D++UcCA=,+VB52zOk7+u6zBYfAtoLr2t2TbiYke6vTaYYWlODlWg=,28.17800
2,+AycM83ak1Mbd9PdCRqAh42ITNj9aplL3CG4D++UcCA=,+jNRKZt+OCf1thm+0sH3q1dCjjmQleMHoYWtf0pSbs=,23.74870
3,+AycM83ak1Mbd9PdCRqAh42ITNj9aplL3CG4D++UcCA=,+nRcxZyipmB3MNy25+AH0RyGJCZ+sEKDFayPeIZE=,19.87460
4,+AycM83ak1Mbd9PdCRqAh42ITNj9aplL3CG4D++UcCA=,0J4wIcWAZVGdk4CI9GqlSy580GpTxodTKQucIAyZv8=,21.63330
5,+AycM83ak1Mbd9PdCRqAh42ITNj9aplL3CG4D++UcCA=,0OOi4KuKwgTBm58d8iVFdhJmjHWqmAAeem8llKXGo=,25.11970
6,+AycM83ak1Mbd9PdCRqAh42ITNj9aplL3CG4D++UcCA=,0VHlpqV4u9zaTXb1TTnzuEgCeT0ogCXDDMbglMIVEY=,28.44290
7,+AycM83ak1Mbd9PdCRqAh42ITNj9aplL3CG4D++UcCA=,10fIBfuhQ0vFOVRcRoIRdycQxPsbkFMLQzZVcezjY=,21.97730
8,+AycM83ak1Mbd9PdCRqAh42ITNj9aplL3CG4D++UcCA=,10h6HUUh9t0FfaiANsisMHYmXdClPJ3fuIvnGVVdIQ=,31.76480
9,+AycM83ak1Mbd9PdCRqAh42ITNj9aplL3CG4D++UcCA=,1FqPbDe10XO7n+S3s2kIBeYaH0RKon9ublLyd5K1H+M=,29.71530
