<a href="https://colab.research.google.com/github/MAL-TO/recsys-2021/blob/feature%2Fadd-colab-files/RecSys2021ChallengeEDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing the sampled data

## Mount Google Drive

In [138]:
from google.colab import drive

In [139]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Mount git repository hosted on your google Drive (optional)

In [140]:
%cd /content/drive/MyDrive/Universita/MALTO/recsys-2021/colab/

/content/drive/MyDrive/Universita/MALTO/recsys-2021/colab


In [141]:
!git status

On branch feature/add-colab-files
Your branch is up to date with 'origin/feature/add-colab-files'.

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git checkout -- <file>..." to discard changes in working directory)

	[31mmodified:   RecSys2021ChallengeEDA.ipynb[m

no changes added to commit (use "git add" and/or "git commit -a")


In [None]:
!git commit -am "Initial implementation of Logistic Regression"

## Copy sampled data to local disk

This assumes that you have added the 2M rows sampled data to your Google Drive

In [25]:
!cp /content/drive/MyDrive/sample2m.parquet /content/sample2m.parquet

We expect to have a file called `sample2m` of about `1.1GB`.

In [26]:
!ls -alh /content

total 528M
drwxr-xr-x 1 root root 4.0K Apr 10 15:58 .
drwxr-xr-x 1 root root 4.0K Apr 10 14:18 ..
drwxr-xr-x 4 root root 4.0K Apr  7 13:35 .config
drwx------ 5 root root 4.0K Apr 10 14:22 drive
-rw------- 1 root root 528M Apr 10 15:58 sample2m.parquet
drwxr-xr-x 1 root root 4.0K Apr  7 13:36 sample_data


## Read data

In [8]:
import pandas as pd
import numpy as np

In [29]:
%%time
df = pd.read_parquet('/content/sample2m.parquet');

CPU times: user 21.4 s, sys: 3.58 s, total: 25 s
Wall time: 21.9 s


# Data Analysis

## DataFrame statistics

In [30]:
df.head(1)

Unnamed: 0,text_ tokens,hashtags,tweet_id,present_media,present_links,present_domains,tweet_type,language,tweet_timestamp,engaged_with_user_id,engaged_with_user_follower_count,engaged_with_user_following_count,engaged_with_user_is_verified,engaged_with_user_account_creation,engaging_user_id,engaging_user_follower_count,engaging_user_following_count,engaging_user_is_verified,engaging_user_account_creation,engagee_follows_engager,reply_timestamp,retweet_timestamp,retweet_with_comment_timestamp,like_timestamp
0,"[101, 100, 6247, 3823, 100, 216, 3770, 10827, ...",,82858A8A9899B1437BCFC5D97ECED8B8,,[24F3875A3FEE1DA4B7BCD346EF4A7066],[3C0DB64B05242E8A7ED51F93785AA091],Quote,E7F038DE3EAD397AEC9193686C911677,2021-02-07 14:16:21,E1054B3E0E8E9DA570D817F51A73885E,675,696,False,2020-02-29 07:35:34,140434A8754F70323B7BBAE809B65B29,571,742,False,2011-04-26 02:19:20,True,NaT,NaT,NaT,2021-02-07 14:21:10


In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2241793 entries, 0 to 2241792
Data columns (total 24 columns):
 #   Column                              Dtype         
---  ------                              -----         
 0   text_ tokens                        object        
 1   hashtags                            object        
 2   tweet_id                            object        
 3   present_media                       object        
 4   present_links                       object        
 5   present_domains                     object        
 6   tweet_type                          category      
 7   language                            object        
 8   tweet_timestamp                     datetime64[ns]
 9   engaged_with_user_id                object        
 10  engaged_with_user_follower_count    int64         
 11  engaged_with_user_following_count   int64         
 12  engaged_with_user_is_verified       bool          
 13  engaged_with_user_account_creation  dateti

## Drop unused columns

In [32]:
df.drop(columns=['text_ tokens', 'hashtags', 'present_media', 'present_links', 'present_domains'], inplace=True)

## Binary encode target columns

In [34]:
target_cols = [
  "reply_timestamp",
  "retweet_timestamp",
  "retweet_with_comment_timestamp",
  "like_timestamp"
]

In [35]:
df[target_cols] = df[target_cols].notnull()

In [36]:
df.head()

Unnamed: 0,tweet_id,tweet_type,language,tweet_timestamp,engaged_with_user_id,engaged_with_user_follower_count,engaged_with_user_following_count,engaged_with_user_is_verified,engaged_with_user_account_creation,engaging_user_id,engaging_user_follower_count,engaging_user_following_count,engaging_user_is_verified,engaging_user_account_creation,engagee_follows_engager,reply_timestamp,retweet_timestamp,retweet_with_comment_timestamp,like_timestamp
0,82858A8A9899B1437BCFC5D97ECED8B8,Quote,E7F038DE3EAD397AEC9193686C911677,2021-02-07 14:16:21,E1054B3E0E8E9DA570D817F51A73885E,675,696,False,2020-02-29 07:35:34,140434A8754F70323B7BBAE809B65B29,571,742,False,2011-04-26 02:19:20,True,False,False,False,True
1,6F3527024A1B32F0072F284C7D61CE1D,TopLevel,488B32D24BD4BB44172EB981C1BCA6FA,2021-02-11 17:05:57,8923CE7F418F653A6B8E93F255C21FB4,453,367,False,2014-09-04 12:55:09,E022AD787B2BE6ED64DA7660A23F1C7B,111,42,False,2012-12-09 03:19:34,True,False,False,False,True
2,7E061D1B6C5EECA0EAE12EC115BDB38F,TopLevel,DA13A5C3763C212D9D68FC69102DE5E5,2021-02-04 10:36:57,6EC14E96E26DCEE96EBC0CD520FF65E2,36033,261,True,2014-10-17 10:41:42,EAE4AFB8A76F7036655FC33174D963E3,47,899,False,2020-09-10 10:17:40,False,False,False,False,False
3,764D7AA478DC4258E2144F7AE52958CE,TopLevel,B0FA488F2911701DD8EC5B1EA5E322D8,2021-02-09 23:10:55,57702B3903145012E8B17BD77000DDAA,278571,309,False,2010-11-24 03:32:07,4E0E262086AB0338615CBD8C8EA09E8E,49,692,False,2016-05-13 03:46:20,False,False,False,False,True
4,462A952971A021886F292669D2B7C740,Retweet,1F73BB863A39DB62B4A55B7E558DB1E8,2021-02-10 20:04:20,E9CB4529426EED1A94EAF4C2CDBA6586,1092,947,False,2020-03-27 21:44:36,1AA980868659C6424D37F6989E0A75B3,531,230,False,2012-09-07 06:40:45,True,False,False,False,False


## Target columns statistics

In [42]:
target_cols = ["retweet_timestamp", "reply_timestamp", "like_timestamp",
               "retweet_with_comment_timestamp",]
df[target_cols].sum(axis=1).value_counts()

0    1125933
1    1065104
2      49160
3       1571
4         25
dtype: int64

## Convert to numpy

In [103]:
Y = df[target_cols].to_numpy().astype(np.float)
Y.shape, Y.nbytes / 1024**2

((2241793, 4), 68.41409301757812)

## Extract numerical features

In [101]:
X_numerical_categorical = df.select_dtypes(['number']).to_numpy().astype(np.float)

In [102]:
X_numerical_categorical.shape, X_numerical_categorical.nbytes / 1024**2

((2241793, 4), 68.41409301757812)

# Model

## Splitting train and test

In [105]:
from sklearn.model_selection import train_test_split

In [106]:
X_train, X_test, Y_train, Y_test = train_test_split(X_numerical_categorical, Y)

In [107]:
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((1681344, 4), (560449, 4), (1681344, 4), (560449, 4))

## Logistic Regression Model

In [108]:
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputRegressor

In [109]:
%%time
clf = MultiOutputRegressor(LogisticRegression(), n_jobs=-1).fit(X_train, Y_train)

CPU times: user 483 ms, sys: 170 ms, total: 653 ms
Wall time: 1min 9s


In [137]:
for i, target_col in enumerate(target_cols):
  ground_truth = Y_test[:, i]
  predictions = clf.predict(X_test)[:, i]

  ap = average_precision_score(ground_truth, predictions)
  rce = compute_rce(predictions, ground_truth)
  print(f'{target_col:30} AP {ap:.4f}, RE {rce:.4f}')

retweet_timestamp              AP 0.0882, RE -920.9957
reply_timestamp                AP 0.0292, RE -664.4993
like_timestamp                 AP 0.3970, RE -1940.9649
retweet_with_comment_timestamp AP 0.0069, RE -478.7192


# Compute metrics

## Random baseline

The numbers should look something like this:
```
User 	Method 	AP Retweet 	RCE Retweet 	AP Reply 	RCE Reply 	AP Like 	RCE Like 	AP RT with comment 	RCE RT with comment 	Time Taken 	Overall Score

4 	alykhantejani 	rand_predictions 	0.0812 	-254.8419 	0.0233 	-804.2548 	0.3989 	-48.6696 	0.0058 	-2698.8520 	0 hours 	10
```

In [None]:
from sklearn.metrics import average_precision_score, log_loss

def calculate_ctr(gt):
  positive = len([x for x in gt if x == 1])
  ctr = positive/float(len(gt))
  return ctr

def compute_rce(pred, gt):
    cross_entropy = log_loss(gt, pred)
    data_ctr = calculate_ctr(gt)
    strawman_cross_entropy = log_loss(gt, [data_ctr for _ in range(len(gt))])
    return (1.0 - cross_entropy/strawman_cross_entropy)*100.0

In [136]:
for i, target_col in enumerate(target_cols):
  ground_truth = Y_test[:, i]
  predictions = np.random.rand(*ground_truth.shape)

  ap = average_precision_score(ground_truth, predictions)
  rce = compute_rce(predictions, ground_truth)
  print(f'{target_col:30} AP {ap:.4f}, RE {rce:.4f}')

retweet_timestamp              AP 0.0882, RE -234.8539
reply_timestamp                AP 0.0289, RE -657.1957
like_timestamp                 AP 0.3963, RE -48.9426
retweet_with_comment_timestamp AP 0.0071, RE -2314.2657
