# Initial data exploration

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Request 'retina' (high-DPI) output for figures rendered inline in the notebook.
%config InlineBackend.figure_format='retina'
# Switch matplotlib over to seaborn's default theme for every later plot.
sns.set()

In [6]:
%%time
df = pd.read_parquet('../data/interim/sample2m.parquet')

CPU times: user 23.5 s, sys: 6.53 s, total: 30.1 s
Wall time: 26.4 s


## Target columns

In [18]:
# Engagement timestamps used as prediction targets; a null value means the
# engagement did not happen for that row.
target_columns = [
    'reply_timestamp',
    'retweet_timestamp',
    'retweet_with_comment_timestamp',
    'like_timestamp',
]

In [64]:
# Dtypes and memory footprint of the target columns only.
df.loc[:, target_columns].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2241793 entries, 0 to 2241792
Data columns (total 4 columns):
 #   Column                          Dtype         
---  ------                          -----         
 0   reply_timestamp                 datetime64[ns]
 1   retweet_timestamp               datetime64[ns]
 2   retweet_with_comment_timestamp  datetime64[ns]
 3   like_timestamp                  datetime64[ns]
dtypes: datetime64[ns](4)
memory usage: 68.4 MB


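Positive and negative examples are balanced: about half of the rows have at least one interaction (any non-null target timestamp) and about half have none.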

In [37]:
# Fraction of rows with (True) and without (False) at least one engagement,
# i.e. at least one non-null target timestamp. `any(axis=1)` stays vectorized,
# avoiding a Python-level callback on each of the ~2.2M rows.
df[target_columns].notnull().any(axis=1).value_counts(normalize=True)

False    0.502247
True     0.497753
dtype: float64

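The individual interaction types, however, are highly imbalanced: likes occur in about 40% of rows, retweets in about 9%, replies in about 3%, and retweets with comment in under 1%.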

In [73]:
# For each target, print the percentage of rows where that engagement is
# present (True) versus absent (False).
for col in target_columns:
    print(col)
    interacted = df[col].notna()
    print(interacted.value_counts(normalize=True) * 100)

reply_timestamp
False    97.085815
True      2.914185
Name: reply_timestamp, dtype: float64
retweet_timestamp
False    91.237371
True      8.762629
Name: retweet_timestamp, dtype: float64
retweet_with_comment_timestamp
False    99.303192
True      0.696808
Name: retweet_with_comment_timestamp, dtype: float64
like_timestamp
False    60.261898
True     39.738102
Name: like_timestamp, dtype: float64


## Feature columns

In [62]:
# Every column that is not a target is treated as a feature. Filtering
# `df.columns` directly keeps the frame's own column order, so the list (and
# every table built from it) is the same on each kernel restart; a set
# difference would return the names in an arbitrary, run-dependent order.
feature_columns = [col for col in df.columns if col not in target_columns]

In [63]:
# Dtypes and memory footprint of the feature columns only.
df.loc[:, feature_columns].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2241793 entries, 0 to 2241792
Data columns (total 20 columns):
 #   Column                              Dtype         
---  ------                              -----         
 0   tweet_type                          category      
 1   engaged_with_user_id                object        
 2   engaging_user_id                    object        
 3   engaging_user_account_creation      datetime64[ns]
 4   present_domains                     object        
 5   engagee_follows_engager             bool          
 6   engaged_with_user_account_creation  datetime64[ns]
 7   engaged_with_user_following_count   int64         
 8   engaging_user_is_verified           bool          
 9   tweet_timestamp                     datetime64[ns]
 10  language                            object        
 11  hashtags                            object        
 12  present_media                       object        
 13  text_ tokens                        object

In [60]:
# Peek at the first rows of the feature columns to see the raw value formats.
df.loc[:, feature_columns].head()

Unnamed: 0,tweet_type,engaged_with_user_id,engaging_user_id,engaging_user_account_creation,present_domains,engagee_follows_engager,engaged_with_user_account_creation,engaged_with_user_following_count,engaging_user_is_verified,tweet_timestamp,language,hashtags,present_media,text_ tokens,tweet_id,engaging_user_follower_count,engaged_with_user_follower_count,engaged_with_user_is_verified,present_links,engaging_user_following_count
0,Quote,E1054B3E0E8E9DA570D817F51A73885E,140434A8754F70323B7BBAE809B65B29,2011-04-26 02:19:20,[3C0DB64B05242E8A7ED51F93785AA091],True,2020-02-29 07:35:34,696,False,2021-02-07 14:16:21,E7F038DE3EAD397AEC9193686C911677,,,"[101, 100, 6247, 3823, 100, 216, 3770, 10827, ...",82858A8A9899B1437BCFC5D97ECED8B8,571,675,False,[24F3875A3FEE1DA4B7BCD346EF4A7066],742
1,TopLevel,8923CE7F418F653A6B8E93F255C21FB4,E022AD787B2BE6ED64DA7660A23F1C7B,2012-12-09 03:19:34,,True,2014-09-04 12:55:09,367,False,2021-02-11 17:05:57,488B32D24BD4BB44172EB981C1BCA6FA,,,"[101, 15384, 48426, 10230, 25470, 16473, 30123...",6F3527024A1B32F0072F284C7D61CE1D,111,453,False,,42
2,TopLevel,6EC14E96E26DCEE96EBC0CD520FF65E2,EAE4AFB8A76F7036655FC33174D963E3,2020-09-10 10:17:40,[0FF02B2C345DC24DF9453DB993999FFE],False,2014-10-17 10:41:42,261,False,2021-02-04 10:36:57,DA13A5C3763C212D9D68FC69102DE5E5,[8F4665495616E63F2F6162B1EA48DD26],[Photo],"[101, 67707, 13520, 21162, 14996, 117, 12537, ...",7E061D1B6C5EECA0EAE12EC115BDB38F,47,36033,True,[9A94C6B87BA0A96CF8932FFC18ADDA2E],899
3,TopLevel,57702B3903145012E8B17BD77000DDAA,4E0E262086AB0338615CBD8C8EA09E8E,2016-05-13 03:46:20,,False,2010-11-24 03:32:07,309,False,2021-02-09 23:10:55,B0FA488F2911701DD8EC5B1EA5E322D8,,,"[101, 38432, 18439, 10139, 15983, 16555, 10198...",764D7AA478DC4258E2144F7AE52958CE,49,278571,False,,692
4,Retweet,E9CB4529426EED1A94EAF4C2CDBA6586,1AA980868659C6424D37F6989E0A75B3,2012-09-07 06:40:45,,True,2020-03-27 21:44:36,947,False,2021-02-10 20:04:20,1F73BB863A39DB62B4A55B7E558DB1E8,,[Photo],"[101, 56898, 137, 24471, 10921, 10738, 10943, ...",462A952971A021886F292669D2B7C740,531,1092,False,,230


### Tweet type