Purpose of notebook: downloading datasets from scratch (assuming that the original dataset is unavailable)

In [5]:
from datetime import date, datetime
import ujson as json
import os
import sys

from dateutil.parser import parse
import pandas as pd
from scipy.stats import shapiro, ttest_ind
from statsmodels.stats.weightstats import ztest

sys.path.append(os.path.expanduser("~/GitHub/EvenTDT"))

from eventdt import twitter

corpus = 'founta' # options: founta, waseem

labels = os.path.expanduser(f"~/DATA/c5-evaluation/{corpus}/{ corpus }-labels.csv")
downloaded = os.path.expanduser(f"~/DATA/c5-evaluation/{corpus}/{ corpus }-downloaded.json")

In [6]:
%%time
df = pd.read_csv(labels)
df = df.set_index('tweet_id', drop=True)
df

CPU times: user 50.1 ms, sys: 5.82 ms, total: 56 ms
Wall time: 58.8 ms


Unnamed: 0_level_0,label
tweet_id,Unnamed: 1_level_1
849667487180259329,abusive
850490912954351616,abusive
848791766853668864,abusive
848306464892604416,abusive
850010509969465344,normal
...,...
848922176128376832,hateful
849405078956109829,abusive
847981251143254016,normal
850180983253073920,abusive


In [7]:
%%time
available = { }
with open(downloaded, 'r') as dl:
    for tweet in dl:
        tweet = json.loads(tweet)
        available[tweet['id_str']] = { 'id': tweet['id_str'], 'text': twitter.full_text(tweet),
                                       'timestamp': twitter.extract_timestamp(tweet) }

CPU times: user 6.86 s, sys: 68.5 ms, total: 6.93 s
Wall time: 6.92 s


In [8]:
%%time
df['available'] = df.index.isin(list(available))

CPU times: user 16.8 ms, sys: 130 µs, total: 16.9 ms
Wall time: 16.9 ms


In [9]:
df.groupby([ 'available' ]).count()['label'] / len(df)

available
False    0.46251
True     0.53749
Name: label, dtype: float64

In [10]:
%%time
df['text'] = df.index.map(lambda tid: available.get(str(tid), {}).get('text', ''))
df.tail()

CPU times: user 56.8 ms, sys: 114 µs, total: 56.9 ms
Wall time: 56.2 ms


Unnamed: 0_level_0,label,available,text
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
848922176128376832,hateful,False,
849405078956109829,abusive,False,
847981251143254016,normal,False,
850180983253073920,abusive,False,
849515175254405120,abusive,True,Swear to god it's the worst shit ever https://...


In [11]:
%%time
df['timestamp'] = df.index.map(lambda tid: available.get(str(tid), {}).get('timestamp', 0))
df['datetime'] = df.timestamp.map(lambda timestamp: datetime.fromtimestamp(timestamp))
df['datetime'] = pd.to_datetime(df['datetime']).dt.date
min(df[df['datetime'] > date(1970, 1, 1)].datetime), max(df[df['datetime'] > date(1970, 1, 1)].datetime)

CPU times: user 193 ms, sys: 3.81 ms, total: 196 ms
Wall time: 197 ms


(datetime.date(2017, 3, 30), datetime.date(2017, 4, 9))

In [12]:
a = df.groupby([ 'label', 'available' ])['available'].count()
a.div(a.sum(level=0), level=0)

label    available
abusive  False        0.656545
         True         0.343455
hateful  False        0.580841
         True         0.419159
normal   False        0.358951
         True         0.641049
spam     False        0.443882
         True         0.556118
Name: available, dtype: float64

In [13]:
# looks like many 'spam' tweets are actually opinions
df[(df.label == 'spam') & (df.available)].sample(10)

Unnamed: 0_level_0,label,available,text,timestamp,datetime
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
848975494141931521,spam,True,Aquaphor Baby Wash &amp; Shampoo 8.4 fl. oz. (...,1491247000.0,2017-04-03
848988345497604097,spam,True,Fall in love with this 4 BD/ 3 BA in ACCOKEEK....,1491250000.0,2017-04-03
850659976968327169,spam,True,I added a video to a @YouTube playlist https:/...,1491648000.0,2017-04-08
848531321521614850,spam,True,And he had to reach across my body to shove hi...,1491141000.0,2017-04-02
848414719895625728,spam,True,Wine House is a Winery #Free #HTML #Template f...,1491113000.0,2017-04-02
849331045304610817,spam,True,New art from @EarthHeiress celebrates the Leus...,1491331000.0,2017-04-04
850344946993315840,spam,True,I've just entered the @unitlostgaming @Dell In...,1491573000.0,2017-04-07
850415671326453766,spam,True,The ChicagoFishingSchool Daily is out! https:/...,1491590000.0,2017-04-07
848876147865772032,spam,True,@sebabecks we would LOVE the support for our n...,1491223000.0,2017-04-03
847742767233093633,spam,True,Keep your healthy eating on track with these h...,1490953000.0,2017-03-31


In [14]:
df.groupby(['label'])['label'].count() / len(df) # original tweet share
df[df.available].groupby(['label'])['label'].count() / len(df[df.available]) # available tweet share

label
abusive    0.173114
hateful    0.038664
normal     0.642829
spam       0.145393
Name: label, dtype: float64

In [15]:
%%time
df['noise'] = df.label.isin([ 'abusive', 'hateful', 'spam' ]) # for founta
df['noise'] = df.label.isin([ 'racism', 'sexism' ]) # for waseem
a = df.groupby([ 'noise', 'available' ])['available'].count()
a.div(a.sum(level=0), level=0)

CPU times: user 14.2 ms, sys: 358 µs, total: 14.6 ms
Wall time: 13.7 ms


noise  available
False  False        0.46251
       True         0.53749
Name: available, dtype: float64