# _Experimentation 100k Tweet Sample: Jan. 20, 2020_

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [8]:
# import libraries
import pandas as pd
pd.options.display.max_columns = None
from pathlib import Path
import numpy as np
import random
import os

# Matplotlib
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

## _Set-up Path_

In [3]:
# get current directory
os.getcwd()

'/notebooks'

In [5]:
# change directory to storage/ru_disinfo (where data for this project is stored)
os.chdir("storage/ru_disinfo")

In [9]:
# assign as Path variable
path = Path(os.getcwd())
print(path)

/storage/ru_disinfo


## _Load in Data_

In [10]:
# load in 100k JSON data from Jan. 19, 2020
df = pd.read_json(path/"100k_tweetsjan19.json", orient="split")

In [21]:
# drop the user_mentions column
df.drop("user_mentions", axis=1, inplace=True)

In [22]:
# check out info
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 10 columns):
full_text             100000 non-null object
clean_tweet           100000 non-null object
label                 100000 non-null object
retweet               100000 non-null bool
num_mentions          100000 non-null int64
mentions_usernames    100000 non-null object
num_links             100000 non-null int64
vader_compound        100000 non-null float64
num_hashtags          100000 non-null int64
hashtags_used         100000 non-null object
dtypes: bool(1), float64(1), int64(3), object(5)
memory usage: 7.7+ MB


In [23]:
# check out first few rows
df[:5]

Unnamed: 0,full_text,clean_tweet,label,retweet,num_mentions,mentions_usernames,num_links,vader_compound,num_hashtags,hashtags_used
0,"On October 16th, one supporter will get a chan...",october supporter will chance watch debate wit...,verified,False,0,,1,0.204,0,
1,OMG I’ve been dying to tell you about the new ...,been dying tell about cozy collection pajamas ...,verified,False,1,@skims,2,0.184,0,
2,The #SchumerStandard for filling #SCOTUS vacan...,schumer standard filling scotus vacancies,verified,False,0,,1,0.0,2,"#SchumerStandard,#SCOTUS"
3,RT @alicia_keysbr: When someone says #TeamAlic...,when someone says team alicia strongest alread...,verified,True,1,@alicia_keysbr,1,0.248,1,#TeamAlicia
4,"“Whether it is because of distance, health pro...",whether because distance health problems work ...,verified,False,1,@NYTParenting,1,0.051,0,


## _Focus: Hashtags_

In [13]:
# see how we might be able to find all hastags in a given tweet
df["full_text"][:20].str.findall("#[\w]*")

0                              []
1                              []
2     [#SchumerStandard, #SCOTUS]
3                   [#TeamAlicia]
4                              []
5                              []
6                              []
7                          [#AGT]
8                              []
9                              []
10                             []
11                             []
12                             []
13                             []
14                             []
15                             []
16                             []
17                             []
18           [#dunkin, #worktyme]
19                             []
Name: full_text, dtype: object

In [15]:
# create new column with number of hashtags
df["num_hashtags"] = df["full_text"].str.findall("#[\w]*").apply(lambda x: len(x))

In [19]:
# create new column with string value of the hashtags
df["hashtags_used"] = df["full_text"].str.findall("#[\w]*").apply(lambda x: ",".join([hashtag for hashtag in x]))

## _Uppercase, Character Count, Word Count_

In [27]:
df["full_text"][:10]

0    On October 16th, one supporter will get a chan...
1    OMG I’ve been dying to tell you about the new ...
2    The #SchumerStandard for filling #SCOTUS vacan...
3    RT @alicia_keysbr: When someone says #TeamAlic...
4    “Whether it is because of distance, health pro...
5    @AndyYork8 What you are saying is "I was askin...
6    QB ➡️ QB respect\n\n@DangeRussWilson and @desh...
7    “Men lie, Women lie, numbers don’t” Over this ...
8    Momentum &amp; energy transfer. Elastic &amp; ...
9    RT @TheSource: Bernie Sanders Walks out to You...
Name: full_text, dtype: object

In [32]:
df["full_text"][:20].apply(lambda x: [word for word in x.split() if word.isupper()])

0                        []
1     [OMG, I, I, 9AM, PST]
2                 [#SCOTUS]
3                      [RT]
4                        []
5                      ["I]
6                  [QB, QB]
7                    [#AGT]
8                        []
9                      [RT]
10              [$750M, US]
11                   [AIDS]
12                   [I, I]
13                       []
14                       []
15                     [RT]
16                 [AG,, I]
17              [US, "I, I]
18                       []
19                       []
Name: full_text, dtype: object

In [45]:
# remove the RTs (characters indicating RT
df["full_text"][:20].map(lambda x: x.replace(r"RT|", "").strip()).apply(lambda x: [word for word in x.split() if word.isupper()])

0                        []
1     [OMG, I, I, 9AM, PST]
2                 [#SCOTUS]
3                        []
4                        []
5                      ["I]
6                  [QB, QB]
7                    [#AGT]
8                        []
9                        []
10              [$750M, US]
11                   [AIDS]
12                   [I, I]
13                       []
14                       []
15                       []
16                 [AG,, I]
17              [US, "I, I]
18                       []
19                       []
Name: full_text, dtype: object

In [24]:
df[:5]

Unnamed: 0,full_text,clean_tweet,label,retweet,num_mentions,mentions_usernames,num_links,vader_compound,num_hashtags,hashtags_used
0,"On October 16th, one supporter will get a chan...",october supporter will chance watch debate wit...,verified,False,0,,1,0.204,0,
1,OMG I’ve been dying to tell you about the new ...,been dying tell about cozy collection pajamas ...,verified,False,1,@skims,2,0.184,0,
2,The #SchumerStandard for filling #SCOTUS vacan...,schumer standard filling scotus vacancies,verified,False,0,,1,0.0,2,"#SchumerStandard,#SCOTUS"
3,RT @alicia_keysbr: When someone says #TeamAlic...,when someone says team alicia strongest alread...,verified,True,1,@alicia_keysbr,1,0.248,1,#TeamAlicia
4,"“Whether it is because of distance, health pro...",whether because distance health problems work ...,verified,False,1,@NYTParenting,1,0.051,0,


In [57]:
import string
import spacy

# grab a set of punctuations
puncs = string.punctuation
nlp = spacy.load("en_core_web_md")

In [59]:
tweet = df["full_text"][3:4].iloc[0]
print(type(tweet), tweet)

<class 'str'> RT @alicia_keysbr: When someone says #TeamAlicia is the strongest but we already know that 💁🏽 https://t.co/7MDBKkfYJo


In [123]:
import re

def uppercase(text):
    """
    Function that returns number of uppercase letters in tweet.
    """
    # strip Retweet symbol
    text = re.sub(r"RT|#", "", text).strip()
    # use spaCy to remove punctuation
    doc = nlp(text)
    # clean out punctuation
    text = " ".join([token.text for token in doc if not token.is_punct])
    # return number of words that start with uppercase
    return [word for word in text.split() if word[0].isupper()]

In [124]:
uppercase(tweet)

['When', 'TeamAlicia']

In [131]:
df["full_text"][:20].apply(lambda x: len(uppercase(x)))

0      6
1     10
2      3
3      2
4      3
5      2
6      4
7      5
8      7
9     11
10     3
11     6
12     4
13     1
14     1
15     4
16     9
17     6
18     3
19     4
Name: full_text, dtype: int64

In [133]:
from tqdm.notebook import tqdm
tqdm.pandas()

# create new column indicating how many uppercase words there were within text
df["uppercase_words"] = df["full_text"].progress_apply(lambda x: len(uppercase(x)))

HBox(children=(FloatProgress(value=0.0, max=100000.0), HTML(value='')))




In [200]:
def characters(raw_text):
    """
    Function that returns number of characters in tweet.
    """
    # strip Retweet symbol (as it isn't included in character count)
    text = re.sub(r"RT", "", raw_text)
    # replace links with 23 underscores which serve as placeholders (as all links are 23 characters long, per Twitter)
    text = re.sub(r'https?:\/\/.*[\r\n]*', '_' * 23, text, flags=re.MULTILINE)
    
    return text 

In [177]:
# create new column that counts number of characters in Tweet
df["full_text"][:20].progress_apply(lambda x: len(characters(x)))

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




0     127
1     228
2      75
3     115
4     260
5     132
6     146
7     232
8     133
9     115
10     92
11    273
12    165
13     39
14     37
15    138
16    297
17    243
18     90
19    179
Name: full_text, dtype: int64

In [179]:
len(df["full_text"][16:17].iloc[0])

297

In [193]:
text = "The percentage of observations whose character length is greater than 280:"

print(text, (sum(df["full_text"].apply(lambda x: len(characters(x))) > 280) / len(df)) * 100, "%")

The percentage of observations whose character length is greater than 280: 1.281 %


In [194]:
# create new column that counts number of characters in Tweet
df["num_characters"] = df["full_text"].progress_apply(lambda x: len(characters(x)))

HBox(children=(FloatProgress(value=0.0, max=100000.0), HTML(value='')))




In [216]:
def strip_emoji(text):
    RE_EMOJI = re.compile(u'([\U00002600-\U000027BF])|([\U0001f300-\U0001f64F])|([\U0001f680-\U0001f6FF])')
    return RE_EMOJI.sub(r'', text)

In [238]:
#!pip install demoji
import demoji
demoji.download_codes()

[33mDownloading emoji data ...[0m
[92m... OK[0m (Got response in 0.14 seconds)
[33mWriting emoji data to /root/.demoji/codes.json ...[0m
[92m... OK[0m


In [259]:
import numpy as np

def words(text):
    """
    Function that returns number of words in tweet. (strips RT, digits, mentions)
    """
    # strip Retweet symbol
    text = re.sub(r"RT|&amp|\d+th|@[\w]*|http\S+|\n\n|\n", "", text).strip()
    # use spaCy to remove punctuation
    doc = nlp(text)
    # clean out punctuation
    text = " ".join([token.text for token in doc if not token.is_punct])
    # strip emojis from text
    text = demoji.replace(text, "").strip()
    # return average length of words
    return round(np.mean([len(word) for word in text.split()]), 4)

In [260]:
df[3:4]["full_text"].apply(words).iloc[0]

4.9167

In [267]:
# create new column with the average length of word in text
df["avg_lenwords"] = df["full_text"].progress_apply(words)

HBox(children=(FloatProgress(value=0.0, max=100000.0), HTML(value='')))




In [268]:
df[:5]

Unnamed: 0,full_text,clean_tweet,label,retweet,num_mentions,mentions_usernames,num_links,vader_compound,num_hashtags,hashtags_used,uppercase_words,num_characters,avg_lenwords
0,"On October 16th, one supporter will get a chan...",october supporter will chance watch debate wit...,verified,False,0,,1,0.204,0,,6,127,4.3333
1,OMG I’ve been dying to tell you about the new ...,been dying tell about cozy collection pajamas ...,verified,False,1,@skims,2,0.184,0,,10,228,4.0
2,The #SchumerStandard for filling #SCOTUS vacan...,schumer standard filling scotus vacancies,verified,False,0,,1,0.0,2,"#SchumerStandard,#SCOTUS",3,75,7.1667
3,RT @alicia_keysbr: When someone says #TeamAlic...,when someone says team alicia strongest alread...,verified,True,1,@alicia_keysbr,1,0.248,1,#TeamAlicia,2,115,4.9167
4,"“Whether it is because of distance, health pro...",whether because distance health problems work ...,verified,False,1,@NYTParenting,1,0.051,0,,3,260,5.2


In [296]:
def spacy_analyze(text, pos_string):
    """
    Uses spaCy to retrieve data related to an input part-of-speect tag within a tweet.
    """
    # strip Retweet and & symbols
    text = re.sub(r"RT|&amp|http\S+", "", text).strip()
    # create Doc object with tweet
    doc = nlp(text)
    return len([(token.text, token.pos_) for token in doc if token.pos_ == pos_string])

In [298]:
# create new column showing number of nouns in tweet
df["num_nouns"] = df["full_text"].progress_apply(lambda x: spacy_analyze(x, "NOUN"))

HBox(children=(FloatProgress(value=0.0, max=100000.0), HTML(value='')))




In [299]:
df[:4]

Unnamed: 0,full_text,clean_tweet,label,retweet,num_mentions,mentions_usernames,num_links,vader_compound,num_hashtags,hashtags_used,uppercase_words,num_characters,avg_lenwords,num_nouns
0,"On October 16th, one supporter will get a chan...",october supporter will chance watch debate wit...,verified,False,0,,1,0.204,0,,6,127,4.3333,4
1,OMG I’ve been dying to tell you about the new ...,been dying tell about cozy collection pajamas ...,verified,False,1,@skims,2,0.184,0,,10,228,4.0,8
2,The #SchumerStandard for filling #SCOTUS vacan...,schumer standard filling scotus vacancies,verified,False,0,,1,0.0,2,"#SchumerStandard,#SCOTUS",3,75,7.1667,1
3,RT @alicia_keysbr: When someone says #TeamAlic...,when someone says team alicia strongest alread...,verified,True,1,@alicia_keysbr,1,0.248,1,#TeamAlicia,2,115,4.9167,1


In [301]:
len(list(df.columns))

14

In [303]:
# reorganize columns
df = df[
    ["full_text",
     "clean_tweet",
     "retweet",
     "num_mentions",
     "mentions_usernames",
     "num_links",
     "num_hashtags",
     "hashtags_used",
     "uppercase_words",
     "num_characters",
     "avg_lenwords", 
     "num_nouns",
     "vader_compound",
     "label"
    ]
       ].copy()

In [305]:
df.to_json("100k_tweetsjan20.json", orient="split")