In [3]:
import pandas as pd
pd.set_option("display.max_columns", None)

# load in data
df = pd.read_json("json-data/no_retweets.json", orient="split")

In [4]:
df.head()

Unnamed: 0,id_str,screen_name,created_at,lang,source,retweet_count,favorite_count,full_text
0,1209319045411028992,NBCNews,2019-12-24 03:45:03,en,SocialFlow,12,28,A North Carolina man is accused of using eye d...
1,1209315792657043456,NBCNews,2019-12-24 03:32:07,en,SocialFlow,4,16,11 best gifts and gadgets for home cooks. http...
2,1209308993459572736,NBCNews,2019-12-24 03:05:06,en,SocialFlow,14,61,A woman upset that KFC got sandwich wrong call...
3,1209300423456280576,NBCNews,2019-12-24 02:31:03,en,SocialFlow,22,50,California is taking the lead in helping stude...
4,1209297409316196352,NBCNews,2019-12-24 02:19:04,en,SocialFlow,26,106,Lizzo responds to social commentator who says ...


In [5]:
# create a subset of the data to experiment with
subset = df.sample(n=1000, random_state=1)

In [6]:
# check out first few rows
subset.head()

Unnamed: 0,id_str,screen_name,created_at,lang,source,retweet_count,favorite_count,full_text
148187,1067419455158931456,HillaryClinton,2018-11-27 14:06:44,en,Twitter Web Client,381,973,"The National Immigrant Justice Center, or @NIJ..."
12048,1190540505404325888,Google,2019-11-02 08:05:50,en,Conversocial,0,0,@FizanKhan1 Hi Fizan. This guide may help: htt...
167414,562080196489404416,neiltyson,2015-02-02 02:48:56,en,TweetDeck,3621,5050,"A 50-yd field goal, in the University of Phoen..."
192182,1202422229256097792,BuzzFeed,2019-12-05 02:59:34,en,PubHub by BuzzFeed,51,622,"Emma Stone Just Got Engaged To Dave McCary, An..."
122414,1128314104853278720,nyknicks,2019-05-14 15:00:20,en,Spredfast,624,2279,New York or Nowhere vibes. https://t.co/WW41xo...


In [7]:
# get background information on columns
subset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 148187 to 11077
Data columns (total 8 columns):
id_str            1000 non-null int64
screen_name       1000 non-null object
created_at        1000 non-null datetime64[ns]
lang              1000 non-null object
source            1000 non-null object
retweet_count     1000 non-null int64
favorite_count    1000 non-null int64
full_text         1000 non-null object
dtypes: datetime64[ns](1), int64(3), object(4)
memory usage: 70.3+ KB


In [8]:
# reset index so it is easier to work with
subset.reset_index(drop=True, inplace=True)

## _Cleaning Tweet Text_

In [9]:
# check for \n, indicating new line
len(subset[subset["full_text"].str.contains("\n")])

177

In [10]:
# extract five observations that contain \n for verification
verification = subset[subset["full_text"].str.contains("\n")][:5]

verification[:3]

Unnamed: 0,id_str,screen_name,created_at,lang,source,retweet_count,favorite_count,full_text
24,1171586555066142720,MLBNetwork,2019-09-11 00:49:36,en,Twitter Media Studio,19,138,Gavin Lux just hit career homer #1!\n\nEarlier...
26,1198795835288543232,SportsCenter,2019-11-25 02:49:34,en,Spredfast app,474,4363,DEEBO'S TOO QUICK ⚡️\n\n(via @NFL)\nhttps://t....
35,1194416842615345152,NASA,2019-11-13 00:49:01,en,Sprinklr,274,1823,Have plans on Dec. 9-10? 📅\n\nWe’re inviting s...


In [11]:
# focus on just full_text column
for obs in verification["full_text"]:
    print(obs, "\n", "-" * 30)

Gavin Lux just hit career homer #1!

Earlier today, the @Dodgers rookie chatted with @AlexaDatt and @ScottBraun on #TheRundown: https://t.co/kuNtW8tn0O https://t.co/b3UW1ZzPzI 
 ------------------------------
DEEBO'S TOO QUICK ⚡️

(via @NFL)
https://t.co/GNHFzCaKdZ 
 ------------------------------
Have plans on Dec. 9-10? 📅

We’re inviting social media users to apply to visit our rocket factory in New Orleans, see the @NASA_SLS rocket core stage assembled with all four RS-25 engines and learn about our #Artemis program to explore the Moon. Details: https://t.co/5IswTx1ifB https://t.co/hYKnDnwKU3 
 ------------------------------
What do TIE fighters and @NASA_Dawn have in common? Ion engines! 🛰️💨

In this #Maythe4th lesson, students apply Newton's Laws of Motion to model the velocity and distance traveled by spacecraft in our solar system... or in a galaxy far, far away 🌌: https://t.co/p2T2lRG0Cw https://t.co/sW99b0mwMu 
 ------------------------------
Pete Buttigieg's fast rise in rece

From above, we can see that these observations represent a double `\n`, i.e., they represent a double newlines. So we'll see what replacing them looks like below.

In [12]:
# see what happens when we replace \n with a space
for text in verification["full_text"].str.replace("\n\n", " "):
    print(text, "\n")

Gavin Lux just hit career homer #1! Earlier today, the @Dodgers rookie chatted with @AlexaDatt and @ScottBraun on #TheRundown: https://t.co/kuNtW8tn0O https://t.co/b3UW1ZzPzI 

DEEBO'S TOO QUICK ⚡️ (via @NFL)
https://t.co/GNHFzCaKdZ 

Have plans on Dec. 9-10? 📅 We’re inviting social media users to apply to visit our rocket factory in New Orleans, see the @NASA_SLS rocket core stage assembled with all four RS-25 engines and learn about our #Artemis program to explore the Moon. Details: https://t.co/5IswTx1ifB https://t.co/hYKnDnwKU3 

What do TIE fighters and @NASA_Dawn have in common? Ion engines! 🛰️💨 In this #Maythe4th lesson, students apply Newton's Laws of Motion to model the velocity and distance traveled by spacecraft in our solar system... or in a galaxy far, far away 🌌: https://t.co/p2T2lRG0Cw https://t.co/sW99b0mwMu 

Pete Buttigieg's fast rise in recent early-state polls has come with new scrutiny and made him a big target in tomorrow's debate  https://t.co/VDywLbbtTR 



In [18]:
# have 177 observations that contain \n\n, let's clean this up first
subset["full_text"] = subset["full_text"].str.replace("\n\n", " ").copy()

In [19]:
# check for \n\n, indicating new line
len(subset[subset["full_text"].str.contains("\n\n")])

0

In [20]:
# check for observations with only a single \n
len(subset[subset["full_text"].str.contains("\n")])

80

We have `80` observations that contain a single `\n`. Let's address these similar to how we cleaned up the double `\n`, with `str.replace`.

In [21]:
# clean up observations that contain one \n
subset["full_text"] = subset["full_text"].str.replace("\n", " ").copy()

In [22]:
# check to see if there are any observations with \n
len(subset[subset["full_text"].str.contains("\n")])

0

In [23]:
# install tweet-preprocessor library to help with Tweet text cleaning
#!pip install tweet-preprocessor

In [24]:
import preprocessor as p

In [25]:
# text we'll use for preprocessor example, contains user mentions, hashtags and links
subset["full_text"][24]

'Gavin Lux just hit career homer #1! Earlier today, the @Dodgers rookie chatted with @AlexaDatt and @ScottBraun on #TheRundown: https://t.co/kuNtW8tn0O https://t.co/b3UW1ZzPzI'

In [26]:
# example of how tweet-preprocessor cleans a text
p.clean(subset["full_text"][24])

'Gavin Lux just hit career homer ! Earlier today, the rookie chatted with and on :'

From the comparison above, we can see that perhaps, as is, it cleans a little too much of the tweet. However, we can change the settings so that it doesn't eliminate the user mentions or hashtags (which often are used to add meaning to a tweet). However, the link is not all that important in regards to the text; additionally, because tweets are shortened using Twitter's [t.co service](https://help.twitter.com/en/using-twitter/url-shortener), every link is exactly 23 characters in length. 

So, we're going to keep user mentions and hashtags and eliminate links.

In [55]:
# only eliminate URLs
p.set_options(p.OPT.URL, p.OPT.NUMBER)

In [56]:
# example of how tweet-preprocessor now cleans a text after setting option
p.clean(subset["full_text"][24]).replace(":", "").replace("@", "").replace("#", "")

'Gavin Lux just hit career homer 1! Earlier today, the Dodgers rookie chatted with AlexaDatt and ScottBraun on TheRundown'

In [57]:
def tweet_preprocessor(text):
    """"
    First function that uses tweet-preprocesser library to clean Tweets.
    """
    return p.clean(text).replace(":", "").replace("@", "").replace("#", "").replace(" .", ". ")

In [59]:
for text in subset["full_text"][-5:].apply(lambda x: tweet_preprocessor(x)):
    print(text, "\n")

I was in my cell wit my man marked eating nachos turned all the way upp. .. I was happyyyy as shit 

There is a long history of mistakes in capital punishment, especially when it comes to Black and Brown people. Rodney Reed's execution should be halted—and it's time to end the death penalty for good. 

Israel’s beacon of democracy must always shine through the darkness. And America will be right there, standing proudly alongside her. Thank you for visiting the uscapitol today, Mr. Prime Minister netanyahu. 

More time to work in August? We welcome it so we can work on an issue weighing on the pockets of American families health care. More on our plan LIVE 

ladytrigger_ Hi there. Try the steps in this guide to troubleshoot the issue with "Ok Google" .  Keep us posted. 



In [60]:
%%timeit

subset["clean_text"] = subset["full_text"].apply(lambda x: tweet_preprocessor(x))

41.1 ms ± 846 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [61]:
for text in subset["clean_text"][-5:]:
    print(text, "\n")

I was in my cell wit my man marked eating nachos turned all the way upp. .. I was happyyyy as shit 

There is a long history of mistakes in capital punishment, especially when it comes to Black and Brown people. Rodney Reed's execution should be halted—and it's time to end the death penalty for good. 

Israel’s beacon of democracy must always shine through the darkness. And America will be right there, standing proudly alongside her. Thank you for visiting the uscapitol today, Mr. Prime Minister netanyahu. 

More time to work in August? We welcome it so we can work on an issue weighing on the pockets of American families health care. More on our plan LIVE 

ladytrigger_ Hi there. Try the steps in this guide to troubleshoot the issue with "Ok Google" .  Keep us posted. 



In [83]:
import re

def clean_text(text):
    """
    Second Ffunction that takes in Tweet text and cleans it.
    """
    clean = re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", text)
    return clean.strip()

In [84]:
for text in subset["full_text"][:5].apply(lambda x: clean_text(x)):
    print(text, "\n")

The National Immigrant Justice Center or  helps asylumseekers win their cases and fights against renewed attempts to separate and jail more families 

Hi Fizan This guide may help  If something looks outofline we suggest adding extra layers of security to your Google Account with these tips 

A 50yd field goal in the University of Phoenix Stadium deflects about onethird inch to the right due to Earths rotation 

Emma Stone Just Got Engaged To Dave McCary And She Looks So Happy 

New York or Nowhere vibes 



In [74]:
for text in subset["full_text"][:5].apply(lambda x: tweet_preprocessor(x)):
    print(text, "\n")

The National Immigrant Justice Center, or NIJC, helps asylum-seekers win their cases and fights against renewed attempts to separate and jail more families. 

FizanKhan1 Hi Fizan. This guide may help.  If something looks out-of-line, we suggest adding extra layers of security to your Google Account with these tips.  

A -yd field goal, in the University of Phoenix Stadium, deflects about one-third inch to the right due to Earth's rotation 

Emma Stone Just Got Engaged To Dave McCary, And She Looks So Happy 

New York or Nowhere vibes. 



In [86]:
# assign lowercase to every word in the text
subset["full_text"][24].lower()

'gavin lux just hit career homer #1! earlier today, the @dodgers rookie chatted with @alexadatt and @scottbraun on #therundown: https://t.co/kuntw8tn0o https://t.co/b3uw1zzpzi'

In [87]:
# delete links from text
re.sub(r"http\S+", "", subset["full_text"][24])

'Gavin Lux just hit career homer #1! Earlier today, the @Dodgers rookie chatted with @AlexaDatt and @ScottBraun on #TheRundown:  '

In [88]:
# matches any non-alphanumeric character (i.e. special characters)
re.sub("\W", " ", subset["full_text"][24])

'Gavin Lux just hit career homer  1  Earlier today  the  Dodgers rookie chatted with  AlexaDatt and  ScottBraun on  TheRundown  https   t co kuNtW8tn0O https   t co b3UW1ZzPzI'

In [90]:
# matches any whitespace character
re.sub("\s+", " ", subset["full_text"][24])

'Gavin Lux just hit career homer #1! Earlier today, the @Dodgers rookie chatted with @AlexaDatt and @ScottBraun on #TheRundown: https://t.co/kuNtW8tn0O https://t.co/b3UW1ZzPzI'

In [92]:
re.sub("[^\w\s]", "", subset["full_text"][24])

'Gavin Lux just hit career homer 1 Earlier today the Dodgers rookie chatted with AlexaDatt and ScottBraun on TheRundown httpstcokuNtW8tn0O httpstcob3UW1ZzPzI'

In [95]:
test = re.sub(r"http\S+|[^\w\s]", "", subset["full_text"][24]).strip(" ")

'Gavin Lux just hit career homer 1 Earlier today the Dodgers rookie chatted with AlexaDatt and ScottBraun on TheRundown'

In [96]:
def text_clean(text):
    """
    Third function for cleaning text.
    """
    # delete links and any alphanumeric character and the underscore, strips whitespace
    text = re.sub(r"http\S+|[^\w\s]", "", text).strip(" ")
    # replaces any non-alphanumeric character with blank space
    text = re.sub("\W", " ", text)
    return text

In [100]:
for text in subset["full_text"][24:26].apply(lambda x: tweet_preprocessor(x)):
    print(text, "\n")

Gavin Lux just hit career homer 1! Earlier today, the Dodgers rookie chatted with AlexaDatt and ScottBraun on TheRundown 

.burnaboy is being sampled by everyone, including Drake, and was named Best International Act by BET for his Afrofusion style. Watch the full interview here 



In [101]:
for text in subset["full_text"][24:26].apply(lambda x: clean_text(x)):
    print(text, "\n")

Gavin Lux just hit career homer 1 Earlier today the  rookie chatted with  and  on TheRundown 

is being sampled by everyone including Drake and was named Best International Act by  for his Afrofusion style Watch the full interview here 



In [102]:
for text in subset["full_text"][24:26].apply(lambda x: text_clean(x)):
    print(text, "\n")

Gavin Lux just hit career homer 1 Earlier today the Dodgers rookie chatted with AlexaDatt and ScottBraun on TheRundown 

burnaboy is being sampled by everyone including Drake and was named Best International Act by BET for his Afrofusion style Watch the full interview here 

