In [1]:
import pandas as pd
import numpy as np
import re

## Data

### Read in data

**TODO:** Revisit the early lectures and reuse some of their methods for exploring the data beyond just its shape. This would give a fuller picture of the dataset and demonstrate the techniques covered in the course.

In [2]:
## Load the Twitter data with the parsed "date" column as the index,
## then order the rows by date
data = (
    pd.read_csv('../data/tweets_10-24-2020.csv', index_col="date", parse_dates=True)
    .sort_values(by=["date"])
)

## Preview the first rows
data.head()

Unnamed: 0_level_0,id,text,isRetweet,isDeleted,device,favorites,retweets
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2009-05-04 18:54:25,1698308935,Be sure to tune in and watch Donald Trump on L...,f,f,Twitter Web Client,939,519
2009-05-05 01:00:10,1701461182,Donald Trump will be appearing on The View tom...,f,f,Twitter Web Client,259,34
2009-05-08 13:38:08,1737479987,Donald Trump reads Top Ten Financial Tips on L...,f,f,Twitter Web Client,37,15
2009-05-08 20:40:15,1741160716,New Blog Post: Celebrity Apprentice Finale and...,f,f,Twitter Web Client,29,11
2009-05-12 14:07:28,1773561338,"""""""My persona will never be that of a wallflow...",f,f,Twitter Web Client,1877,1321


---

### Create variables

In [3]:
## Create column of hashtags for each tweet
## The pattern is a raw string so the regex escape \S reaches `re` unchanged
## (a plain string literal raises an invalid-escape warning). The lookbehind
## matches the run of non-whitespace characters that directly follows each '#'.
hash_patt = re.compile(r"(?<=#)\S+")

## One list of hashtags per tweet; a tweet with no hashtags gets NaN
## instead of an empty list (an empty list is falsy, so `or` substitutes NaN)
hashtags = [hash_patt.findall(tweet) or float("NaN") for tweet in data.text]

data["hashtags"] = hashtags

In [4]:
## Create column of mentions for each tweet
## The pattern is a raw string so the regex escape \S reaches `re` unchanged
## (a plain string literal raises an invalid-escape warning). The lookbehind
## matches the run of non-whitespace characters that directly follows each '@'.
at_patt = re.compile(r"(?<=@)\S+")

## One list of mentions per tweet; a tweet with no mentions gets NaN
## instead of an empty list (an empty list is falsy, so `or` substitutes NaN)
ats = [at_patt.findall(tweet) or float("NaN") for tweet in data.text]

data["mentions"] = ats

---

### Clean tweets

In [5]:
## Patterns are compiled once, outside the per-tweet work, and written as raw
## strings so the regex escapes reach `re` unchanged (plain string literals
## containing \w, \S, \W raise invalid-escape warnings)
url_patt = re.compile(r"(\w+:\/\/\S+)")
non_word_patt = re.compile(r"\W")
digit_patt = re.compile(r"[0-9]")


def clean_tweet(tweet):
    """Return a lowercased copy of `tweet` with URLs, non-word characters, and digits removed.

    Steps, in order:
      1. Replace every URL (scheme://...) with a space.
      2. Replace every non-word character with a space.
      3. Delete the digits 0-9.
      4. Collapse runs of spaces into one and trim both ends.
      5. Lowercase the result.
    """
    # Remove URLs
    tweet = url_patt.sub(" ", tweet)
    # Replace non-word characters with spaces
    tweet = non_word_patt.sub(" ", tweet)
    # Remove digits
    tweet = digit_patt.sub("", tweet)
    # Remove duplicate, leading, and trailing spaces; at this point the text
    # holds only word characters and spaces, so split/join covers all three
    tweet = " ".join(tweet.split())

    return tweet.lower()


clean_text = [clean_tweet(tweet) for tweet in data.text]

## Add updated text to main data
data["clean_text"] = clean_text

In [6]:
## Inspect the frame with the added hashtags, mentions, and clean_text columns
data

Unnamed: 0_level_0,id,text,isRetweet,isDeleted,device,favorites,retweets,hashtags,mentions,clean_text
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2009-05-04 18:54:25,1698308935,Be sure to tune in and watch Donald Trump on L...,f,f,Twitter Web Client,939,519,,,be sure to tune in and watch donald trump on l...
2009-05-05 01:00:10,1701461182,Donald Trump will be appearing on The View tom...,f,f,Twitter Web Client,259,34,,,donald trump will be appearing on the view tom...
2009-05-08 13:38:08,1737479987,Donald Trump reads Top Ten Financial Tips on L...,f,f,Twitter Web Client,37,15,,,donald trump reads top ten financial tips on l...
2009-05-08 20:40:15,1741160716,New Blog Post: Celebrity Apprentice Finale and...,f,f,Twitter Web Client,29,11,,,new blog post celebrity apprentice finale and ...
2009-05-12 14:07:28,1773561338,"""""""My persona will never be that of a wallflow...",f,f,Twitter Web Client,1877,1321,,,my persona will never be that of a wallflower ...
...,...,...,...,...,...,...,...,...,...,...
2020-10-24 18:54:47,1320076289034850306,Joe Biden = Biggest Tax Increase In History an...,f,f,Twitter for iPhone,57012,11744,,,joe biden biggest tax increase in history and ...
2020-10-24 18:55:38,1320076502839459842,MAKE AMERICA GREAT AGAIN!,f,f,Twitter for iPhone,152417,24873,,,make america great again
2020-10-24 18:56:09,1320076630065373184,AMERICA FIRST!,f,f,Twitter for iPhone,89661,14937,,,america first
2020-10-24 20:11:38,1320095628546899968,Nobody is showing up for Obama’s hate laced sp...,f,f,Twitter for iPhone,57061,10961,,,nobody is showing up for obama s hate laced sp...


---

### Output clean tweets and metadata

In [11]:
## Write the cleaned tweets to disk. The tweet timestamps live in the "date"
## index (set through index_col when the data was read), so the index has to be
## written as well; index = False would drop the dates from the output file.
data.to_csv("../data/tweet_clean.csv", index = True)