## Practice

Load the tidyverse.

In [22]:
suppressPackageStartupMessages(library(tidyverse))

### 1. Read in tweets data.

In [23]:
uss <- readRDS("../data/uss.rds")
uss_tweets <- readRDS("../data/uss-tweets.rds")

### 2. Merge the two data sets (left join `uss` with `uss_tweets`).

In [24]:
uss <- left_join(uss, uss_tweets, by = "user_id")

In [25]:
head(uss,2)

user_id,ideology,status_id,created_at,screen_name,text,source,display_text_width,reply_to_status_id,reply_to_user_id,⋯,retweet_verified,place_url,place_name,place_full_name,place_type,country,country_code,geo_coords,coords_coords,bbox_coords
7334402,0.06996857,952725846309195781,2018-01-15 02:15:17,alfranken,<U+201C>We bested the Saints!<U+201D> <U+2014>My four-year-old grandson Joe <U+201C>Holy moly!!!<U+201D> <U+2014>Me https://t.co/gSZRDEyJX9,Twitter for iPhone,74,,,,,,,,,,,"NA, NA","NA, NA","NA, NA, NA, NA, NA, NA, NA, NA"
7334402,0.06996857,949789000923078656,2018-01-06 23:45:18,alfranken,"Hey everybody, I have something that's really important to share: Tina Smith, our new U.S. Senator for Minnesota, has shown time and time again that she's committed to fighting for you, and now we need to do the same for her. So please do something for me. Follow @TinaSmithMN. https://t.co/bkgnSERHMQ",Twitter for iPhone,277,,,,,,,,,,,"NA, NA","NA, NA","NA, NA, NA, NA, NA, NA, NA, NA"


In [7]:
library(plyr)

------------------------------------------------------------------------------
You have loaded plyr after dplyr - this is likely to cause problems.
If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
library(plyr); library(dplyr)
------------------------------------------------------------------------------

Attaching package: 'plyr'

The following objects are masked from 'package:dplyr':

    arrange, count, desc, failwith, id, mutate, rename, summarise,
    summarize

The following object is masked from 'package:purrr':

    compact



In [26]:
count(uss, 'source')

source,freq
Buffer,27
Echofon,1
Hootsuite,69
Instagram,71
Media Studio,403
Periscope,1
TweetDeck,1805
Twitter Ads,24
Twitter Ads Composer,35
Twitter Lite,8


### 3. Categorize the **source** of the statuses as iphone, ipad, android, web client, or tweet deck, Store this information as a variable `mobile` in the data.

In [27]:
uss <- uss %>%
    mutate(
        mobile = case_when(
            grepl("iphone", source, ignore.case = TRUE) ~ "iphone",
            grepl("ipad", source, ignore.case = TRUE) ~ "ipad",
            grepl("android", source, ignore.case = TRUE) ~ "android",
            grepl("web client", source, ignore.case = TRUE) ~ "web client",
            grepl("tweetdeck", source, ignore.case = TRUE) ~ "tweetdeck",
            TRUE ~ "other"
        )
    )

In [28]:
count(uss,'mobile')

mobile,freq
android,103
ipad,162
iphone,2463
other,657
tweetdeck,1805
web client,4709


### 4. Compare group means of `retweet_count` and `favorite_count`.

In [29]:
uss %>%
    group_by(mobile) %>%
    summarise(retweets = mean(retweet_count, na.rm = TRUE),
              favorites = mean(favorite_count, na.rm = TRUE))

retweets,favorites
465.1783,888.6977


### 5. Define several new features by creating the following variables

tweet_chars
n_hashtags
n_links
n_first_person
n_punct
n_mentions
n_words
char_per_word
n_capitals

In [30]:
uss <- uss %>%
    mutate(
        tweet_chars = nchar(text),
        n_hashtags = stringr::str_count(text, "#\\S+"),
        n_links = stringr::str_count(text, "https?:"),
        n_first_person = stringr::str_count(text, "\\bI\\b|\\bme\\b"),
        n_punct = stringr::str_count(text, "[[:punct:]]"),
        n_mentions = stringr::str_count(text, "@\\S+"),
        n_words = stringr::str_count(text, "\\w+"),
        char_per_word = tweet_chars / n_words,
        n_capitals = stringr::str_count(text, "[A-Z]"))

### 6. Group by `mobile` and compare the two groups along the features you made in the previous question


In [31]:
uss %>%
    select(mobile, tweet_chars:n_capitals) %>%
    group_by(mobile) %>%
    summarise_all(mean, na.rm = TRUE)

mobile,tweet_chars,n_hashtags,n_links,n_first_person,n_punct,n_mentions,n_words,char_per_word,n_capitals
android,191.2524,0.368932,0.6116505,0.4757282,8.31068,0.4466019,31.80583,6.070348,10.980583
ipad,131.3765,0.5185185,0.4444444,0.2777778,7.067901,0.4691358,22.2037,5.996217,8.209877
iphone,160.2651,0.4681283,0.6146975,0.2813642,8.885911,0.9281364,26.40723,6.244554,11.161592
other,180.5686,0.5213415,1.0503049,0.3307927,10.495427,0.3887195,30.1311,6.106561,12.300305
tweetdeck,180.2704,0.5495845,0.7429363,0.3218837,9.570637,0.7944598,29.40499,6.223902,12.224931
web client,188.2805,0.5602039,0.7094925,0.3074963,9.858569,0.776598,30.8316,6.189428,11.996178


In [37]:
##extra added tests
str(uss$text)

 chr [1:9899] "<U+201C>We bested the Saints!<U+201D> <U+2014>My four-year-old grandson Joe\n\n<U+201C>Holy moly!!!<U+201D> <U+"| __truncated__ ...


### 7. Clean the text- remove mentions, URLs, punctuation, and numbers.

In [32]:
uss$text2 <- stringr::str_replace_all(uss$text, "@\\S+", "")
uss$text2 <- stringr::str_replace_all(uss$text2, "https?:[[:graph:]]+", "")
uss$text2 <- stringr::str_replace_all(uss$text2, "[[:punct:]]+\\b|\\b[[:punct:]]+", "")
uss$text2 <- stringr::str_replace_all(uss$text2, "[0-9]", "")
uss$text2 <- tolower(uss$text2)

### 8. Tokenize text into words.

In [33]:
uss$words <- strsplit(uss$text2, "\\s+")

In [34]:
head(uss$words,3)

In [35]:
head(uss,3)

user_id,ideology,status_id,created_at,screen_name,text,source,display_text_width,reply_to_status_id,reply_to_user_id,⋯,n_hashtags,n_links,n_first_person,n_punct,n_mentions,n_words,char_per_word,n_capitals,text2,words
7334402,0.06996857,952725846309195781,2018-01-15 02:15:17,alfranken,<U+201C>We bested the Saints!<U+201D> <U+2014>My four-year-old grandson Joe <U+201C>Holy moly!!!<U+201D> <U+2014>Me https://t.co/gSZRDEyJX9,Twitter for iPhone,74,,,,0,1,0,17,0,17,5.764706,13,we bested the saints my fouryearold grandson joe holy moly me,"we , bested , the , saints , my , fouryearold, grandson , joe , holy , moly , me"
7334402,0.06996857,949789000923078656,2018-01-06 23:45:18,alfranken,"Hey everybody, I have something that's really important to share: Tina Smith, our new U.S. Senator for Minnesota, has shown time and time again that she's committed to fighting for you, and now we need to do the same for her. So please do something for me. Follow @TinaSmithMN. https://t.co/bkgnSERHMQ",Twitter for iPhone,277,,,,0,1,2,18,1,56,5.375,20,hey everybody i have something thats really important to share tina smith our new us senator for minnesota has shown time and time again that shes committed to fighting for you and now we need to do the same for her so please do something for me follow,"hey , everybody, i , have , something, thats , really , important, to , share , tina , smith , our , new , us , senator , for , minnesota, has , shown , time , and , time , again , that , shes , committed, to , fighting , for , you , and , now , we , need , to , do , the , same , for , her , so , please , do , something, for , me , follow"
7334402,0.06996857,946543634496806912,2017-12-29 00:49:23,alfranken,"Working on behalf of the people of Minnesota has been the honor of my life and I could not have done this without the help of my family, friends, and supporters. Thank you. https://t.co/FQwc5qYhHP",Twitter Web Client,196,,,,0,1,1,9,0,37,5.297297,9,working on behalf of the people of minnesota has been the honor of my life and i could not have done this without the help of my family friends and supporters thank you,"working , on , behalf , of , the , people , of , minnesota , has , been , the , honor , of , my , life , and , i , could , not , have , done , this , without , the , help , of , my , family , friends , and , supporters, thank , you"


### 9. Remove stop words any any words that don't consist of letters.

In [17]:
## use stopwords list from rtweet
stopwords <- rtweet::stopwordslangs$word[rtweet::stopwordslangs$p > .999]

## remove stopwords
uss$words <- lapply(uss$words, function(x) return(x[!tolower(x) %in% c("", stopwords)]))
    
## remove all non-letter characters and drop empty tokens
uss$words <- lapply(uss$words, function(x) {
    x <- stringr::str_replace_all(x, "\\W", "")
    x[x != ""]
})

### 10. Create a word frequency table for liberal accounts (ideology scores of less than .5) and conservative accounts (ideology scores of greater than .5).

In [18]:
wds_lib <- table(unlist(uss$words[uss$ideology < .5]))
wds_con <- table(unlist(uss$words[uss$ideology > .5]))

In [19]:
top100_con <- names(sort(wds_con, decreasing = TRUE)[1:100])
top100_lib <- names(sort(wds_lib, decreasing = TRUE)[1:100])

r <- wds_con[names(wds_con) %in% c(top100_con, top100_lib)]
d <- wds_lib[names(wds_lib) %in% c(top100_con, top100_lib)]

In [20]:
## conservative words data frame
cdf <- data_frame(
    ideology = "Conservative",
    word = names(r),
    n = as.integer(r)
)
## liberal words data frame
ldf <- data_frame(
    ideology = "Liberal",
    word = names(d),
    n = as.integer(d)
)
## merge into one
df <- full_join(cdf, ldf)

## convert to wide form (variable for each ideology group's word count)
df <- spread(df, ideology, n)

## calculate differences in word frequencies and arrange by biggest differences
df %>%
    mutate(diff = Liberal - Conservative) %>%
    arrange(-abs(diff))

Joining, by = c("ideology", "word", "n")


word,Conservative,Liberal,diff
taxreform,248.0,1.0,-247.0
dreamers,19.0,246.0,227.0
tax,337.0,111.0,-226.0
senate,441.0,268.0,-173.0
reform,157.0,21.0,-136.0
gun,19.0,148.0,129.0
health,170.0,292.0,122.0
government,195.0,77.0,-118.0
trumps,26.0,143.0,117.0
protect,107.0,213.0,106.0
