Oulu_NLPTM_TwitterBrexit Data Collection and Preprocessing

In [1]:
import nltk
# Fetch the NLTK resources this notebook relies on. 'stopwords' backs the
# stopwords.words('english') call in the parameters cell; 'punkt' is presumably
# the tokenizer data needed by word_tokenize in clean() — confirm.
# The download is skipped when the package is already up to date (see output).
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Julian\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Julian\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
import os
import tweepy as tw

import re
import string
from unidecode import unidecode

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from nltk.stem import PorterStemmer

import sys
# Make the local credentials module in ../config importable from this notebook.
sys.path.insert(0, '../config')

# Import exactly the four credentials the authentication cell uses, instead of
# a wildcard import that hides their origin and can shadow notebook names.
from twitter_dev_access import (
    consumer_key,
    consumer_secret,
    access_token,
    access_token_secret,
)


# Global parameters
# English stop words from NLTK, held as a set; clean() drops tokens found in it.
stop_words = set(stopwords.words('english'))

# Upper bound on the tweets requested per account: tweets_per_page * no_of_pages.
# Retweets are removed from the results but still count toward this number,
# so fewer tweets than the bound may actually be collected for an account.
tweets_per_page = 100 # max 200
no_of_pages = 12

In [3]:
# access twitter
# Authenticate with the consumer key/secret and access token/secret (expected
# to come from the twitter_dev_access import) and build the `api` client that
# collect_tweets() uses.
auth = tw.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# NOTE(review): wait_on_rate_limit=True presumably makes tweepy wait for the
# rate limit to reset instead of raising — confirm for the installed version.
api = tw.API(auth, wait_on_rate_limit=True)


In [4]:
# Conservative MPs sitting in the House of Commons.
conservative_house_of_commons = [
    '@nadams',
    '@BimAfolami',
    '@AdamAfriyie',
    '@imranahmadkhan',
    '@peter_aldous',
    '@lucyallan',
    '@amessd_southend',
    '@Stuart4WolvesSW',
    '@Caroline_Ansell',
    '@AthertonNWales',
    '@GarethBaconMP',
    '@richardbaconmp',
    '@KemiBadenoch',
    '@ShaunBaileyUK',
    '@Siobhan_Baillie',
]

# Full Conservative account list: the two leading accounts come first,
# followed by the House of Commons members above.
conservatives = ['@BorisJohnson', '@theresa_may'] + conservative_house_of_commons

# Labour account list.
labour = [
    '@HackneyAbbott',
    '@Debbie_abrahams',
    '@rushanaraali',
    '@TahirAliMP',
    '@DrRosena',
    '@MikeAmesburyMP',
    '@PutneyFleur',
    '@ToniaAntoniazzi',
    '@JonAshworth',
    '@PaulaBarkerMP',
    '@ApsanaBegumMP',
    '@hilarybennmp',
    '@_OliviaBlake',
    '@PaulBlomfieldMP',
    '@TracyBrabin',
    '@BenPBradshaw',
    '@KevinBrennanMP',
    '@lynbrownmp',
]

In [5]:
def collect_tweets(twitterusers, max_tweets_per_user=1000):
    """Download the tweet texts of every account in ``twitterusers``.

    For each screen name, up to ``no_of_pages`` timeline pages with
    ``tweets_per_page`` tweets each are requested through the module-level
    ``api`` client. Retweets are excluded (``include_rts=False``) and the
    untruncated text is requested (``tweet_mode="extended"``). One progress
    line is printed per account, then a grand total.

    Parameters
    ----------
    twitterusers : sequence of str
        Twitter screen names to fetch.
    max_tweets_per_user : int or None, optional
        Maximum number of tweets kept per account. Defaults to 1000, the cap
        that was previously hard-coded; ``None`` keeps everything returned.

    Returns
    -------
    list of str
        The ``full_text`` of all kept tweets, grouped by account in the
        order of ``twitterusers``.
    """
    tweet_list = []
    total_users = len(twitterusers)

    for n, twitter_name in enumerate(twitterusers, start=1):
        # Page through the user's timeline: no retweets, full tweet content.
        pages = tw.Cursor(
            api.user_timeline,
            screen_name=twitter_name,
            include_rts=False,
            tweet_mode="extended",
            count=tweets_per_page,
        ).pages(no_of_pages)

        # Flatten the pages and keep at most max_tweets_per_user texts.
        tweet_texts = [tweet.full_text for page in pages for tweet in page][:max_tweets_per_user]

        print(f'({n}/{total_users}) Collected {len(tweet_texts)} tweets from {twitter_name}')

        tweet_list.extend(tweet_texts)

    print(f'All tweets collected for this party: {len(tweet_list)}')

    return tweet_list

In [6]:
def clean(tweet):
    """Turn a raw tweet into a space-separated string of stemmed tokens.

    Steps, in order: lowercase; pass through ``unidecode`` (intended to get
    rid of non-ASCII characters such as emotes); strip URLs, ``@user``
    references, ``#`` signs and HTML entities like ``&amp;``; delete all
    ``string.punctuation`` characters; tokenize; drop tokens contained in
    the module-level ``stop_words``; stem the rest with a Porter stemmer.

    NOTE(review): punctuation is removed before the stop-word filter, so
    stop words that contain an apostrophe probably never match — confirm
    whether that is intended.
    """
    # Adapted from https://www.pluralsight.com/guides/building-a-twitter-sentiment-analysis-in-python
    text = unidecode(tweet.lower())

    # URLs first, then @mentions and bare '#', then HTML entities.
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#','', text)
    text = re.sub(r'&\w+;','', text)

    # Delete (not replace) every punctuation character.
    punctuation_stripper = str.maketrans('', '', string.punctuation)
    text = text.translate(punctuation_stripper)

    # Tokenize, skip stop words and stem the rest in a single pass.
    stemmer = PorterStemmer()
    kept_stems = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        kept_stems.append(stemmer.stem(token))

    return " ".join(kept_stems)
    

In [7]:
def store(tweet_list, filename):
    """Write the items of ``tweet_list`` to ``filename``, one per line.

    Parameters
    ----------
    tweet_list : iterable
        Items to write; each is formatted as text and followed by a newline.
    filename : str or path-like
        Destination file. It is opened in mode ``'w'``, so existing content
        is overwritten.

    Prints a confirmation message naming the file once writing is done.
    """
    # Explicit UTF-8 keeps the file identical across platforms. The platform
    # default (e.g. cp1252 on Windows) can fail on non-ASCII text.
    with open(filename, 'w', encoding='utf-8') as f:
        f.writelines(f"{tweet}\n" for tweet in tweet_list)
    print(f"saved tweets to {filename}")

In [8]:
# Download the raw tweet texts for every Conservative account (network-bound;
# progress is printed per account).
conservatives_tweets = collect_tweets(conservatives)

(1/17) Collected 863 tweets from @BorisJohnson
(2/17) Collected 682 tweets from @theresa_may
(3/17) Collected 665 tweets from @nadams
(4/17) Collected 320 tweets from @BimAfolami
(5/17) Collected 1000 tweets from @AdamAfriyie
(6/17) Collected 373 tweets from @imranahmadkhan
(7/17) Collected 817 tweets from @peter_aldous
(8/17) Collected 684 tweets from @lucyallan
(9/17) Collected 998 tweets from @amessd_southend
(10/17) Collected 653 tweets from @Stuart4WolvesSW
(11/17) Collected 570 tweets from @Caroline_Ansell
(12/17) Collected 878 tweets from @AthertonNWales
(13/17) Collected 247 tweets from @GarethBaconMP
(14/17) Collected 23 tweets from @richardbaconmp
(15/17) Collected 471 tweets from @KemiBadenoch
(16/17) Collected 939 tweets from @ShaunBaileyUK
(17/17) Collected 430 tweets from @Siobhan_Baillie
All tweets collected for this party: 10613


In [9]:
# Run every raw Conservative tweet through the clean() preprocessing pipeline.
cleaned_conservative = list(map(clean, conservatives_tweets))

In [10]:
# Persist the preprocessed Conservative tweets, one tweet per line.
store(cleaned_conservative, "conservative_tweets_preprocessed.txt")

saved tweets to conservative_tweets_preprocessed.txt


In [11]:
# Download the raw tweet texts for every Labour account (network-bound;
# progress is printed per account).
labour_tweets = collect_tweets(labour)

(1/18) Collected 747 tweets from @HackneyAbbott
(2/18) Collected 633 tweets from @Debbie_abrahams
(3/18) Collected 248 tweets from @rushanaraali
(4/18) Collected 217 tweets from @TahirAliMP
(5/18) Collected 725 tweets from @DrRosena
(6/18) Collected 212 tweets from @MikeAmesburyMP
(7/18) Collected 715 tweets from @PutneyFleur
(8/18) Collected 488 tweets from @ToniaAntoniazzi
(9/18) Collected 488 tweets from @JonAshworth
(10/18) Collected 402 tweets from @PaulaBarkerMP
(11/18) Collected 475 tweets from @ApsanaBegumMP
(12/18) Collected 457 tweets from @hilarybennmp
(13/18) Collected 558 tweets from @_OliviaBlake
(14/18) Collected 788 tweets from @PaulBlomfieldMP
(15/18) Collected 660 tweets from @TracyBrabin
(16/18) Collected 693 tweets from @BenPBradshaw
(17/18) Collected 397 tweets from @KevinBrennanMP
(18/18) Collected 101 tweets from @lynbrownmp
All tweets collected for this party: 9004


In [12]:
# Run every raw Labour tweet through the clean() preprocessing pipeline.
cleaned_labour = list(map(clean, labour_tweets))

In [13]:
# Persist the preprocessed Labour tweets, one tweet per line.
store(cleaned_labour, "labour_tweets_preprocessed.txt")

saved tweets to labour_tweets_preprocessed.txt
