Oulu_NLPTM_TwitterBrexit Data Collection and Preprocessing

In [None]:
import nltk
# Fetch the NLTK resources used further down: the stop-word corpus and the
# 'punkt' tokenizer data.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

In [None]:
import os
import tweepy as tw
import pandas as pd

import re
import string
from unidecode import unidecode

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from nltk.stem import PorterStemmer

import sys
sys.path.insert(0, '../config')

from twitter_dev_access import *


# Global Parameters
# English stop words from the NLTK corpus, kept in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# The number of tweets requested per user is tweets_per_page * no_of_pages.
# Note: retweets are removed from the results, but they still count toward this number.
tweets_per_page = 100 # max 200
no_of_pages = 12

In [None]:
# Access Twitter: authenticate via OAuth and build the API client used below.
# NOTE(review): consumer_key, consumer_secret, access_token and access_token_secret
# are not defined in this file; presumably they come from the star import of
# twitter_dev_access -- confirm.
auth = tw.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# wait_on_rate_limit=True: presumably makes the client wait when the rate limit
# is reached instead of raising -- confirm against the installed tweepy version.
api = tw.API(auth, wait_on_rate_limit=True)


In [None]:
# Twitter handles whose timelines are collected, grouped by party.
# Conservative accounts: two named accounts first ...
conservatives = ['@BorisJohnson', '@theresa_may']
# ... followed by further Conservative handles (by the variable name, House of Commons members).
conservative_house_of_commons = ['@nadams', '@BimAfolami', '@AdamAfriyie', '@imranahmadkhan', '@peter_aldous', '@lucyallan', '@amessd_southend', '@Stuart4WolvesSW', '@Caroline_Ansell', '@AthertonNWales', '@GarethBaconMP', '@richardbaconmp', '@KemiBadenoch', '@ShaunBaileyUK', '@Siobhan_Baillie']
# Merge both groups in place so `conservatives` holds every Conservative handle.
conservatives.extend(conservative_house_of_commons)
# Labour handles.
labour = ['@HackneyAbbott', '@Debbie_abrahams', '@rushanaraali', '@TahirAliMP', '@DrRosena', '@MikeAmesburyMP', '@PutneyFleur', '@ToniaAntoniazzi', '@JonAshworth', '@PaulaBarkerMP', '@ApsanaBegumMP', '@hilarybennmp', '@_OliviaBlake', '@PaulBlomfieldMP', '@TracyBrabin', '@BenPBradshaw', '@KevinBrennanMP', '@lynbrownmp']

In [None]:
def collect_tweets(twitterusers, max_tweets_per_user=1000):
    """Collect the full text of tweets from each account in ``twitterusers``.

    For every screen name the user timeline is read page by page through the
    module-level ``api`` client, using the global ``tweets_per_page`` and
    ``no_of_pages`` settings. Retweets are excluded and the extended tweet
    mode is requested so that ``full_text`` is available.

    Args:
        twitterusers: Sequence of Twitter screen names.
        max_tweets_per_user: Upper bound on the number of tweets kept per
            user; only the first ``max_tweets_per_user`` tweets returned by
            the timeline are kept. Pass ``None`` to keep everything that
            was fetched. Defaults to 1000.

    Returns:
        A flat list with the tweet texts of all users, in request order.
    """
    tweet_list = []
    total_users = len(twitterusers)

    for n, twitter_name in enumerate(twitterusers, start=1):
        # Get tweets for the user, exclude retweets and request the full tweet content.
        pages = tw.Cursor(
            api.user_timeline,
            screen_name=twitter_name,
            include_rts=False,
            tweet_mode="extended",
            count=tweets_per_page,
        ).pages(no_of_pages)

        # Flatten the pages into one list of texts and cap it per user.
        tweet_texts = [tweet.full_text for page in pages for tweet in page][:max_tweets_per_user]

        print(f'({n}/{total_users}) Collected {len(tweet_texts)} tweets from {twitter_name}')

        tweet_list.extend(tweet_texts)

    print(f'All tweets collected for this party: {len(tweet_list)}')

    return tweet_list

In [None]:
def clean(tweet):
    """Normalise a raw tweet into a space-separated string of stemmed tokens.

    Steps, in order: transliterate to ASCII, lowercase, strip URLs, strip
    ``@user`` mentions and ``#`` signs, strip punctuation, tokenize, drop
    English stop words (module-level ``stop_words``) and apply the Porter
    stemmer.

    Args:
        tweet: Raw tweet text.

    Returns:
        The cleaned tweet as a single string of stemmed tokens joined by
        single spaces (empty string if nothing is left).
    """
    # from https://www.pluralsight.com/guides/building-a-twitter-sentiment-analysis-in-python

    # Transliterate non-ASCII characters (emotes etc.) to ASCII first and
    # lowercase afterwards: the transliteration itself can yield uppercase
    # letters (e.g. for stylised Unicode letters), which would otherwise slip
    # past the lowercase stop-word filter below.
    tweet = unidecode(tweet).lower()

    # Remove urls
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE)
    # Remove user @ references and '#' from tweet
    tweet = re.sub(r'\@\w+|\#', '', tweet)
    # Remove punctuation
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))

    # Remove stopwords
    tweet_tokens = word_tokenize(tweet)
    filtered_words = [w for w in tweet_tokens if w not in stop_words]

    # Reduce every remaining token to its stem
    ps = PorterStemmer()
    return " ".join(ps.stem(w) for w in filtered_words)
    

In [None]:
def store(tweet_list, filename):
    """Write the tweets to ``filename``, one per line, and report the path.

    The file is overwritten if it exists. It is always written as UTF-8 so
    the result does not depend on the platform's default encoding and
    non-ASCII text cannot fail to encode. A tweet that itself contains a
    newline will span several lines in the output.

    Args:
        tweet_list: Iterable of tweet strings.
        filename: Path of the output text file.
    """
    with open(filename, 'w', encoding='utf-8') as f:
        f.writelines(f"{tweet}\n" for tweet in tweet_list)
    print(f"saved tweets to {filename}")

In [None]:
# Download the raw tweet texts for all Conservative accounts.
conservatives_tweets = collect_tweets(conservatives)

In [None]:
# Run every collected Conservative tweet through the cleaning pipeline.
cleaned_conservative = list(map(clean, conservatives_tweets))

In [None]:
# Persist the cleaned Conservative tweets, one per line.
store(cleaned_conservative, "conservative_tweets_preprocessed.txt")

In [None]:
# Download the raw tweet texts for all Labour accounts.
labour_tweets = collect_tweets(labour)

In [None]:
# Run every collected Labour tweet through the cleaning pipeline.
cleaned_labour = list(map(clean, labour_tweets))

In [None]:
# Persist the cleaned Labour tweets, one per line.
store(cleaned_labour, "labour_tweets_preprocessed.txt")