Oulu_NLPTM_TwitterBrexit Data Collection and Preprocessing

In [1]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Julian\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Julian\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
import os
import tweepy as tw
import pandas as pd

import re
import string
from unidecode import unidecode

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from nltk.stem import PorterStemmer

import sys
sys.path.insert(0, '../config')

from twitter_dev_access import *


# Global Parameters
stop_words = set(stopwords.words('english'))

# amount of tweets collected is tweets_per_page * no_of_pages 
# note that retweets will be directly removed, but are still considered to this number
tweets_per_page = 100 # max 200
no_of_pages = 12

In [3]:
# access twitter
auth = tw.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tw.API(auth, wait_on_rate_limit=True)


In [4]:
conservatives = ['@BorisJohnson', '@theresa_may']
conservative_house_of_commons = ['@nadams', '@BimAfolami', '@AdamAfriyie', '@imranahmadkhan', '@peter_aldous', '@lucyallan', '@amessd_southend', '@Stuart4WolvesSW', '@Caroline_Ansell', '@AthertonNWales', '@GarethBaconMP', '@richardbaconmp', '@KemiBadenoch', '@ShaunBaileyUK', '@Siobhan_Baillie']
conservatives.extend(conservative_house_of_commons)
labour = ['@HackneyAbbott', '@Debbie_abrahams', '@rushanaraali', '@TahirAliMP', '@DrRosena', '@MikeAmesburyMP', '@PutneyFleur', '@ToniaAntoniazzi', '@JonAshworth', '@PaulaBarkerMP', '@ApsanaBegumMP', '@hilarybennmp', '@_OliviaBlake', '@PaulBlomfieldMP', '@TracyBrabin', '@BenPBradshaw', '@KevinBrennanMP', '@lynbrownmp']

In [5]:
def collect_tweets(twitterusers):
    tweet_list = []

    for n, twitter_name in enumerate(twitterusers):
        # get tweets for user, exlude retweets and get full tweet content
        pages = tw.Cursor(api.user_timeline, screen_name=twitter_name, include_rts=False, tweet_mode="extended", count=tweets_per_page).pages(no_of_pages)
        
        # get tweets from pages
        tweet_texts = [tweet.full_text for page in pages for tweet in page][:1000]

        print('(' + str(n+1) + '/' + str(len(twitterusers)) + ') Collected ' + str(len(tweet_texts)) + ' tweets from ' + twitter_name)

        # only add newest 100 tweets to tweet list
        tweet_list.extend(tweet_texts)
    
    print('All tweets collected for this party: '+ str(len(tweet_list)))
    
    return tweet_list

In [6]:
def clean(tweet):
    # from https://www.pluralsight.com/guides/building-a-twitter-sentiment-analysis-in-python
    tweet = tweet.lower()
    
    # Remove unicode characters (emotes etc)
    tweet = unidecode(tweet)
    
    # Remove urls
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE)
    # Remove user @ references and '#' from tweet
    tweet = re.sub(r'\@\w+|\#','', tweet)
    # Remove html encodings like &amp;
    tweet = re.sub(r'&\w+;','', tweet)
    # Remove punctuations
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    
    # Remove stopwords
    tweet_tokens = word_tokenize(tweet)
    filtered_words = [w for w in tweet_tokens if not w in stop_words]
    
    # use stemmer
    ps = PorterStemmer()
    stemmed_words = [ps.stem(w) for w in filtered_words]
    
    return " ".join(stemmed_words)
    

In [7]:
def store(tweet_list, filename):
    with open(filename, 'w') as f:
        for tweet in tweet_list:
            f.write("%s\n" % tweet)
    print(f"saved tweets to {filename}")

In [8]:
conservatives_tweets = collect_tweets(conservatives)

(1/17) Collected 872 tweets from @BorisJohnson
(2/17) Collected 682 tweets from @theresa_may
(3/17) Collected 664 tweets from @nadams
(4/17) Collected 320 tweets from @BimAfolami
(5/17) Collected 1000 tweets from @AdamAfriyie
(6/17) Collected 371 tweets from @imranahmadkhan
(7/17) Collected 817 tweets from @peter_aldous
(8/17) Collected 683 tweets from @lucyallan
(9/17) Collected 997 tweets from @amessd_southend
(10/17) Collected 653 tweets from @Stuart4WolvesSW
(11/17) Collected 570 tweets from @Caroline_Ansell
(12/17) Collected 878 tweets from @AthertonNWales
(13/17) Collected 247 tweets from @GarethBaconMP
(14/17) Collected 23 tweets from @richardbaconmp
(15/17) Collected 471 tweets from @KemiBadenoch
(16/17) Collected 937 tweets from @ShaunBaileyUK
(17/17) Collected 430 tweets from @Siobhan_Baillie
All tweets collected for this party: 10615


In [9]:
cleaned_conservative = [clean(tweet) for tweet in conservatives_tweets]

In [10]:
store(cleaned_conservative, "conservative_tweets_preprocessed.txt")

saved tweets to conservative_tweets_preprocessed.txt


In [11]:
labour_tweets = collect_tweets(labour)

(1/18) Collected 748 tweets from @HackneyAbbott
(2/18) Collected 628 tweets from @Debbie_abrahams
(3/18) Collected 239 tweets from @rushanaraali
(4/18) Collected 217 tweets from @TahirAliMP
(5/18) Collected 722 tweets from @DrRosena
(6/18) Collected 214 tweets from @MikeAmesburyMP
(7/18) Collected 719 tweets from @PutneyFleur
(8/18) Collected 499 tweets from @ToniaAntoniazzi
(9/18) Collected 487 tweets from @JonAshworth
(10/18) Collected 409 tweets from @PaulaBarkerMP
(11/18) Collected 473 tweets from @ApsanaBegumMP
(12/18) Collected 464 tweets from @hilarybennmp
(13/18) Collected 552 tweets from @_OliviaBlake
(14/18) Collected 787 tweets from @PaulBlomfieldMP
(15/18) Collected 662 tweets from @TracyBrabin
(16/18) Collected 687 tweets from @BenPBradshaw
(17/18) Collected 396 tweets from @KevinBrennanMP
(18/18) Collected 102 tweets from @lynbrownmp
All tweets collected for this party: 9005


In [12]:
cleaned_labour = [clean(tweet) for tweet in labour_tweets]

In [13]:
store(cleaned_labour, "labour_tweets_preprocessed.txt")

saved tweets to labour_tweets_preprocessed.txt


0. watch live updat coronaviru 20 octob 2020
1. congratul win second term new zealand pm work togeth tackl climat chang forg excit new trade partnership uk nz great thing look forward futur
2. next week new rapid covid19 test triall across countri enabl us test nh care home staff much frequent
3. watch live updat coronaviru 16 octob 2020
4. updat negoti eu
5. good meet today alway support fantast farmer work tirelessli pandem keep countri fed outsid eu chanc get better fairer deal britain food farm sector
6. simplifi standardis local rule england introduc three local covid alert level level accompani proportion restrict find local covid alert level area postcod checker
7. watch live updat coronaviru 12 octob 2020
8. real risk lost gener girl never return school let happen global recoveri pandem must leav one behind dayofthegirl urg global action stand everi girl right 12 year qualiti educ
9. today recognis local often unsung hero whose hard work dedic help carri us pandem care compass 

1045. wish christian countri around world bless goodfriday
1046. want wish jewish famili gather around seder tabl even happi peac pesach today chanc celebr enorm last contribut made jewish commun uk everi walk life passov
1047. must protect improv environ inherit went us respons pass inherit next gener
1048. en solidarit avec la franc et le francai le cloch de labbay de westminst sonneront 18h43 heur de pari ce soir pour marquer linstant ou le feu sest declar hier
1049. underlin solidar franc peopl bell westminst abbey toll 543pm even mark moment fire began yesterday
1050. thought peopl franc tonight emerg servic fight terribl blaze notredam cathedr
1051. 18 million worker benefit biggest ever increas nationallivingwag month protect lowestpaid worker help build economi work everyon
1052. act protect rent privat sector unfair evict give tenant longterm certainti peac mind deserv
1053. wish commun celebr vaisakhi happi peac time reflect valu truth toler compass hope renew
1054. ban degra

1226. mani congratul chosen candid next gener elect know youll excel job work repres local area
1227. fun afternoon great question pupil burchett green maidenhead help award impress st nichola primari hurst
1228. realli enjoy answer question morn
1229. believ everi fibr cours set right one countri peopl
1230. deal deliv brexit british peopl vote
1231. firmli believ head heart decis best interest entir unit kingdom
1232. uniqu strength ingenu forg global futur countri everi bit excit anyth come togeth secur futur prosper gener come
1233. one hundr year gun fell silent western front everi one us paus reflect immens sacrific made mani armisticeday100
1234. proud repres countri thiepval presid today battlefield franc scar horror war import moment reflect friend partner europ
1235. st symphorien honour lay wreath grave john parr georg ellison first last uk soldier die war rememb everi member arm forc gave live protect hold dear never forget
1236. great open new factori thatcham today meet m

1786. 50000 brit return australia alon last month
1787. idiot tweet someon who never job real world
1788. thank graduat uk govern cheven programm fundrais effort total ship 800 ventil china seek deliveri
1789. sincer condol famili staff member kill myanmar transport covid19 sampl yet anoth remind ceasefir need rakhin face challeng covid19 critic aid worker full access protect
1790. news uk govern charter flight home manila
1791. info govt charter flight back pakistan follow see detail
1792. love cover
1793. thank pleasur construct discuss progress updat help brit return home
1794. hope find explan use bull stori peopl appear fallen
1795. team across asia region 100 focus help british peopl return home updat work far latest advic still need return uk pleas check travel advic
1796. may educ know plenti clever halfwit appear one
1797. spot gaffer
1798. time extraordinari challeng may wish everyon bless easter sunday stayhomestaysaf
1799. spot
1800. bori got power thank great
1801. havent 

2672. spot rishi flow research innov idea underpin uk futur job prosper place world budget2020
2673. exactli right approach chancellor nh must get what need deal coronaviru budget2020
2674. solid open budget chancellor ps30bn fiscal stimulu see small busi worker lowest paid coronaviru much might need short term budget2020
2675. longterm solut adult social care mean must look polici make sensibl decis im glad govern share view meet minist urg boost adult social care precept may well way forward
2676. good serv deleg legisl committe today increas child tax credit disabl benefit children least well household year ahead ensur household gain increas incom moretodo
2677. sadden attend funer wonder dee quick st stephen st agn warm kind fun yet principl profession offic thought john famili goodby nice ladi mayor children would say ill miss
2678. look forward strengthen tie ghana develop nation new independ trade polici im glad govern agre globalbritain
2679. wonder morn beauti rich biodivers c

2757. there still time vote pleas vote conserv let countri break destruct deadlock parliament let move get brexit done focu thing peopl care like nh polic boost job prosper voteconserv getbrexitdon
2758. great see mani peopl campaign conserv win across constitu today ive realli enjoy good wish toot support hope major conserv win well selfconfid open outwardlook nation voteconserv
2759. good see mani peopl motiv vote today spencer denney poll station thank vote matter vote useyourvot voteconserv
2760. im awar problem power waitros sunningdal poll station still open your make way vote park station car park walk across useyourvot
2761. well miser weather whitegrov poll station warfield peopl still turn larg number talk determin cast vote howev vote today thank vote take part democraci pleasevot voteconserv
2762. windsor campaign young old credit parti countri posit campaign throughout determin help candid win conserv major move beyond current gridlock voteconserv
2763. rain pour binfield 

3529. attend excel cross parti event parliament everyon know someon affect mentalhealthawarenessweek
3530. scienc much fear remain eu much fear brexit
3531. prison must begin new chapter life convict look forward queensspeech
3532. nobodi stigmatis mental health issu 2016 astonish 90 mhaw16
3533. statist children care devast could overturn extra bursari
3534. excel articl children care dont choos parent famili circumst
3535. best wish mhaw16 outdat attitud toward mental health need chang
3536. address datchet voteleav eu tonight strong turnout great qa young josephyear 12wa star show put yr poster
3537. great welcom marco pierr white magnific castl hotel windsor top cuisin top hotel top town
3538. dan jarvi put serv petti parti polit dan continu true servant yorkshir
3539. weekend visit hepworth part second round cultur recoveri fund grant delight hepworth receiv ps146726 addit ps1519848 alreadi award cultur organis wakefield hereforcultur
3540. congratul wakefield afc fantast win womb

3596. pleas meet parliament discuss plan wakefield futur advoc wakefield town fund applic continu fight wakefield recoveri growth
3597. appreci real concern within constitu local lockdown measur could impos ask publish data inform decis impos eas specif lockdown measur certain area kirkle greater manchest
3598. yesterday pleas meet chairman creativ director yorkshir imperi brass band band 80 year old regard one greatest countri term reput perform qualiti
3599. congratul everyon receiv gcse result wakefield across countri im pleas hard work reward
3600. hi unabl find correspond pleas send correspond imranmpuk includ full postal address look forward receiv email repli accordingli
3601. im pleas ofqual accept centr assess grade scrap standardis algorithm ensur student abl progress educ back effort downgrad result previou year
3602. fought freedom restor peac world anniversari everi day hereaft rememb vjday75
3603. great visit charlesworth press morn meet leader fantast grow local busi eve

4415. great pleasur meet waveney ambul crew saturday
4416. continu campaign januari fuel duti rise meet hard work local haulier
4417. pleas petrol mp week waveney
4418. plsd see govern given suffolk ps314000 help keep poorest peopl local area warm winter
4419. work trust hospit hard work ambul crew investig caus delay waveney
4420. superfast broadband way waveney eu delay clear waveney beccl bungay lowestoft
4421. detail mobil home bill finalis committe stage week
4422. thank pupil school waveney excel entri christma card competit waveney
4423. welcom support disabl peopl work waveney
4424. david ruffley mp urg chancellor scrap fuel duti increas waveney fairfuel
4425. continu call gvt review busi rate impact hard work local busi unfairli waveney
4426. support mening campaign child victim waveney
4427. campaign fuel cost come month winter kick ahead usual fuel duti rise waveney
4428. welcom govern invest north sea oil ga week waveney lowestoft
4429. must maximis inward invest build huge

4598. financ bill committe 1pm 4307
4599. press minist futur lowestoft magistr court today justic question chamber lowestoft
4600. excel respons idea local peopl interest help run retain librari scc considerin cont
4601. hope get fisheri debat afternoon low speaker list spoken mani time lowestoft cfp
4602. grant leav financ bill committe speak energi bill debat chamber later
4603. rest peac david cairn
4604. great effort trawler boy good solid foundat next season ltfc
4605. use meet riddel construct lowestoft today hear concern
4606. excel trip bell tower beccl today incred view
4607. congratul mark bee new leader scc look forward work mark
4608. met hydrosub recruit lowestoft aim get involv renew recruit
4609. rainbow nurseri lowestoft discuss fund issu morn
4610. subscrib blog
4611. subscrib blog waveney beccl bungay oulton lowestoft
4612. afternoon attend lowestoft samaritan agm valuabl organis support peopl need waveney
4613. surgeri oulton broad librari number import constitu conc

5328. lot telford resid wait news thank glad govt listen tweak scheme coronaviru job retent scheme deadlin extend 19th march luci allan
5329. amaz stori well done
5330. well done furlough scheme cutoff date extend 19 march
5331. errr heard dalian atkinson case
5332. polic twitter shropshir polic apologis twitter joke taser baton shropshir star
5333. clever funni show
5334. perfect time
5335. hmm take bit think
5336. complet disconnect peopl struggl aw time guess tri justifi exist
5337. good choic
5338. one ask question constitu need answer
5339. show brilliant dont miss quiz tonight
5340. coalbrookdal primari isnt
5341. that wonder
5342. arent mp work help constitu except difficult time need help
5343. happi birthday alistair 42nd birthday even better
5344. back day
5345. gorgeou clip politician mostli human bbc news labour jonathan ashworth experi bbc dad moment
5346. 
5347. govern act miscarriag justic industri scale
5348. council leader talk local resili forum known easter weekend t

5478. delight attend open fantast new human block morn
5479. follow question matt hancock thursday pleas visit short street coronaviru test centr southend friday
5480. worldalzheimersday encourag peopl start convers dementia reduc stigma fear surround condit visit get start today
5481. beauti day visit wonder allot
5482. morn pleas join great british septemb clean help keepbritaintidi get involv
5483. dogoftheday
5484. wish jewish commun happi roshhashanah
5485. next constitu surgeri held 1000 1300 friday 2nd octob st peter church eastbourn grove westcliffonsea ss0 0qf appoint must book advanc call 020 7219 3452 email amessduk
5486. delight see constitu lakhbir kulbir sandhu surgeri today good news lakhbir back uk endur sever month czech prison
5487. follow secretari state health social care statement told southend avoid lockdown repetit chao short street test centr earlier week
5488. follow earlier question met secretari state emphasis must take action obscen salari bbc present also l

6208. everi one us make differ take rubbish home us delight govern put much focu environ respons
6209. elect promis honour third read european union withdraw agreement bill approv hous common stop unelect lord leav european union end month brexit
6210. work resid long time regard recent even peopl contact express frustrat concern night flight pollut particular
6211. would like thank everyon team everyon gave first prefer vote
6212. send good wish newli elect speaker team process enabl distinguish friend colleagu polit brutal busi
6213. saw beauti dog walk parliament earlier today get pictur dogoftheday
6214. realli love anim look dogoftheday continu improv anim welfar gener particular stop live export anim
6215. wednesday stand candid becom deputi speaker mp sinc 1983 want serv hous new way hope colleagu give first prefer vote ballot
6216. great govern invest ps80m new fund music school
6217. want everyth possibl preserv environ alreadi improv enhanc
6218. delight addit support made av

6330. fantast learn go receiv two addit scanner one ct one mri improv screen earli diagnosi cancer invest hope save live nh
6331. earlier today spoke lack fund southend west lousi ticket machin instal station
6332. mani peopl seen support sell poppi train station across uk great see bob oleari westcliff station morn collect donat import caus poppyappeal2019
6333. tri buy ticket westcliff station morn ticket offic close machin broken new useless system also pneumat drill use close passeng must health safeti guidanc
6334. make sure tragedi grenfel never ever happen spoken import debat grenfel follow public phase 1 sir martin moorebick report
6335. yesterday host event parliament support childmentalhealthchart 1 8 young peopl least one mental disord six point plan seek ensur mental health servic suitabl 21st centuri children
6336. earlier afternoon spoke westminst hall debat endometriosi 15 million women uk suffer result condit done support endometriosisatwork
6337. ask local health execu

7092. save dgh group met baro cumberleg chair nation matern review week
7093. here hansard link speech debat humanitarian crisi mediterranean europ
7094. court closur consult 1hr nearest court car public transport eastbourn hast
7095. good hear justic minist confirm local propos keep court servic local consid
7096. chamber today justic improv educ prison reduc reoffend hope potenti contribut
7097. congratul eastbourn week two usopen good luck petra kvitova
7098. learn use defibril eastbourn
7099. statement reguge situat yesterday therel announc govern next day
7100. step meet volunt save life event truli awesom work
7101. great meet skateworld today discuss way help support colin jo superb facil
7102. work today propos keep justic court serviceslatest magistr insight
7103. thought us enjoy airshow affect tragic shoreham crash tonight
7104. excel meet today promot sussex busi christma ho ho
7105. purpos meet salvat armi eastbourn talk share concern around homeless
7106. use meet look fo

7915. carersweek join thank carer fantast work wrexham across uk unpaid carer provid lifelin mani commun behalf everyon wrexham thank
7916. id also like thank gather peac wrexham weekend adher social distanc guidelin
7917. statement blacklivesmatt protest
7918. absolut right reassess huawei place 5g network china recent sabrerattl reaffirm belief propos involv economi curtail
7919. glimps ive wrexham week
7920. alway grate courag alway rememb grave sacrific made name freedom human dday76 lestweforget
7921. today 76th anniversari dday join think brave men land beach normandi 76 year ago fought die free evil fascism
7922. england scotland made move toward dentist return work urg
7923. letter offer full support creat crimin offenc trespass
7924. mani wrexham busi alreadi benefit busi rate grant scheme make sure dont miss applic close 30th june inform appli
7925. 6yearold jimmi run mile everi day he run length marathon saveourzoo
7926. biggest chariti zoo uk resid wrexham grate doorstep co

7997. today firefightersmemorialday recognis braveri sacrific firefight lost live whilst protect commun fmd2020
7998. fantast video resourc avail teacher parent suitabl home school classroom environ primari pupil secondari student
7999. commemor 75th anniversari veday75 parliament curat materi archiv visit explor key legisl famou speech stori reconstruct bomb damag
8000. check latest beyondthebubbl podcast
8001. 122346 test carri yesterday surpass govern target well done everyon help achiev
8002. congratul nomin award excel work member volunt tenni vote jonathon
8003. take part favouritef answer question celebr import preciou green space favourit childhood park favourit local park favourit oversea park favourit park memori favourit thing park
8004. record first ever podcast last night discuss everyth effect covid19 wrexham power ballad
8005. colleagu call upon ensur covid19 test avail wale match rest uk peopl wale must left behind suffer
8006. must develop addit test site across wale e

8746. caller right iopc need signific reform
8747. great see former colleagu citi hall make progress nation stage receiv promot make huge contribut countri
8748. 
8749. close 35 fire station clearli dont faintest idea your talk
8750. excel choic great job citi hall make brilliant mp
8751. founder chair incompar relat today present lya annual recommend mayor london citi hall lta would exist without kati head univers leav thrive lya good hand
8752. huge turnout final conserv parti leadership hust backbori
8753. context ps78 million equat 130 polic offic entir wrong prioriti nicetohav essenti get grip london ever increas violent crime epidem far import
8754. pleas see wonder pennoy centr award ps68700 grant govern cultur recoveri fund well done team pennoy hard work especi difficult time
8755. back boriss deal fulfil promis leav eu perfect dump backstop crucial peopl want leav deal focu instead school fund nh cut crime hous getbrexitdon
8756. hard sure qualiti anyth suprem court judgement

8874. excel respons cant overemphasis much preciou time tori mp spend rebut fake news orchestr campaign instead serv constitu send mani sort letter everi week
8875. weird pl follow ill send dm instead definit correct address think know bounc back
8876. youv still got detail sent mep 2015 experi happi share id love read guarante would use could come use futur debat email profil thank
8877. ha sure ive got permiss third would need consult variou stakehold husband whip etc
8878. wasnt end final debat start around 105am home 2am deleg legisl later morn 855amand babi boy wake 542 like clockwork
8879. pl stop retweet propaganda malevol distort polit gain zac say best
8880. good econom news paper today ahead budget boe say pay rise unemploy fall cbi survey show strongest manufactur order 30 yr export order 22yr firm confid growth continu
8881. union came assembl dont want happen
8882. peopl said sameth dockland light railway go happen tim
8883. dear essex follow say get extra invest essex pol

9644. chanc better citi becom opportun rout poverti rout crime know youth worker 20 year give peopl opportun thrive end day want thing
9645. 8pm im debat race here ill say believ equal opportun reason behind london success equal isnt defund polic end capit unlock potenti everi london black white
9646. 
9647. youth worker 20 year ive stop search hundr time ill 8pm talk legaci stephen lawrenc whether britain chang issu shook london uk effect still felt
9648. long way weve come far im confid
9649. thank andrew
9650. 1993 teenag call stephen lawrenc murder black 27 year later see black live matter protest statu toppl histori scrutinis itv want know britain chang tune tomorrow 8pm hear ive got say
9651. spoke polic cut sadiq khan play polit peopl live safeti mayor side law order shouldnt defund polic agre sign petit
9652. said
9653. 
9654. charg polic london announc plan cut polic budget ps110m that 1692 offic street london deserv feel safe agre sign petit
9655. know id prefer
9656. need ge