In [1]:
import numpy as np
import pandas as pd
import csv
import re

from sklearn.utils import shuffle
from nltk.tokenize import TweetTokenizer

## Load Labelled Brexit tweets

In [2]:
# Load the labelled tweets array from disk and wrap it in a DataFrame.
# NOTE(review): if the .npy file stores an object-dtype array, NumPy >= 1.16.3
# refuses to load it unless allow_pickle=True is passed — confirm the dtype.
dataset = np.load('data/labeled_tweets.npy')
train_test_dataset = pd.DataFrame.from_records(dataset)

In [3]:
# Name the two columns, then preview the first rows.
train_test_dataset.columns = ['tweets', 'label']
train_test_dataset.head()

Unnamed: 0,tweets,label
0,"Correct that's because, Mr #Blair you never go...",leave
1,Safer In #EU? No! No! No! Terrorists want the ...,leave
2,This. https://t.co/WRtzpWsxiT,leave
3,We're going to be swamped by Turks and other f...,leave
4,#Lexit not #brexit! Public event with great sp...,leave


In [4]:
# Total number of labelled rows, followed by the row count per label.
print('Dataset for train and test the model: ', len(train_test_dataset))
train_test_dataset.groupby('label').count()

Dataset for train and test the model:  500


Unnamed: 0_level_0,tweets
label,Unnamed: 1_level_1
leave,250
stay,250


#### Data preprocessing

In [5]:
# Shared tokenizer: lower-cases tokens, shortens over-long character runs and
# leaves @handles in place (preprocess_tweet removes them itself).
tknzr = TweetTokenizer(
    preserve_case=False,
    reduce_len=True,
    strip_handles=False,
)

# Regex flags applied to every substitution in preprocess_tweet.
FLAGS = re.DOTALL | re.MULTILINE

def preprocess_tweet(text):
    """Normalise a raw tweet into a lower-cased, space-separated token string.

    URLs, emoticons, hearts, numbers, repeated punctuation, elongated words
    and hashtags are replaced by placeholder tokens (``<url>``, ``<smile>``,
    ``<lolface>``, ``<sadface>``, ``<neutralface>``, ``<heart>``,
    ``<number>``, ``<repeat>``, ``<elong>``, ``<hashtag>``). ``@mentions`` and
    any remaining character from ``!,.?£$%&|()`` are dropped. The result is
    tokenised with the module-level ``tknzr`` and joined with single spaces.

    The punctuation strip deliberately runs *after* the URL, emoticon, number
    and ``<repeat>`` rules: those rules match on ``.``, ``,``, ``!``, ``?``,
    ``|``, ``(`` and ``)``, so stripping first would make ``:)``, ``:(``,
    ``:|``, ``!!!``, ``www.`` URLs and ``1,000`` impossible to recognise.

    Args:
        text: Raw tweet text (``str``).

    Returns:
        The preprocessed tweet as a single string of space-separated tokens.
    """
    # Sub-patterns shared by the emoticon rules.
    eyes = r"[8:=;]"
    nose = r"['`\-]?"

    # Ordered (pattern, replacement) pairs; each rule sees the output of the
    # previous one, so the order is significant.
    # NOTE(review): the reversed-smile alternative also fires on words ending
    # in "d:" (e.g. "said:") — confirm this is acceptable.
    rules = (
        (r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>"),
        # Split words glued together with a slash (URLs are already replaced).
        (r"/", " / "),
        (r"@\w+", ""),
        (r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes), "<smile>"),
        (r"{}{}p+".format(eyes, nose), "<lolface>"),
        (r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes), "<sadface>"),
        (r"{}{}[\/|l*]".format(eyes, nose), "<neutralface>"),
        (r"<3", "<heart>"),
        (r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>"),
        (r"([!?.]){2,}", r"\1 <repeat>"),
        # Leftover punctuation goes only now that the rules above have had a
        # chance to use it.
        (r"[!,.?£$%&|\(\)]", ''),
        (r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>"),
        (r"#\S+", "<hashtag>"),
    )

    for pattern, repl in rules:
        text = re.sub(pattern, repl, text, flags=FLAGS)

    return ' '.join(tknzr.tokenize(text.lower()))

In [6]:
# Preprocess every labelled tweet in one pass. Series.map keeps the result
# aligned with the frame's own index, so this does not rely on the index being
# exactly 0..n-1 the way label-based `['tweets'][i]` lookups in a range loop do.
train_test_dataset['preprocess_tweets'] = train_test_dataset['tweets'].map(preprocess_tweet)

# Persist the labelled set together with its preprocessed text.
train_test_dataset.to_csv('train_test_dataset.csv', sep=',', index = False)

## Preprocess data from Snorkel

In [13]:
# Load the second labelled array and give it the same column names as
# train_test_dataset. `dataset` is reused here, so it no longer refers to the
# array that train_test_dataset was built from.
dataset = np.load('data/new_labelled_brexit.npy')
new_labelled_brexit = pd.DataFrame.from_records(dataset)
new_labelled_brexit.columns = ['tweets', 'label']

In [14]:
# Preview the first rows of the newly loaded tweets.
new_labelled_brexit.head()

Unnamed: 0,tweets,label
0,RT @raymach1: @David_Cameron according to you ...,leave
1,RT @EU_Mainstream: Yet more evidence that #Bre...,stay
2,The issue in Wales is now one of *perception*....,leave
3,RT @nickeardleybbc: Two UKIP members of the We...,leave
4,Thing people are forgetting about the debate i...,leave


In [15]:
# Append the train/test tweets to the newly loaded ones; ignore_index renumbers
# the combined rows 0..n-1 in the same step.
# Note: this overwrites new_labelled_brexit with itself plus train_test_dataset,
# so running the cell a second time appends train_test_dataset again.
new_labelled_brexit = pd.concat(
    [new_labelled_brexit, train_test_dataset],
    sort=False,
    ignore_index=True,
)

In [16]:
# (Re)compute the preprocessed text for every row of the combined frame.
# Series.map keeps the result aligned with the frame's own index instead of
# depending on label-based `['tweets'][i]` lookups over range(len(...)).
new_labelled_brexit['preprocess_tweets'] = new_labelled_brexit['tweets'].map(preprocess_tweet)

In [17]:
# Row count per label in the combined, preprocessed frame.
new_labelled_brexit.groupby('label').count()

Unnamed: 0_level_0,tweets,preprocess_tweets
label,Unnamed: 1_level_1,Unnamed: 2_level_1
leave,1133,1133
stay,856,856


In [18]:

# Save the combined, preprocessed dataset as CSV without the index column.
new_labelled_brexit.to_csv('data/new_labelled_brexit_preproc.csv', sep=',', index = False)