In [2]:
import pandas as pd
import re
import scipy
import collections
import numpy

#### Import db with labels

In [2]:
# Location of the labelled tweets CSV, relative to the notebook's working directory.
path = 'data/train_test_dataset.csv'
df_positive_negative = pd.read_csv(path)
# Preview the first rows to check which columns were read.
df_positive_negative.head()

Unnamed: 0,mn,2,preprocess_tweets
0,"Correct that's because, Mr #Blair you never go...",leave,correct that's because mr <hashtag> you never ...
1,Safer In #EU? No! No! No! Terrorists want the ...,leave,safer in <hashtag> no no no terrorists want th...
2,This. https://t.co/WRtzpWsxiT,leave,this <url>
3,We're going to be swamped by Turks and other f...,leave,we're going to be swamped by turks and other f...
4,#Lexit not #brexit! Public event with great sp...,leave,<hashtag> not <hashtag> public event with grea...


In [4]:
# Keep only the first two columns, selected by position (the raw tweet text and
# its label in the preview above); the preprocessed-text column is not used here.
tweets = df_positive_negative[df_positive_negative.columns[[0, 1]]]

In [5]:
# Give the two retained columns descriptive names; later cells read
# tweets['tweet'] and tweets['label'].
tweets.columns = ['tweet', 'label']

In [35]:
from sklearn.utils import shuffle

# Shuffle the rows with a fixed seed so that Restart & Run All reproduces the
# same row order. The labelling-function matrix and the gold labels saved
# further down are both derived from this order, so an unseeded shuffle would
# make the saved artefacts differ from run to run.
tweets = shuffle(tweets, random_state=42)
# Drop the pre-shuffle index so rows are numbered 0..n-1 in their new order.
tweets = tweets.reset_index(drop=True)
tweets.head()

Unnamed: 0,tweet,label
0,Stop using my money to peddle pro EU crap #Vot...,leave
1,Spread the Word https://t.co/JDgMoDqLnB #votel...,stay
2,@nanette0803 @viking_tiger @certual @LeaveEUOf...,stay
3,"Trump backs Brexit, brands migration crisis a ...",stay
4,Waiting for backlash as campaign tells Trump t...,stay


## Create labelling functions

### REMAIN

In [36]:
# Common hashtags / keywords used by the "stay in" (REMAIN) side.
# The case-insensitive flag is written once, at the very start of the pattern:
# a global inline flag such as (?i) applies to the whole expression, and
# Python 3.11+ raises re.error when it appears anywhere but the start.
MOST_COMMON_HASHTAG_REMAIN = r"(?i)SayYes2Europe|StrongerIN|bremain|Stay|ukineu|votein|betteroffin|leadnotleave|VoteYES|yes2eu|yestoeu"

In [37]:
def most_common_hashtag(tweet_text):
    """Labelling function for the common REMAIN hashtags.

    Returns 1 when MOST_COMMON_HASHTAG_REMAIN matches anywhere in
    ``tweet_text`` and 0 when it does not.
    """
    if re.search(MOST_COMMON_HASHTAG_REMAIN, tweet_text) is None:
        return 0
    return 1

In [38]:
# Run the REMAIN-hashtag labelling function over every labelled tweet.
label_1_1 = [most_common_hashtag(text) for text in tweets['tweet']]

# Column vector for the label matrix, followed by a count of 0 / 1 votes.
a1 = numpy.array(label_1_1)
collections.Counter(a1)

Counter({0: 398, 1: 102})

In [39]:
# "vote"/"voting", then up to seven whitespace-separated tokens, then "remain";
# or the words "remain" / "voteremain" on their own. Matching is case-insensitive.
# The (?i) flag is written once at the start of the pattern: it is a global
# flag, and Python 3.11+ raises re.error when it appears mid-expression.
# NOTE(review): the bare "Remain" alternative already matches every text the
# other alternatives match, so for re.search this is equivalent to (?i)remain.
VOTE_REMAIN = r"(?i)(?:vote|voting)\s+((?:\S+\s+){0,7}?)remain|Remain|VoteRemain|voteremain"

In [40]:
def def_vote_remain(tweet_text):
    """Labelling function for "vote ... remain" phrasing.

    Returns 1 when VOTE_REMAIN matches anywhere in ``tweet_text`` and 0
    when it does not.
    """
    if re.search(VOTE_REMAIN, tweet_text) is None:
        return 0
    return 1

In [41]:
# Run the "vote ... remain" labelling function over every labelled tweet.
label_2_1 = [def_vote_remain(text) for text in tweets['tweet']]

# Column vector for the label matrix, followed by a count of 0 / 1 votes.
b1 = numpy.array(label_2_1)
collections.Counter(b1)

Counter({0: 442, 1: 58})

### LEAVE

In [42]:
# Hashtags / keywords used by the LEAVE side, matched case-insensitively.
# One alternative per entry; the joined string is the regex alternation.
HASHTAG_LEAVE = "(?i)" + "|".join([
    "euroscepticism",
    "beLeave",
    "betteroffout",
    "britainout",
    "LeaveEU",
    "noTTIP",
    "TakeControl",
    "VoteLeave",
    "VoteNO",
    "voteout",
    "end-of-europe",
    "leaveeuofficial",
    "NoThanksEU",
    "nothankseu",
    "ukleave-eu",
    "vote-leave",
    "leaving EU",
    "strongOut",
    "voteLeave",
    "brexitnow",
    "leaveEUOfficial",
])

In [43]:
def common_hashtag_leave(tweet_text):
    """Labelling function for the common LEAVE hashtags.

    Returns 2 when HASHTAG_LEAVE matches anywhere in ``tweet_text`` and 0
    when it does not.
    """
    if re.search(HASHTAG_LEAVE, tweet_text) is None:
        return 0
    return 2

In [44]:
# Run the LEAVE-hashtag labelling function over every labelled tweet.
label_3_1 = [common_hashtag_leave(text) for text in tweets['tweet']]

# Column vector for the label matrix, followed by a count of 0 / 2 votes.
c1 = numpy.array(label_3_1)
collections.Counter(c1)

Counter({0: 376, 2: 124})

In [45]:
# "vote"/"voting", then up to four whitespace-separated tokens, then "leave";
# or the literal forms "vote_leave", "vote leave", "@vote_leave".
# Matching is case-insensitive. The (?i) flag is written once at the start of
# the pattern: it is a global flag, and Python 3.11+ raises re.error when it
# appears mid-expression.
VOTE_LEAVE = r"(?i)(?:vote|voting)\s+((?:\S+\s+){0,4}?)leave|vote_leave|vote leave|@vote_leave"

In [46]:
def def_vote_leave(tweet_text):
    """Labelling function for "vote ... leave" phrasing.

    Searches ``tweet_text`` for VOTE_LEAVE with re.IGNORECASE and returns 2
    on a match, 0 otherwise.
    """
    found = re.search(VOTE_LEAVE, tweet_text, re.IGNORECASE)
    if found is None:
        return 0
    return 2

In [47]:
# Run the "vote ... leave" labelling function over every labelled tweet.
label_4_1 = [def_vote_leave(text) for text in tweets['tweet']]

# Column vector for the label matrix, followed by a count of 0 / 2 votes.
d1 = numpy.array(label_4_1)
collections.Counter(d1)

Counter({0: 465, 2: 35})

In [48]:
# UKIP / Brexit-related keywords used as a LEAVE signal, matched
# case-insensitively. One alternative per entry; the joined string is the
# regex alternation.
VOTE_LEAVE_UKIP = "(?i)" + "|".join([
    "voteukip",
    "ukip",
    "brexitnow",
    "brexit win",
    "brexit winning",
    "leavingeu",
    "leaveEU",
    "leaveEUOfficial",
    "eureferendum",
    "better off",
])

In [49]:
def def_vote_leave_ukip(tweet_text):
    """Labelling function for UKIP / Brexit keywords.

    Returns 2 when VOTE_LEAVE_UKIP matches anywhere in ``tweet_text`` and 0
    when it does not.
    """
    if re.search(VOTE_LEAVE_UKIP, tweet_text) is None:
        return 0
    return 2

In [50]:
# Run the UKIP / Brexit keyword labelling function over every labelled tweet.
label_5_1 = [def_vote_leave_ukip(text) for text in tweets['tweet']]

# Column vector for the label matrix, followed by a count of 0 / 2 votes.
e1 = numpy.array(label_5_1)
collections.Counter(e1)

Counter({0: 421, 2: 79})

### Create sparse matrix

In [51]:
from scipy.sparse import coo_matrix
import numpy as np

In [52]:
# One row per labelled tweet, one column per labelling function
# (a1..e1, in the order they were defined above).
labelling_function_result = np.column_stack((a1, b1, c1, d1, e1))

In [53]:
# Sanity check: (number of tweets, number of labelling functions).
labelling_function_result.shape

(500, 5)

In [54]:
# Peek at the first five rows; the labelling functions emit 0, 1 or 2.
labelling_function_result[0:5]

array([[0, 0, 2, 0, 0],
       [0, 0, 2, 2, 0],
       [0, 0, 2, 0, 2],
       [0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0]])

In [55]:
# Convert the dense vote array to a sparse COO matrix.
# NOTE: despite its name, `labelling_dense_matrix` holds a *sparse* matrix.
labelling_dense_matrix = coo_matrix(labelling_function_result)

In [57]:
# Switch the sparse matrix from COO to CSR format before saving.
labelling_dense_matrix = labelling_dense_matrix.tocsr()

In [59]:
import scipy.sparse
# Persist the CSR labelling-function matrix for the labelled tweets.
# NOTE: the file holds labelling-function votes, not the gold labels themselves.
scipy.sparse.save_npz('gold_label_matrix.npz', labelling_dense_matrix)

In [60]:
# Numeric class ids, matching the values the labelling functions emit:
# 1 = stay (remain), 2 = leave.
label_ids = {'stay': 1, 'leave': 2}

# Fail loudly on any label outside the mapping. Silently skipping such a row
# would make golden_labels shorter than `tweets` and shift every later label
# against the rows of the labelling-function matrix.
unexpected_labels = set(tweets['label']) - set(label_ids)
if unexpected_labels:
    raise ValueError(
        "Unexpected values in tweets['label']: {!r}".format(unexpected_labels)
    )

# One numeric label per tweet, in the same row order as `tweets`.
golden_labels = [label_ids[label] for label in tweets['label']]

In [61]:
# Convert the list of numeric gold labels to a NumPy array and preview it.
golden_labels = np.asarray(golden_labels)
golden_labels[0:5]

array([2, 1, 1, 1, 1])

In [62]:
# Persist the gold labels; np.save appends the ".npy" extension to the name.
np.save('gold_labels', golden_labels)

### PROCESS DATA FOR FUTURE LABELING

In [4]:
# Unlabelled Brexit tweets; the labelling functions are applied to its '0'
# column in the cells below.
path2 = 'data/brexit_unlabelled.csv'
brexit = pd.read_csv(path2)
brexit.head()

2684

In [64]:
# Run the REMAIN-hashtag labelling function over the unlabelled tweets (column '0').
label_1_2 = [most_common_hashtag(text) for text in brexit['0']]

# Column vector for the unlabelled matrix, followed by a count of 0 / 1 votes.
a2 = numpy.array(label_1_2)
collections.Counter(a2)

Counter({0: 2362, 1: 322})

In [66]:
# Run the "vote ... remain" labelling function over the unlabelled tweets (column '0').
label_2_2 = [def_vote_remain(text) for text in brexit['0']]

# Column vector for the unlabelled matrix, followed by a count of 0 / 1 votes.
b2 = numpy.array(label_2_2)
collections.Counter(b2)

Counter({0: 2342, 1: 342})

In [67]:
# Run the LEAVE-hashtag labelling function over the unlabelled tweets (column '0').
label_3_2 = [common_hashtag_leave(text) for text in brexit['0']]

# Column vector for the unlabelled matrix, followed by a count of 0 / 2 votes.
c2 = numpy.array(label_3_2)
collections.Counter(c2)

Counter({0: 1941, 2: 743})

In [68]:
# Run the "vote ... leave" labelling function over the unlabelled tweets (column '0').
label_4_2 = [def_vote_leave(text) for text in brexit['0']]

# Column vector for the unlabelled matrix, followed by a count of 0 / 2 votes.
d2 = numpy.array(label_4_2)
collections.Counter(d2)

Counter({0: 2516, 2: 168})

In [69]:
# Run the UKIP / Brexit keyword labelling function over the unlabelled tweets (column '0').
label_5_2 = [def_vote_leave_ukip(text) for text in brexit['0']]

# Column vector for the unlabelled matrix, followed by a count of 0 / 2 votes.
e2 = numpy.array(label_5_2)
collections.Counter(e2)

Counter({0: 2496, 2: 188})

### Create sparse matrix without labels

In [71]:
# One row per unlabelled tweet, one column per labelling function
# (a2..e2, same function order as the labelled matrix).
lf_result_2 = np.column_stack((a2, b2, c2, d2, e2))

In [73]:
# Convert the dense vote array to a sparse COO matrix.
lf_matrix_2 = coo_matrix(lf_result_2)

In [75]:
# Switch the sparse matrix from COO to CSR format before saving.
lf_matrix_2 = lf_matrix_2.tocsr()

In [76]:
# Persist the CSR labelling-function matrix for the unlabelled tweets.
scipy.sparse.save_npz('matrix_for_new_labels.npz', lf_matrix_2)