### Tweets Classification Using word2vec Word Embeddings as Features
Here, we'll predict the political party ('Conservative' or 'Labour') of a Member of Parliament (MP) of the UK Parliament based on their tweets. The data (tweets of ~500 MPs since Aug 23, 2020) were extracted using v2 of the Twitter API.

In [1]:
# Setup Library imports.
import nltk
import numpy as np
import sklearn
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import gensim as gs

# Download the NLTK stop-word corpus that is read later via nltk.corpus.stopwords.
# NOTE(review): nltk.word_tokenize and WordNetLemmatizer are also used further down;
# presumably their NLTK resources are already installed on this machine, since they
# are not downloaded here — TODO confirm for a fresh environment.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hamzaliaqet/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Load the MPs' tweets from CSV into a dataframe.
tweets_df = pd.read_csv("UK_MPs_tweets/MPsTweets_from_24Aug_31Aug_2020.csv")
print('total tweets:', tweets_df.shape[0])  # number of rows read

# Shuffle the rows reproducibly (fixed seed) and renumber the index from 0.
tweets_df = tweets_df.sample(frac=1, random_state=1).reset_index(drop=True)
tweets_df.head()

total tweets: 3464


Unnamed: 0,tweets,MP_twitter_username,Party
0,RT @MoJGovUK: New courtroom protections in som...,edwardtimpson,Conservative
1,Thanks for the warm welcome and for all that y...,JBrokenshire,Conservative
2,@Shaunaryallx @Taurus8Gemini That really isn’t...,RhonddaBryant,Labour
3,RT @KateGreenSU: Today on @GMB I emphasised ho...,Bill_Esterson,Labour
4,RT @SebastianEPayne: Corbyn aide Andrew Murray...,patmcfaddenmp,Labour


In [87]:
# You may define any helper functions in this cell or any other cell if needed.
### BEGIN SOLUTION
# First find hashtags.
def extract_hashtags(tweet_text):
    """Return every hashtag in ``tweet_text`` without its leading '#'.

    A hashtag is a '#' immediately followed by one or more word characters
    (``\\w``: letters, digits, underscore).

    Parameters
    ----------
    tweet_text : str
        Raw tweet text.

    Returns
    -------
    list of str
        Hashtag bodies in order of appearance; empty list if there are none.
    """
    # A raw string is required: '\w' inside a normal string literal is an invalid
    # escape sequence. The capture group makes findall return the text after '#'
    # directly, so no second pass is needed to slice the symbol off.
    return re.findall(r'#(\w+)', tweet_text)

# Split hashtags. Based on Capital letter assumption
def get_words_from_hashtags(hashtag):
    """Break a CamelCase hashtag into its component pieces.

    Every run of one capital letter followed by one or more lowercase letters
    is treated as a word; any text between such runs is kept as its own piece.
    Empty pieces produced by the split are discarded.

    This capital-letter heuristic is an alternative to a probabilistic
    splitter (e.g. wordninja); use whichever yields more words.
    """
    pieces = re.split('([A-Z][a-z]+)', hashtag)
    # filter(None, ...) drops the empty strings re.split leaves around matches.
    return list(filter(None, pieces))
### END SOLUTION

In [88]:
# NLTK's English stop words plus a few extra tokens to drop from tweets
# (presumably URL fragments, the retweet marker and HTML-entity leftovers).
stopwords = {"http", "co", "rt", "amp"}.union(nltk.corpus.stopwords.words('english'))


In [89]:
# Create class (instead of a function) so that we don't have to pass 
# stopwords in every func call

class PreprocessTweets(object):
    """Callable that turns raw tweet text into a list of cleaned tokens.

    Instantiate once with the stop words to drop, then call the instance on
    each tweet: ``preprocess = PreprocessTweets(stopwords); preprocess(text)``.

    Parameters
    ----------
    _stopwords : iterable of str, optional
        Tokens to remove from the final output. Defaults to no stop words.
        The values are copied into a set for fast membership tests.
    """

    # Patterns are compiled once here instead of on every call. Raw strings are
    # used because '\w' in a normal string literal is an invalid escape sequence.
    _T_CO_URL_RE = re.compile(r'https://t\.co/\w{10}')  # dots escaped: match "t.co" literally
    _POSSESSIVE_RE = re.compile(r"'[sS]")               # 's / 'S
    _ALNUM_RE = re.compile(r"[a-zA-Z0-9_]+")            # ASCII letters, digits, underscore

    def __init__(self, _stopwords=None):
        # A None default avoids sharing one mutable list between instances.
        self.stopwords = set(_stopwords) if _stopwords is not None else set()
        # One lemmatizer per preprocessor rather than one per tweet.
        self._lemmatizer = WordNetLemmatizer()

    def __call__(self, tweet_text):
        """Clean and tokenize one tweet.

        Invoked whenever the instance is called like a function.

        Steps: strip t.co links, drop possessive 's and remaining apostrophes,
        tokenize, keep only ASCII alphanumeric/underscore runs, replace hashtag
        tokens by their CamelCase parts, lowercase, lemmatize, remove stop words.

        Parameters
        ----------
        tweet_text : str
            Raw tweet text.

        Returns
        -------
        list of str
            Lowercased, lemmatized tokens with stop words removed.
        """
        ### BEGIN SOLUTION
        hashtags = extract_hashtags(tweet_text)

        # Remove only the 10 chars after t.co/ ; anything else is meaningful.
        text = self._T_CO_URL_RE.sub('', tweet_text)

        # Remove 's, e.g. teacher's => teacher (step 1), then any remaining
        # apostrophe, e.g. won't => wont (step 2).
        text = self._POSSESSIVE_RE.sub('', text)
        text = text.replace("'", '')

        # Keep only the ASCII alphanumeric/underscore runs of every token;
        # extend() avoids rebuilding the whole list for each token.
        alnum_tokens = []
        for word in nltk.word_tokenize(text):
            alnum_tokens.extend(self._ALNUM_RE.findall(word))

        # Drop tokens equal to a hashtag's text, otherwise they would be
        # duplicated by the hashtag words appended below.
        hashtag_set = set(hashtags)
        tokens = [token for token in alnum_tokens if token not in hashtag_set]

        # Append the words each hashtag splits into.
        tokens.extend(part
                      for hashtag in hashtags
                      for part in get_words_from_hashtags(hashtag))

        # Lowercase, then lemmatize, then remove stop words.
        lemmas = (str(self._lemmatizer.lemmatize(token.lower())) for token in tokens)
        return [lemma for lemma in lemmas if lemma not in self.stopwords]
        ### END SOLUTION

In [8]:
# Embedding Vectorizer.
class MeanEmbeddingVectorizer(object):
    """Represent each tokenized document by the mean of its word vectors.

    Parameters
    ----------
    word2vec_model : mapping-like
        Object supporting ``word in model`` and ``model[word]`` (for example a
        gensim ``KeyedVectors`` or a plain dict of numpy arrays).
    dim : int, optional
        Length of the word vectors. When omitted it is read from the model's
        ``vector_size`` attribute if present, otherwise it falls back to 300
        (the value that used to be hard-coded).
    """

    def __init__(self, word2vec_model, dim=None):
        self.word2vec = word2vec_model
        if dim is None:
            # Take the dimensionality from the model so the zero vector used for
            # documents without known words always matches the real vectors.
            dim = getattr(word2vec_model, "vector_size", 300)
        self.dim = dim

    def fit(self, X, y=None):
        """No-op: nothing is learned. ``y`` is optional; returns ``self``."""
        return self

    def transform(self, X_list):
        """Return one mean word vector per document.

        Parameters
        ----------
        X_list : iterable of iterable of str
            Tokenized documents.

        Returns
        -------
        numpy.ndarray
            One row per document. Words missing from the model are skipped; a
            document with no known words gets a vector of ``self.dim`` zeros.
        """
        zero_vector = np.zeros(self.dim)  # built once, reused for every empty document
        doc_vectors = []
        for doc in X_list:
            known = [self.word2vec[w] for w in doc if w in self.word2vec]
            doc_vectors.append(np.mean(known, axis=0) if known else zero_vector)
        return np.array(doc_vectors)

In [9]:
# Load pre-trained word vectors stored in binary word2vec format.
word2vec_model = gs.models.KeyedVectors.load_word2vec_format(
                        'data/deps.words.bin', binary=True)

In [10]:
# Vectorizer that averages the loaded word vectors over the tokens of each tweet.
emb_vectorizer = MeanEmbeddingVectorizer(word2vec_model)


In [93]:
# Extract the tweet text (raw features) and the party labels as separate Series.
raw_features_tweets = tweets_df['tweets']
labels = tweets_df['Party']

In [94]:
# Build the preprocessor once with our stop words, then apply it to every tweet.
# (`preprocess` was previously never defined in the notebook, so this cell failed
# with a NameError on a fresh kernel.)
preprocess = PreprocessTweets(stopwords)
preprocessed_features = raw_features_tweets.apply(preprocess)


In [95]:
# Put the preprocessed token lists and the labels side by side again
# (column-wise concat, aligned on the shared index).
preprocessed_df = pd.concat([preprocessed_features, labels], axis=1)
preprocessed_df.head()

Unnamed: 0,tweets,Party
0,"[mojgovuk, new, courtroom, protection, crown, ...",Conservative
1,"[thanks, warm, welcome, great, meet, scope, te...",Conservative
2,"[shaunaryallx, taurus8gemini, really, true, la...",Labour
3,"[kategreensu, today, gmb, emphasised, importan...",Labour
4,"[sebastianepayne, corbyn, aide, andrew, murray...",Labour


In [96]:
# Split into train (70%) and test (30%) sets; the fixed random_state makes the
# shuffled split reproducible.
train_df, test_df = train_test_split(preprocessed_df, test_size=0.3, 
                                   random_state=42, shuffle=True)

In [97]:
# This is what the train set looks like.
train_df.head()

Unnamed: 0,tweets,Party
1042,"[emilythornberry, anyone, hasnt, seen, want, j...",Labour
1601,"[mennewsdesk, breaking, stockport, released, g...",Labour
3122,"[davdotfo, stopinstockport, wednesday, told]",Labour
2831,"[thought, pop, along, see, commuter, london, p...",Conservative
2927,"[sgmacleanauthor, anyone, remembers, 1970s, un...",Labour


In [98]:
# Extract the tokenized tweets of the train set.
train_corpus = train_df['tweets']

In [99]:
# Embed the training tweets: one mean word vector per tweet.
X_train = emb_vectorizer.transform(train_corpus)

# Convert labels to integers [0,1].
Train_labels = train_df['Party']
y_train = [int(party == 'Conservative') for party in Train_labels]
y_train[:5]  # First 5 labels. 1 for conservative. 0 for labour.

[0, 0, 0, 1, 0]

In [100]:
# Instantiate a support vector classifier with scikit-learn's default settings.
clf = SVC()

In [101]:
# Train the classifier on the embedded train tweets and their integer labels.
clf.fit(X_train, y_train)

SVC()

In [102]:
# Performance on train data. For a classifier, score() is the mean accuracy
# (fraction of correctly predicted labels), not an R2 score.
clf.score(X_train, y_train) 

0.8514851485148515

In [103]:
# Build mean word-embedding features for the test data.
test_corpus = test_df['tweets']
X_test = emb_vectorizer.transform(test_corpus)

# Integer labels: 1 for Conservative, 0 otherwise.
Test_labels = test_df['Party']
y_test = [int(party == 'Conservative') for party in Test_labels]

In [105]:
# Performance on test data. score() is the mean accuracy here, not an R2 score;
# it matches the accuracy_score value computed from the predictions further down.
clf.score(X_test, y_test) # Highest possible = 1

0.8538461538461538

In [106]:
# Predict integer party labels (1 = Conservative, 0 = Labour) for the test tweets.
y_pred = clf.predict(X_test)

In [107]:
# Accuracy on test data: fraction of test tweets whose predicted label equals the true label.
accuracy_score(y_test, y_pred)

0.8538461538461538

Note that our word2vec model was trained on a really small dataset (a few thousand words) rather than the billions of words used for real-world models. This means it did not even recognize most of the words in the tweets dataset (unrecognized words are skipped when averaging, and a tweet with no recognized words is assigned a vector of 0s). Yet it still performed really well.