In [28]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm 
import re
from gensim.utils import simple_preprocess 
import gensim
from gensim.models.word2vec import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Read the labeled tweets from 'tweet.json' into a DataFrame.
# Per the preview below, the frame has the columns ID, tweet and label; `tweet`
# appears to hold a list of tweet texts per user and is empty for some users.

tweet_df = pd.read_json('tweet.json')

In [3]:
# Preview the first rows of the labeled-tweets frame.
tweet_df.head()

Unnamed: 0,ID,tweet,label
0,17461978,[RT @CarnivalCruise: 🎉 Are you ready to see wh...,0
1,1297437077403885568,,1
2,17685258,[RT @realDonaldTrump: THANK YOU #RNC2020! http...,0
3,15750898,[A family fears they may have been cheated out...,0
4,1659167666,[RT @VonteThePlug: Yeah but he ain’t got one h...,1


In [4]:
# (rows, columns) of the labeled-tweets frame.
tweet_df.shape

(11826, 3)

In [5]:
# Read the profile data from 'profile.json' into a DataFrame.
# NOTE(review): profile_df is only inspected for its shape in this section.
profile_df = pd.read_json('profile.json')

In [6]:
# (rows, columns) of the profile frame; compare with tweet_df.shape above.
profile_df.shape

(11826, 3)

Preprocess the tweets before training the word2vec model

In [10]:
def change_tweets_to_doc(row):
    """Join one user's tweets into a single space-separated document.

    Parameters
    ----------
    row : iterable of str, None, or float NaN
        The content of one `tweet` cell: the user's tweets, or a missing
        value (None, or the float NaN pandas uses for missing data).

    Returns
    -------
    str or float
        The tweets joined with single spaces, or ``np.nan`` when the cell
        is missing.
    """
    # A missing cell can be None or a float NaN; joining a float would raise
    # TypeError, so both are mapped to NaN.
    if row is None or (isinstance(row, float) and np.isnan(row)):
        return np.nan
    return ' '.join(row)

In [11]:
# tweets_doc = tweet_df.copy()
# tweets_doc.loc[:,'tweet'] = tweet_df['tweet'].apply(change_tweets_to_doc)
# tweets_doc.shape


Taking the first 5 rows as an example (4 of these users have tweets, one has none): we explode the tweets to have one tweet per row and drop the rows without a tweet

In [12]:
# First five rows only: one tweet per row, rows with missing values removed.
all_tweets_doc = (
    tweet_df
    .head(5)
    .explode('tweet')
    .dropna()
)
all_tweets_doc.shape

(688, 3)

In [13]:
# One tweet text per row; the index repeats the row label of the originating user.
all_tweets_doc.tweet

0    RT @CarnivalCruise: 🎉 Are you ready to see wha...
0    Who has time for receipts? Not me. @epson rece...
0    Steady wants to encourage you to invest in you...
0    Good one, @rishid. But let’s see if y'all can ...
0                                 #lsunationalchamps\n
                           ...                        
4    When you locking the doors at 10 and a custome...
4    Album Out Now GO GET IT 🔥🔥💪🏾 https://t.co/7X4e...
4    I make hits, you fuck niggas gon learn that th...
4    I’m the illest up &amp; Coming outta Carolina ...
4    Treat me like a king baby talk to me nice 🥰😈 h...
Name: tweet, Length: 688, dtype: object

In [14]:
# Tokenize every tweet with gensim's simple_preprocess; tqdm shows progress over the tweets.
# The result is one list of tokens per tweet.

simple_tokenized = list(map(simple_preprocess, tqdm(all_tweets_doc.tweet)))

  0%|          | 0/688 [00:00<?, ?it/s]

We now have a list of tokenized tweets (one list of words per tweet) from the 4 users; we will train the w2v model on them

Training word2vec model on our corpus

In [24]:
# Train a word2vec model on the tokenized tweets.
# NOTE(review): min_count=1 presumably keeps every token in the vocabulary, which the
# per-token vector lookup further down relies on — confirm against the gensim docs.
# NOTE(review): training with workers=4 may not be bit-for-bit reproducible between runs — confirm.
quick_model = Word2Vec(sentences=simple_tokenized, min_count=1, workers=4)

Keeping a reference to the model's word vectors (nothing is written to disk)

In [25]:
# Keep a handle on the trained model's word vectors; they are indexed by token below.
quick_word_vectors = quick_model.wv

To use the word2vec model in calculating similarity between tweets, we apply the following steps:

1) For each tokenized sentence in the corpus, we get its w2v matrix

2) We then calculate their mean to have only one vector for each sentence

3) We stack the sentence vectors of each user into a matrix (as output we have, for each user, a matrix with one average vector per sentence)

4) We then compute the pairwise cosine-similarity matrix of each user's sentence vectors, set its diagonal (self-similarity) to zero, and take the mean of that matrix as the user's similarity feature for the supervised learning part

In [29]:
# We apply this to the first five rows of tweet_df (tweet_df.tweet[:5]).
# Users without usable tweets contribute no entry, so `similarity` can be
# shorter than the slice.
# NOTE(review): because such users are skipped, `similarity` is not aligned
# row-for-row with tweet_df; re-align it before attaching it as a feature.

similarity = []
for user_sent in tweet_df.tweet[:5]:
    # Ignoring users where tweet = None
    if user_sent is None:
        continue

    # Build one vector per tweet: tokenize the tweet, look up the word2vec
    # vector of every token and average them. Tweets that yield no tokens
    # are left out.
    w2v = []
    for sentence in user_sent:
        # Keep only tokens the model knows; indexing the word vectors with an
        # unknown token raises KeyError. With a model trained on these same
        # tweets and min_count=1, this filter removes nothing.
        tokenized = [token for token in simple_preprocess(sentence)
                     if token in quick_word_vectors]
        if len(tokenized) > 0:
            mean_w2v = np.mean(quick_word_vectors[tokenized], axis=0)
            w2v.append(mean_w2v)

    # cosine_similarity raises on an empty list, so a user whose tweets
    # produced no vectors is skipped just like a user without tweets.
    if not w2v:
        continue

    # Pairwise cosine similarity between all tweet vectors of this user.
    sim = cosine_similarity(w2v)
    # Zero the self-similarities so they do not inflate the average.
    np.fill_diagonal(sim, 0)
    # NOTE(review): the mean runs over all n*n entries, including the zeroed
    # diagonal, so it equals the off-diagonal mean scaled by (n-1)/n and is
    # 0.0 for a user with a single tweet vector.
    similarity.append(sim.mean())
    

In [30]:
# Number of users that received a similarity score (users without tweets are skipped by the loop above).
len(similarity)

4

In [31]:
# One mean cosine-similarity value per scored user, in tweet_df order.
similarity

[0.99187434, 0.9942174, 0.99466115, 0.9878583]