# 1. Tweet data preprocessing

In [4]:
import os
import sys

nlp_path = os.path.abspath('../')
if nlp_path not in sys.path:
    sys.path.insert(0, nlp_path)
    

In [5]:
nlp_path

'/dstore/home/nguyen/tweet_summarization'

In [6]:
from utils import tokenizeRawTweetText

In [7]:
import pandas as pd
WORK_DIR = "/home/nguyen/"
RAW_DATA = "data/travel_ban.txt"
PROCESSED_DATA = "data/processed_travel_ban.csv"

In [8]:
# Read raw dataset, process and print to file
def read_and_print_data():
    file = open(WORK_DIR+RAW_DATA, "r")
    lines = file.readlines()
    data = []
    i=0
    for line in lines:
       
        s = line.split("\t")
        tweetId = s[0]
        text = s[4]
        text = ' '.join(tokenizeRawTweetText(text))
#         print(tweetId, text)
#         break
        data.append([tweetId, text])
        i+=1
        if i %1000 ==0:
            print("Line {}: {}, label: {}\n".format(i, tweetId, text))
            break
    output = pd.DataFrame(data, columns=['Id', 'Tweet'])
    
    output.to_csv(WORK_DIR+PROCESSED_DATA, index=False)
    file.close()
    
    


In [9]:
read_and_print_data()

Line 1000: 825046054500433925, label: RT @MENTION : Theresa May speaking about several issues in her press conference with Donald Trump saying no tangible thing about issues .



# 2. Extract first token Bert embeddings

In [1]:
import numpy as np
import pandas as pd
import re
import emoji
import pickle
import embedding_extraction as model

In [2]:
data_file = "/home/nguyen/data/processed_travel_ban.csv"

In [3]:
data = pd.read_csv(data_file)

In [4]:
for i in range(10):
    print(str(data.iloc[i]['Tweet']))

RT @MENTION : Emergency Rally Against Trump's Muslim Travel Ban in NYC , 1/25 at 5 p.m. @URL
RT @MENTION : Theresa May has not apologized to Trump for insulting him . If she fails to do that today , Trump should just send her back to B …
RT @MENTION : Trump's Immigration Ban Excludes Countries with Business Ties @URL via @MENTION #DemocracyFor …
RT @MENTION : Trump's immigration order expands the definition of " criminal " @URL @URL
ALERT : Senator John McCain Threatens Action On President Trump If He Does This @URL
@MENTION @MENTION @MENTION @MENTION @MENTION Kiva still distracted while Trump gets on with people's business .
RT @MENTION : TY @MENTION for bailing on GMB & @MENTION today . Piers Morgan drank the Trump Kool Aid & is a vocal opponent o …
RT @MENTION : ✍🏻 #Trump to sign EO temporary ban suspending visas for Syria & six other ME , African countries #BuildTheWall 👍🏼 …
RT @MENTION : Did we have a moral obligation to stop Hitler ? If so we have a moral obligation to stop Trump

In [5]:
# remove rt, @USER, @URL, emoji
data['Tweet'] = data['Tweet'].apply(lambda x: x.replace('@MENTION', "").replace("@URL", "").
                                    replace("@EMAIL", "").lower())
data['Tweet'] = data['Tweet'].apply(lambda x: re.sub("^ ?(rt ?)+", "", x))                              
data['Tweet'] = data['Tweet'].apply(lambda x: re.sub('^( ?: ?)', '', x))
data['Tweet'] = data['Tweet'].apply(lambda x: re.sub("  +", " ", x))
data['Tweet'] = data['Tweet'].apply(lambda x: ''.join(c for c in x if c not in emoji.UNICODE_EMOJI).strip())

In [6]:
for i in range(10):
    print(str(data.iloc[i]['Tweet']))

emergency rally against trump's muslim travel ban in nyc , 1/25 at 5 p.m.
theresa may has not apologized to trump for insulting him . if she fails to do that today , trump should just send her back to b …
trump's immigration ban excludes countries with business ties via #democracyfor …
trump's immigration order expands the definition of " criminal "
alert : senator john mccain threatens action on president trump if he does this
kiva still distracted while trump gets on with people's business .
ty for bailing on gmb & today . piers morgan drank the trump kool aid & is a vocal opponent o …
#trump to sign eo temporary ban suspending visas for syria & six other me , african countries #buildthewall  …
did we have a moral obligation to stop hitler ? if so we have a moral obligation to stop trump .
are these people just now getting radicalized by trump or did they always hate our freedom ?


In [7]:
first_token_embeddings = model.get_bert_first_token_embeddings(data)

Max_len (99% data): 37.0
Encoded data: 
                                      attention_mask  \
0  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
1  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
2  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
3  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...   
4  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   

                                           input_ids  \
0  [101, 5057, 8320, 2114, 8398, 1005, 1055, 5152...   
1  [101, 14781, 2089, 2038, 2025, 17806, 2000, 83...   
2  [101, 8398, 1005, 1055, 7521, 7221, 23329, 201...   
3  [101, 8398, 1005, 1055, 7521, 2344, 24545, 199...   
4  [101, 9499, 1024, 5205, 2198, 19186, 17016, 28...   

                                      token_type_ids  
0  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
1  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
2  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
3  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
4  [0, 0, 0

In [8]:
with open('/home/nguyen/data/travel_ban_first_token_embeddings.pkl', 'wb') as f:
    pickle.dump(first_token_embeddings, f)

# 3. Extract bert all-token embeddings

In [1]:
import numpy as np
import pandas as pd
import pickle
import re
import emoji
import embedding_extraction as model
from scipy.sparse import csr_matrix, save_npz

In [2]:
data_file = "/home/nguyen/data/processed_travel_ban.csv"

In [3]:
data = pd.read_csv(data_file)

In [4]:
# remove rt, @USER, @URL, emoji
data['Tweet'] = data['Tweet'].apply(lambda x: x.replace('@MENTION', "").replace("@URL", "").
                                    replace("@EMAIL", "").lower())
data['Tweet'] = data['Tweet'].apply(lambda x: re.sub("^ ?(rt ?)+", "", x))                              
data['Tweet'] = data['Tweet'].apply(lambda x: re.sub('^( ?: ?)', '', x))
data['Tweet'] = data['Tweet'].apply(lambda x: re.sub("  +", " ", x))
data['Tweet'] = data['Tweet'].apply(lambda x: ''.join(c for c in x if c not in emoji.UNICODE_EMOJI).strip())

In [5]:
for i in range(10):
    print(str(data.iloc[i]['Tweet']))

emergency rally against trump's muslim travel ban in nyc , 1/25 at 5 p.m.
theresa may has not apologized to trump for insulting him . if she fails to do that today , trump should just send her back to b …
trump's immigration ban excludes countries with business ties via #democracyfor …
trump's immigration order expands the definition of " criminal "
alert : senator john mccain threatens action on president trump if he does this
kiva still distracted while trump gets on with people's business .
ty for bailing on gmb & today . piers morgan drank the trump kool aid & is a vocal opponent o …
#trump to sign eo temporary ban suspending visas for syria & six other me , african countries #buildthewall  …
did we have a moral obligation to stop hitler ? if so we have a moral obligation to stop trump .
are these people just now getting radicalized by trump or did they always hate our freedom ?


In [None]:
file = "/home/nguyen/data/all_tokens/travel_ban_"
model.get_bert_all_token_embeddings(data, file = file)

Max_len (99% data): 37.0
Encoded data: 
                                      attention_mask  \
0  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
1  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
2  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
3  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...   
4  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   

                                           input_ids  \
0  [101, 5057, 8320, 2114, 8398, 1005, 1055, 5152...   
1  [101, 14781, 2089, 2038, 2025, 17806, 2000, 83...   
2  [101, 8398, 1005, 1055, 7521, 7221, 23329, 201...   
3  [101, 8398, 1005, 1055, 7521, 2344, 24545, 199...   
4  [101, 9499, 1024, 5205, 2198, 19186, 17016, 28...   

                                      token_type_ids  
0  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
1  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
2  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
3  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
4  [0, 0, 0

# 4. Extract sentence transformer embeddings

In [None]:
import numpy as np
import pandas as pd
import pickle
import re
import emoji
import embedding_extraction as model

In [2]:
data_file = "/home/nguyen/data/processed_travel_ban.csv"

In [3]:
data = pd.read_csv(data_file)

In [4]:
# remove rt, @USER, @URL, emoji
data['Tweet'] = data['Tweet'].apply(lambda x: x.replace('@MENTION', "").replace("@URL", "").
                                    replace("@EMAIL", "").lower())
data['Tweet'] = data['Tweet'].apply(lambda x: re.sub("^ ?(rt ?)+", "", x))                              
data['Tweet'] = data['Tweet'].apply(lambda x: re.sub('^( ?: ?)', '', x))
data['Tweet'] = data['Tweet'].apply(lambda x: re.sub("  +", " ", x))
data['Tweet'] = data['Tweet'].apply(lambda x: ''.join(c for c in x if c not in emoji.UNICODE_EMOJI).strip())

In [5]:
for i in range(10):
    print(str(data.iloc[i]['Tweet']))

emergency rally against trump's muslim travel ban in nyc , 1/25 at 5 p.m.
theresa may has not apologized to trump for insulting him . if she fails to do that today , trump should just send her back to b …
trump's immigration ban excludes countries with business ties via #democracyfor …
trump's immigration order expands the definition of " criminal "
alert : senator john mccain threatens action on president trump if he does this
kiva still distracted while trump gets on with people's business .
ty for bailing on gmb & today . piers morgan drank the trump kool aid & is a vocal opponent o …
#trump to sign eo temporary ban suspending visas for syria & six other me , african countries #buildthewall  …
did we have a moral obligation to stop hitler ? if so we have a moral obligation to stop trump .
are these people just now getting radicalized by trump or did they always hate our freedom ?


In [7]:
sentence_embeddings = model.get_sentence_transformers_embedings(data)

Len:  (1000, 768)
Len:  (2000, 768)
Len:  (3000, 768)
Len:  (4000, 768)
Len:  (5000, 768)
Len:  (6000, 768)
Len:  (7000, 768)
Len:  (8000, 768)
Len:  (9000, 768)
Len:  (10000, 768)
Len:  (11000, 768)
Len:  (12000, 768)
Len:  (13000, 768)
Len:  (14000, 768)
Len:  (15000, 768)
Len:  (16000, 768)
Len:  (17000, 768)
Len:  (18000, 768)
Len:  (19000, 768)
Len:  (20000, 768)
Len:  (21000, 768)
Len:  (22000, 768)
Len:  (23000, 768)
Len:  (24000, 768)
Len:  (25000, 768)
Len:  (26000, 768)
Len:  (27000, 768)
Len:  (28000, 768)
Len:  (29000, 768)
Len:  (30000, 768)
Len:  (31000, 768)
Len:  (32000, 768)
Len:  (33000, 768)
Len:  (34000, 768)
Len:  (35000, 768)
Len:  (36000, 768)
Len:  (37000, 768)
Len:  (38000, 768)
Len:  (39000, 768)
Len:  (40000, 768)
Len:  (41000, 768)
Len:  (42000, 768)
Len:  (43000, 768)
Len:  (44000, 768)
Len:  (45000, 768)
Len:  (46000, 768)
Len:  (47000, 768)
Len:  (48000, 768)
Len:  (49000, 768)
Len:  (50000, 768)
Len:  (51000, 768)
Len:  (52000, 768)
Len:  (53000, 768)
Le

In [8]:
sentence_embeddings.shape

(123385, 768)

In [9]:
with open("/home/nguyen/data/travel_ban_sentence_transformers_embeddings.pkl", "wb") as f:
    pickle.dump(sentence_embeddings, f)