In [1]:
import re

# Compiled once when the cell runs; the function is called once per tweet.
# Matches "http://" or "https://" followed by any run of non-whitespace
# characters, whatever the host looks like (www., www2., no www at all, ...).
_URL_PATTERN = re.compile(r'https?://\S+')
# Matches named (&amp;), decimal (&#39;) and hexadecimal (&#x27;) HTML entities.
_ENTITY_PATTERN = re.compile(r'&(?:#\d+|#[xX][0-9a-fA-F]+|\w+);')


def remove_urls_and_entities(text):
    """
    Strip URLs and HTML entities from a string and flatten it to one line.

    URLs are removed first, then HTML entities (named and numeric). Each match
    is replaced by the empty string, so whitespace that surrounded a removed
    item is left as-is (two consecutive spaces can remain). Newlines are then
    replaced by single spaces and leading/trailing whitespace is trimmed.

    Args:
        text (str): The input string to remove URLs and HTML entities from.

    Returns:
        str: The cleaned, single-line string.
    """
    without_urls = _URL_PATTERN.sub('', text)
    without_entities = _ENTITY_PATTERN.sub('', without_urls)

    return without_entities.replace('\n', ' ').strip()

In [2]:
import pickle
from GPT2.encoder import get_encoder

# Load the pickled tweet collection. The loop below calls .items() on it and
# reads a .rawContent attribute from every value, so it is used as a mapping
# whose values are tweet objects.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only open files from a trusted source.
with open('D:\\5005-Data\\tweet_combined_with_sentiment.pkl', 'rb') as f:
    data = pickle.load(f)

print('Data read complete')

# Clean each tweet's raw text, wrap it in begin/end markers and encode it
tweets_list = []
enc = get_encoder()
for _key, tweet in data.items():  # the key is not needed, only the tweet object
    content = remove_urls_and_entities(tweet.rawContent)
    content = '<T-Begin>' + content + '<T-End>'
    encoded = enc.encode(content)
    tweets_list.append(encoded)

# default=0 keeps the cell from raising ValueError when the pickle holds no tweets
max_len = max((len(t) for t in tweets_list), default=0)
print('Total number of tweets:', len(tweets_list))
print('Longest tweet by tokens number:', max_len)
# Free the raw tweet objects; only the encoded sequences are used from here on
del data

Data read complete
Total number of tweets: 1387322
Longest tweet by tokens number: 553


In [3]:
import torch

# Pack the variable-length encoded tweets into rows of exactly max_len tokens
# so they can be stacked into a single rectangular tensor.
current = []
new_tweets_list = []
for t in tweets_list:
    current.extend(t)
    if len(current) >= max_len:
        # Row is full: emit its first max_len tokens (the slice is a new list).
        new_tweets_list.append(current[:max_len])
        # Start the next row with the whole tweet that overflowed this one.
        # Copy it: binding `current` to `t` itself would make the next
        # extend() mutate the list stored in tweets_list, so re-running this
        # cell would pack different data.
        current = list(t)

# Tokens still in `current` after the loop never filled a row and are not emitted.
tweets_tensor = torch.tensor(new_tweets_list, dtype=torch.int64)
tweets_tensor.size()

  from .autonotebook import tqdm as notebook_tqdm


torch.Size([119406, 553])

In [7]:
from GPT2.model import GPT2LMHeadModel
from GPT2.config import GPT2Config

# Baseline: loss of a model built from the default config with no weights loaded
config = GPT2Config()
model = GPT2LMHeadModel(config)

# Take the first four packed rows as a small batch
sample = tweets_tensor[:4]
# Inputs drop the last position and labels drop the first, so label i is the
# token that follows input i. Slicing with -1 instead of a hard-coded 552
# keeps both tensors the same length whatever width tweets_tensor has.
input_ids = sample[:, :-1]
label_ids = sample[:, 1:]

loss = model(input_ids, lm_labels=label_ids)
print('Model loss with random parameters:', float(loss))

torch.Size([4, 50257, 552])
torch.Size([4, 552])
Model loss with random parameters: 175.92779541015625


In [6]:
from GPT2.model import GPT2LMHeadModel
from GPT2.utils import load_weight
from GPT2.config import GPT2Config

# Read the pre-trained checkpoint onto the CPU.
# NOTE(security): torch.load unpickles the file; only load checkpoints from a
# trusted source.
state_dict = torch.load('gpt2-pytorch_model.bin', map_location='cpu')

# Build the model from the default config, then copy the checkpoint weights in
config = GPT2Config()
model = GPT2LMHeadModel(config)
model = load_weight(model, state_dict)

# Take the first four packed rows as a small batch
sample = tweets_tensor[:4]
# Inputs drop the last position and labels drop the first, so label i is the
# token that follows input i. Slicing with -1 instead of a hard-coded 552
# keeps both tensors the same length whatever width tweets_tensor has.
input_ids = sample[:, :-1]
label_ids = sample[:, 1:]

loss = model(input_ids, lm_labels=label_ids)
print('Model loss with pre-training:', float(loss))

torch.Size([4, 50257, 552])
torch.Size([4, 552])
Model loss with pre-training: 3.9554734230041504
