In [2]:
# Load the pickled data
import pickle

file_location = 'D:\\5005-Data\\tweet_combined_with_sentiment.pkl'

with open(file_location, 'rb') as f:
    tweets_dict = pickle.load(f)

print('Total number of tweets:', len(tweets_dict))

Total number of tweets: 1387322


In [3]:
import numpy as np

like_ratios = []
for k, tweet in tweets_dict.items():
    if tweet.viewCount is None:
        continue
    if tweet.lang != 'en':
        continue
    if tweet.inReplyToUser is not None:
        continue
    ratio = tweet.likeCount / tweet.viewCount
    like_ratios.append(ratio)

p50 = np.percentile(like_ratios, 50)
p90 = np.percentile(like_ratios, 90)

print('Number of tweets that are not reply:', len(like_ratios))
print('Top 50 perecent like ratio:', p50)
print('Top 90 perecent like ratio:', p90)

Number of tweets that are not reply: 776029
Top 50 perecent like ratio: 0.0
Top 90 perecent like ratio: 0.02564102564102564


In [4]:
import re

def remove_urls_and_entities(text):
    """
    Removes URLs and HTML entities from a string using regular expressions.
    
    Args:
        text (str): The input string to remove URLs and HTML entities from.
        
    Returns:
        str: The input string with any URLs and HTML entities removed.
    """
    # Define regular expressions to match URLs and HTML entities
    url_pattern = re.compile(r'https?://(?:www\.\S+|(?!www)\S+)')
    entity_pattern = re.compile(r'&\w+;')
    
    # Use the sub() method to replace URLs and HTML entities with an empty string
    text_without_urls_and_entities = url_pattern.sub('', text)
    text_without_urls_and_entities = entity_pattern.sub('', text_without_urls_and_entities)
    text_without_urls_and_entities = text_without_urls_and_entities.replace('\n', ' ')
    
    return text_without_urls_and_entities

In [5]:
lines = []

for k, tweet in tweets_dict.items():
    if tweet.viewCount is None:
        continue
    if tweet.lang != 'en':
        continue
    if tweet.inReplyToUser is not None:
        continue
    content = remove_urls_and_entities(tweet.rawContent)
    label = 0
    if tweet.likeCount / tweet.viewCount > 0:
        label = 1
    if tweet.likeCount / tweet.viewCount > 0.025:
        label = 2
    lines.append('__label__' + str(label) + ' ' + content)

part = int(len(lines) * 0.1)

f = open('train.ftxt', 'w', encoding='utf-8')
train_lines = lines[:part * 8]
f.write('\n'.join(train_lines))
f.close()

f = open('test.ftxt', 'w', encoding='utf-8')
test_lines = lines[part * 8:part * 9]
f.write('\n'.join(test_lines))
f.close()

f = open('dev.ftxt', 'w', encoding='utf-8')
test_lines = lines[part * 9:]
f.write('\n'.join(test_lines))
f.close()

In [10]:
from flair.datasets import ClassificationCorpus
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentLSTMEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from pathlib import Path

corpus = ClassificationCorpus(Path('./'), test_file='test.ftxt', dev_file='dev.ftxt', train_file='train.ftxt')
word_embeddings = [WordEmbeddings('glove'), FlairEmbeddings('news-forward-fast'), FlairEmbeddings('news-backward-fast')]
document_embeddings = DocumentLSTMEmbeddings(word_embeddings, hidden_size=512, reproject_words=True, reproject_words_dimension=256)
classifier = TextClassifier(document_embeddings, label_type='class', label_dictionary=corpus.make_label_dictionary('class'))


  from .autonotebook import tqdm as notebook_tqdm


2023-04-16 20:27:45,734 Reading data from .
2023-04-16 20:27:45,734 Train: train.ftxt
2023-04-16 20:27:45,734 Dev: dev.ftxt
2023-04-16 20:27:45,735 Test: test.ftxt
2023-04-16 20:28:05,417 Initialized corpus . (label type name is 'class')
2023-04-16 20:28:08,781 Computing label dictionary. Progress:


  document_embeddings = DocumentLSTMEmbeddings(word_embeddings, hidden_size=512, reproject_words=True, reproject_words_dimension=256)
620816it [02:14, 4616.79it/s]

2023-04-16 20:30:23,254 Dictionary created for label 'class' with 4 values: 0 (seen 308976 times), 1 (seen 248183 times), 2 (seen 63657 times)





In [None]:
trainer = ModelTrainer(classifier, corpus)
trainer.train('./', max_epochs=2, learning_rate=0.05, mini_batch_size=64, monitor_test=True)

In [7]:
total_test = len(test_lines)

with open('test.tsv', 'r', encoding='utf-8') as f:
    content = f.read()
    wrong_num = content.count('-> MISMATCH!')

print('Accuracy:', 1 - wrong_num / total_test)

Accuracy: 0.5686951591913517
