### Dataset creation
We first relabel the tweets dataset (test.csv and train.csv) using a *teacher* LLM.
The reason for this is that:
1. The existing labels are not quite accurate
2. We also want to recognize "neutral" posts

We do not preprocess (remove stopwords, normalization, etc.) the tweets since LLMs are trained on natural texts
and therefore doing so would degrade performance.

In [1]:
import pandas as pd
import torch
from tqdm import tqdm
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
from logit_processor import ConstrainedLogitProcessor
from common import SENTIMENTS

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = "cuda" if torch.cuda.is_available() else 'cpu'
OUTPUT_PATH = "../data/tweet_sentiments.csv"
print(torch.__version__, "Device:", device)
print(transformers.__version__)

2.7.1+cu126 Device: cuda
4.53.2


##### Prepare tweet data
We won't be using the provided train/test split.

In [3]:
try:
    # Continue from a previous session, if any
    df = pd.read_csv(OUTPUT_PATH)
except:
    df_train = pd.read_csv("../data/train.csv")
    df_test = pd.read_csv("../data/test.csv")
    df = pd.concat((df_train, df_test), ignore_index=True)
    df["sentiment"] = -1 # Remove labels

In [4]:
df.head()

Unnamed: 0,sentiment,text
0,-1,@tonigirl14 love you toooooo!! TG LOL Gngb
1,-1,@jun6lee I told myself: Don't click on this li...
2,-1,The man who rendered his voice to Mickey Mouse...
3,-1,@Shontelle_Layne I think red would be nice. O...
4,-1,@Silverlines - I guess. 'Cause one of her twee...


Prepare the LLM<br>
We configure the model so that it only outputs one token for each sentiment.
We encode each sentiment as follows:
* -1: Missing label
* 0: negative
* 1: neutral
* 2: positive

In [5]:
model_path = "F:/Models/deepseek-llm-7b-chat" # Or use deepseek-ai/deepseek-llm-7b-chat to directly download it
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16, device_map=device)

Loading checkpoint shards: 100%|██████████| 2/2 [00:28<00:00, 14.16s/it]


Constrain output to sentiment tokens.
Our implementation only works for sentiment labels which make up exactly one token.

In [6]:
sentiment_tokens = [tokenizer.encode(term, add_special_tokens=False)[0] for term in SENTIMENTS.keys()]
print(sentiment_tokens)
assert len(sentiment_tokens) == len(SENTIMENTS)
sentiment_logit_processor = ConstrainedLogitProcessor(sentiment_tokens)

[20805, 35413, 28573]


In [7]:
system_prompt = {
    "role": "system",
    "content": "You are a specialist in analyzing the sentiments of texts. The user provides you with a text and you only determine and output the sentiment of the text. You must only output one of: negative, neutral, positive - nothing else. Answer with neutral if you cannot confidently identify the sentiment. Pay attention to the language of the text and answer with its slangs in mind."
}

Label the tweets

In [8]:
def prompt_sentiment(texts: list[str]) -> list[str]:
    input_prompts = [
        tokenizer.apply_chat_template([system_prompt, {"role": "user", "content": text}], tokenize=False, add_generation_prompt=True)
        for text in texts
    ]
    input_tensor = tokenizer(input_prompts, return_tensors="pt", padding=True, truncation=True).to(device)
    outputs = model.generate(
        **input_tensor,
        max_new_tokens=1,
        do_sample=False,
        temperature=None,
        top_p=None,
        pad_token_id=tokenizer.eos_token_id,
        logits_processor=[sentiment_logit_processor]
    )
    res = []
    for i, output in enumerate(outputs):
        prompt_len = input_tensor['input_ids'][i].shape[0]
        generated_tokens = output[prompt_len:]
        res.append(tokenizer.decode(generated_tokens, skip_special_tokens=True).strip())
    return res

In [9]:
# Testing...
prompt_sentiment(["What a good day!", "What a shit day!", "What a meh day", "Der Bundesrechnungshof warnt vor Finanzklemme beim Klima- und Transformationsfonds."])

['positive', 'negative', 'neutral', 'negative']

In [13]:
BATCH_SIZE = 8
target_idx = df.columns.get_loc("sentiment")
start_idx = 0

try:
    start_idx = df[df["sentiment"] < 0].index[0]
    print("Resuming from idx:", start_idx)
except: pass

try:
    for i in tqdm(range(start_idx, len(df), BATCH_SIZE), total=len(df)//BATCH_SIZE):
        df_batch = df.iloc[i:i+BATCH_SIZE]
        sentiment_output = prompt_sentiment(df_batch.text.tolist())
        labels = [SENTIMENTS.get(sentiment, -1) for sentiment in sentiment_output]
        df.iloc[i:i+BATCH_SIZE, target_idx] = labels
finally:
    df.to_csv(OUTPUT_PATH, index=False)

28125it [4:15:41,  1.83it/s]                              


In [14]:
df.describe()

Unnamed: 0,sentiment
count,224994.0
mean,1.216655
std,0.725744
min,0.0
25%,1.0
50%,1.0
75%,2.0
max,2.0
