In [1]:
import csv
import random
import pandas as pd
from tqdm import tqdm
from itertools import islice
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

SEED = 1337
random.seed(SEED)

INFO 02-13 02:49:10 __init__.py:190] Automatically detected platform cuda.


In [2]:
df = pd.read_csv("../../data/data_raw/tweets.csv", encoding='latin-1', names=["target", "ids", "date", "flag", "user", "text"])
df.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [3]:
df.drop(["target", "ids", "date", "flag", "user"], axis=1, inplace=True)
df.to_csv("../../data/data_human/tweets.csv", index=False)

In [10]:
def batchify(iterable, batch_size):
    """Splits an iterable into smaller batches."""
    iterable = iter(iterable)
    while batch := list(islice(iterable, batch_size)):
        yield batch

def save_to_csv(path, prompts, responses, temperature, top_p, top_k):
    """Saves prompts, responses and sampling parameters to a CSV file."""
    with open(path, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        for prompt, response in zip(prompts, responses):
            writer.writerow([prompt, response, temperature, top_p, top_k])

def generate_responses(model, prompts_tokens, sampling_params):
    """Generate a batch of outputs using vLLM with customizable sampling parameters."""
    outputs = model.generate(sampling_params=sampling_params, prompt_token_ids=prompts_tokens, use_tqdm=False)
    
    return [sample.outputs[0].text for sample in outputs]

In [None]:
BASE_PROMPT = [{"role": "system", "content": "You are a helpful asistant for rewritting tweets. Based on provided tweet generate a similar one. MAKE SURE TO REPLAY ONLY WITH THE SIMILAR TWEET."},
                {"role": "user", "content": "Tweet: \n {tweet} \n"},
                {"role": "assistant", "content": "Similar tweet: \n"}]

In [20]:
prompts = [
    [
        BASE_PROMPT[0],  # The system message
        {"role": "user", "content": BASE_PROMPT[1]["content"].format(tweet=tweet)},  # Formatted user message
        BASE_PROMPT[2]  # The assistant message
    ]
    for tweet in df["text"].values
]

In [11]:
sampling_params = [
    SamplingParams(temperature=0.0, top_p=1.0, top_k=-1, max_tokens=10_000, seed=SEED),  # Pure Greedy (fully deterministic)
    SamplingParams(temperature=0.2, top_p=1.0, top_k=-1, max_tokens=10_000, seed=SEED),  # Highly Deterministic
    SamplingParams(temperature=0.5, top_p=0.95, top_k=100, max_tokens=10_000, seed=SEED), # Mildly Deterministic but Flexible
    SamplingParams(temperature=0.7, top_p=0.9, top_k=50, max_tokens=10_000, seed=SEED),  # Balanced and Natural
    SamplingParams(temperature=0.9, top_p=0.8, top_k=40, max_tokens=10_000, seed=SEED),  # Slightly More Diverse but Coherent
    SamplingParams(temperature=1.0, top_p=0.95, top_k=30, max_tokens=10_000, seed=SEED), # Default Creative Mode
    SamplingParams(temperature=1.2, top_p=0.7, top_k=20, max_tokens=10_000, seed=SEED),  # Highly Creative
]

In [8]:
llms = ["meta-llama/Llama-3.2-1B-Instruct"]
batch_size = 8
base_path = "../../data/data_ai/tweets/tweets_"

In [21]:
for llm in llms:
    #model = LLM(model=llm, dtype="half", max_model_len = 10_000)
    tokenizer = AutoTokenizer.from_pretrained(llm)
    csv_path = f"{base_path}{llm.split('/')[-1]}.csv"


    # init csv file
    with open(csv_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["prompt", "response", "temperature", "top_p", "top_k"])

    cnt = 0
    for batch in tqdm(batchify(prompts, batch_size), total=len(prompts) // batch_size):
        params = random.choice(sampling_params)
        prompts_tokens = tokenizer.apply_chat_template(batch)
        prompts_text = tokenizer.batch_decode(prompts_tokens)
        responses = generate_responses(model, prompts_tokens, params)
        save_to_csv(csv_path, prompts_text, responses, params.temperature, params.top_p, params.top_k)
        cnt += 1
        if cnt > 2:
            break

  responses = generate_responses(model, prompts_tokens, params)
  0%|          | 2/200000 [00:02<81:14:05,  1.46s/it]


In [22]:
prompts_text

["<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 13 Feb 2025\n\nYou are a helpful asistant for rewritting tweets. Based on provided tweet generate a similar one. MAKE SURE TO REPLAY ONLY WITH THE SIMILAR TWEET.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nTweet: \n Hollis' death scene will hurt me severely to watch on film  wry is directors cut not out now? \n Similar tweet:<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nSimilar tweet:<|eot_id|>",
 '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 13 Feb 2025\n\nYou are a helpful asistant for rewritting tweets. Based on provided tweet generate a similar one. MAKE SURE TO REPLAY ONLY WITH THE SIMILAR TWEET.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nTweet: \n about to file taxes  \n Similar tweet:<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nSimilar tweet:<|eot_id|>

In [23]:
responses

['<|start_header_id|>assistant<|end_header_id|>\n\n"RIP Hollis. The thought of his death scene being released on film still sends chills down my spine. Was it really necessary for a director\'s cut, considering it was never intended for release?"',
 '<|start_header_id|>assistant<|end_header_id|>\n\n"The wait is over! Soon I\'ll be filing my taxes, hoping I\'ve done my part for the IRS and made the most of this tax season. #taxseason #filingtaxes"',
 '<|start_header_id|>assistant<|end_header_id|>\n\n@NetflixCanHelp "Ahah cant wait to see Rent again! The soundtrack is EVERYTHING"',
 '<|start_header_id|>assistant<|end_header_id|>\n\n@KennySaoSweeeeet I think I saw you sipping from the forgotten table drinks again...',
 '<|start_header_id|>assistant<|end_header_id|>\n\n@designlover I was too busy most of the day to focus on design projects',
 '<|start_header_id|>assistant<|end_header_id|>\n\n"One of my friends asked me to meet up at Mid Valley today, but I\'ve got a ton of work to do *sigh

In [76]:
tmp = responses[0]

In [84]:
# remove special tokens and return only the tweet using tokenizer
tokenizer.decode(tokenizer.encode(tmp), skip_special_tokens=True)

'assistant\n\n"Sleep tight tonight... just can\'t wait to see you again tomorrow morning"'

In [26]:
import re

text = "<|start_header_id|>assistant<|end_header_id|>\n\n\"Sleep tight tonight... just can't wait to see you again tomorrow morning\""

# Define a pattern to match from <|start_header_id|> to <|end_header_id|> (inclusive)
pattern = r"<\|start_header_id\|>.*?<\|end_header_id\|>"

# Use re.sub to remove the entire matched block; flags=re.DOTALL makes '.' match newline characters
cleaned_text = re.sub(pattern, "", text, flags=re.DOTALL).strip()

print(cleaned_text)


"Sleep tight tonight... just can't wait to see you again tomorrow morning"
