In [None]:
import csv
import itertools as it
import os

import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoModelWithLMHead, AutoTokenizer

# Select the GPU for this process by exposing only device index 1 via the
# CUDA_VISIBLE_DEVICES environment variable.
os.environ.update(CUDA_VISIBLE_DEVICES="1")

In [None]:
# Load the model and its tokenizer from the local checkpoint directory
# (local_files_only=True: nothing is fetched from the hub).
PATH = '/home/dobby/gpt-neo/'
MODELNAME = "gpt-neo-1.3B-user-v1"
# AutoModelForCausalLM is the supported loader for causal language models;
# the previously used AutoModelWithLMHead is a deprecated alias.
model = AutoModelForCausalLM.from_pretrained(PATH+MODELNAME, local_files_only=True)
# Move the weights to the GPU; prompts are moved there as well before generation.
model = model.cuda()
# Use the end-of-sequence token id as the padding token id in the model config.
model.config.pad_token_id = model.config.eos_token_id
tokenizer = AutoTokenizer.from_pretrained(PATH+MODELNAME, local_files_only=True)

In [None]:
def create_csv_output(temp, tk, rep_penalty, number_posts, topics, attack_id, batch_size=10):
    """Generate posts for each topic and write them to a '|'-separated CSV file.

    For every topic the prompt ``<|startoftext|>{topic}`` is tokenized and fed
    to ``model.generate`` with sampling enabled. Each generated sequence is
    decoded, newlines are replaced by spaces, backslashes are removed and the
    topic string is stripped from the text. The result is written to
    ``PATH + attack_id + ".csv"`` with the columns ``user`` and ``tweets``.

    Relies on the module-level ``model``, ``tokenizer`` and ``PATH``.

    Args:
        temp: Sampling temperature passed to ``model.generate``.
        tk: ``top_k`` value passed to ``model.generate``.
        rep_penalty: ``repetition_penalty`` passed to ``model.generate``.
        number_posts: Number of posts to generate per topic (must be >= 1).
        topics: Iterable of topic/user names. A single string is treated as
            one topic rather than being iterated character by character.
        attack_id: File name (without extension) of the CSV that is written.
        batch_size: Maximum number of sequences requested per ``generate``
            call. Defaults to 10, the batch size used previously.

    Raises:
        ValueError: If ``number_posts`` or ``batch_size`` is smaller than 1.
    """
    if number_posts < 1:
        raise ValueError(f"number_posts must be >= 1, got {number_posts}")
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Iterating a bare string would yield one "topic" per character.
    if isinstance(topics, str):
        topics = [topics]

    # Full batches plus one partial batch, so exactly `number_posts` sequences
    # are generated per topic (the remainder is no longer dropped).
    full_batches, remainder = divmod(number_posts, batch_size)
    batch_sizes = [batch_size] * full_batches
    if remainder:
        batch_sizes.append(remainder)

    # Collect plain records and build the DataFrame once at the end instead of
    # concatenating a growing frame inside the loop.
    records = []
    for topic in topics:
        prompt_ids = tokenizer(f"<|startoftext|>{topic}", return_tensors="pt").input_ids.cuda()
        for size in batch_sizes:
            outputs_gpu = model.generate(prompt_ids,
                                         do_sample=True,
                                         top_k=tk,
                                         temperature=temp,
                                         repetition_penalty=rep_penalty,
                                         top_p=1,
                                         max_length=1000,
                                         num_return_sequences=size)
            for output in outputs_gpu:
                text = tokenizer.decode(output, skip_special_tokens=True)
                # Replace escape tokens
                text = text.replace('\n', ' ').replace('\\', '')
                # Strip the topic from the text; note that this removes every
                # occurrence of the topic string, not only the leading prompt.
                records.append({'user': topic, 'tweets': text.replace(topic, "")})
            # Drop the reference to the generated batch before the next call
            # so its storage can be reclaimed.
            del outputs_gpu

    # Passing `columns` keeps the header even when no records were produced.
    artificial_tweets = pd.DataFrame(records, columns=['user', 'tweets'])
    artificial_tweets.to_csv(PATH + attack_id + ".csv", index=False, encoding='utf-8', sep='|')

In [None]:
# `topics` must be a list of user names: the function iterates over it, so a
# bare string would be processed one character at a time.
create_csv_output(temp=0.8, tk=150, rep_penalty=1.0, attack_id="test", number_posts=10, topics=["GretaThunberg"])