In [None]:
! pip install transformers datasets
! pip install transformers[torch]
! pip install accelerate

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset
import torch

In [None]:
# Load the GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

In [None]:
# Load the dataset from Hugging Face
dataset = load_dataset("glavin001/startup-interviews")

In [None]:
# Preprocess the dataset
def preprocess_function(examples):
    inputs = examples["instruction"]
    targets = examples["start"]
    return {"inputs": inputs, "targets": targets}

In [None]:
# Tokenize the dataset
tokenized_datasets = dataset.map(preprocess_function, batched=True)

In [None]:
tokenized_datasets

In [None]:
# Set block size based on your dataset and memory constraints
block_size = 128

In [None]:
# Tokenize and collate the dataset
def collate_function(examples):
    return tokenizer(examples["inputs"], max_length=block_size, padding="max_length", truncation=True)

In [None]:
tokenizer.pad_token = tokenizer.eos_token
tokenized_datasets = tokenized_datasets.map(collate_function)

In [None]:
# Set block size for training
tokenized_datasets = tokenized_datasets.with_format("torch")
tokenized_datasets.set_format(columns=["input_ids", "attention_mask"], output_all_columns=True)

In [None]:
# Fine-tune configuration
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    save_total_limit=2,
    save_steps=10_000,
    learning_rate=0.01,
    prediction_loss_only=True,
)

In [None]:
# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    train_dataset=tokenized_datasets["train"],
)

In [None]:
# Fine-tune the model
trainer.train()

In [None]:
# Save the fine-tuned model
model.save_pretrained("path/to/saved/model")
tokenizer.save_pretrained("path/to/saved/model")

In [None]:
# Load the fine-tuned model
fine_tuned_model = GPT2LMHeadModel.from_pretrained("path/to/saved/model")
fine_tuned_tokenizer = GPT2Tokenizer.from_pretrained("path/to/saved/model")

In [None]:
# Generate an example
prompt = "Can you please introduce yourself and tell us about your role at Y Combinator?"
input_ids = fine_tuned_tokenizer.encode(prompt, return_tensors="pt")

In [None]:
# Generate text
output = fine_tuned_model.generate(input_ids, max_length=150, num_beams=5, no_repeat_ngram_size=2, top_k=50, top_p=0.95, pad_token_id=fine_tuned_model.config.pad_token_id)

In [None]:
generated_text = fine_tuned_tokenizer.decode(output[0], skip_special_tokens=True)
print("Generated Example:")
print(f"prompt: {prompt}")
print(f"model: {generated_text}")

Generated Example:    
prompt: Can you please introduce yourself and tell us about your role at Y Combinator?    
model: Can you please introduce yourself and tell us about your role at Y Combinator? startup??ator and? the? in?bin? idea? and the a? team?C? Com? of the the startup and a a startup in the it?,? their? product? a the of a it and startup to? you the you? itator in a of your the company? is the how? users? initial? them? are the startups? market? your a you a how and your startup of startupbin and how in your your itbin in it in startupator to the product and of how a your how of it to a Y? success? early? what the Y and company and you startup on? to

In [None]:
# Generate an example
prompt = "Do you hae any experiences related with the SW Engineer?"
input_ids = fine_tuned_tokenizer.encode(prompt, return_tensors="pt")

# Generate text
output = fine_tuned_model.generate(input_ids, max_length=150, num_beams=5, no_repeat_ngram_size=2, top_k=50, top_p=0.95, pad_token_id=fine_tuned_model.config.pad_token_id)

generated_text = fine_tuned_tokenizer.decode(output[0], skip_special_tokens=True)
print("Generated Example:")
print(f"prompt: {prompt}")
print(f"model: {generated_text}")

Generated Example:    
prompt: Do you hae any experiences related with the SW Engineer?    
model: Do you hae any experiences related with the SW Engineer? startup?? and the? the the startup and?bin?ator? team? in? a the a? idea? it? Com?C?,? of? are the how? ideas? market? them? is the success? their? to a a startup in the itator and a it and your startup to the company? users?'s? fit? that? industry? on? initial? product? or the your the have? growth? first? venture? for? from? you a how and howator in a success and itbin and startup of the you the Y? stage? new? early?-? company and some? business? funding? what

In [None]:
# Generate an example
prompt = "Do you hae any experiences related with the SW Engineer? one answer, 200 words, job description please."
input_ids = fine_tuned_tokenizer.encode(prompt, return_tensors="pt")

# Generate text
output = fine_tuned_model.generate(input_ids, max_length=150, num_beams=5, no_repeat_ngram_size=1, top_k=2, top_p=0.2, pad_token_id=fine_tuned_model.config.pad_token_id)

generated_text = fine_tuned_tokenizer.decode(output[0], skip_special_tokens=True)
print("Generated Example:")
print(f"prompt: {prompt}")
print(f"model: {generated_text}")

Generated Example:    
prompt: Do you hae any experiences related with the SW Engineer? one answer, 200 words, job description please.    
model: Do you hae any experiences related with the SW Engineer? one answer, 200 words, job description please. to startup in and a of it for on your have- that idea or productbinator are how from's their ideas is what successIs when Y at Com team aboutC stage funding do growth together as be company industry who after startups factor during early people fit)? KP ( does new business MVP impact up them time models" can some world without has this into by hard market venture between so make its if first project over find interview rate before users founders rounds through were should starting work they model using fail 3 data among help came did other my towards strategies tech members school face will building goals used advice factors out come performancetech co takeing companies

In [None]:
# Generate an example
prompt = "Do you hae any experiences related with the SW Engineer? one answer, 200 words, job description please."
input_ids = fine_tuned_tokenizer.encode(prompt, return_tensors="pt")

# Generate text
output = fine_tuned_model.generate(input_ids, max_length=100, num_beams=2, no_repeat_ngram_size=1, top_k=1, top_p=0.6, pad_token_id=fine_tuned_model.config.pad_token_id)

generated_text = fine_tuned_tokenizer.decode(output[0], skip_special_tokens=True)
print("Generated Example:")
print(f"prompt: {prompt}")
print(f"model: {generated_text}")

Generated Example:    
prompt: Do you hae any experiences related with the SW Engineer? one answer, 200 words, job description please.    
model: Do you hae any experiences related with the SW Engineer? one answer, 200 words, job description please. to startup and of a in it for on your have- that or productbinator are how idea's their company from at startups is successIs when what Y ideasC team about Com industry stage funding as do growth who together after be business people fit factor during early KP ( up them MVP impact" time market)? between new world without does can some venture make has this into by goals models

In [None]:
# Generate an example
prompt = "Do you have any experiences related to SW Engineer in startup company?"
input_ids = fine_tuned_tokenizer.encode(prompt, return_tensors="pt")

# Generate text
output = fine_tuned_model.generate(input_ids, max_length=100, num_beams=8, no_repeat_ngram_size=1, top_k=1, top_p=0.95, pad_token_id=fine_tuned_model.config.pad_token_id)

generated_text = fine_tuned_tokenizer.decode(output[0], skip_special_tokens=True)
print("Generated Example:")
print(f"prompt: {prompt}")
print(f"model: {generated_text}")

Generated Example:    
prompt: Do you have any experiences related to SW Engineer in startup company?    
model: Do you have any experiences related to SW Engineer in startup company? the success and a of your it Ybinator for, idea or team on their are how- thatC ideas's product funding from with ventureIs when Com industry at is what be growth stage market fit together people them)? about first who world KP do business new time so MVP after early models during startups were starting future as this ( has hard users rounds up project factor impact make founders tech fail without can some work school data before members


In [None]:
# Generate an example
prompt = "What strategies can be used to encourage users to provide more detailed and comprehensive answers during interviews?"
input_ids = fine_tuned_tokenizer.encode(prompt, return_tensors="pt")

# Generate text
output = fine_tuned_model.generate(input_ids, max_length=100, num_beams=8, no_repeat_ngram_size=1, top_k=3, top_p=0.9, pad_token_id=fine_tuned_model.config.pad_token_id)

generated_text = fine_tuned_tokenizer.decode(output[0], skip_special_tokens=True)
print("Generated Example:")
print(f"prompt: {prompt}")
print(f"model: {generated_text}")

Generated Example:    
prompt: What strategies can be used to encourage users to provide more detailed and comprehensive answers during interviews?    
model: What strategies can be used to encourage users to provide more detailed and comprehensive answers during interviews? the success in a startupbinator of your team for it you have, how idea on their product or company ideas that- are what growth's YC fundingIs from is some with business venture at goals when hard Com industry stage fit)? about early people new do starting startups were does this who after building time them as help together KP ( should first MVP factor up has an work they impact without into make market

In [None]:
torch.save({
            'model_state_dict': model.state_dict(),
            }, "/model.pt")