<a href="https://colab.research.google.com/github/Laura20west/Cone/blob/main/Untitled182.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

def load_and_preprocess(filepath):
    """Load a delimited text file and derive a cleaned ``text`` column.

    The file is parsed with ``pandas.read_csv`` (UTF-8 first, then Latin-1;
    malformed rows are skipped). The first string-typed column becomes
    ``text``; if there is none, every column is stringified and joined with
    single spaces. The text is stripped, whitespace runs are collapsed to one
    space, and rows whose text is 10 characters or shorter are dropped.

    Args:
        filepath: Path of the file to read.

    Returns:
        The parsed DataFrame with the added ``text`` column. On any failure
        the reason is printed and an empty DataFrame is returned.
    """
    try:
        df = None
        for encoding in ['utf-8', 'latin1']:
            try:
                df = pd.read_csv(filepath, encoding=encoding, on_bad_lines='skip')
                break
            except UnicodeError:
                # Only a decoding failure justifies retrying with another
                # encoding. Anything else (missing file, empty file, ...)
                # propagates to the outer handler with its real message.
                continue
        if df is None:
            raise ValueError(f"Could not read {filepath}")

        # Find text column. Accept both the classic object dtype and pandas'
        # dedicated string dtype so detection does not depend on how the
        # installed pandas version stores strings.
        text_cols = [
            col for col in df.columns
            if df[col].dtype == 'object' or isinstance(df[col].dtype, pd.StringDtype)
        ]
        if not text_cols:
            df['text'] = df.astype(str).agg(' '.join, axis=1)
        else:
            df['text'] = df[text_cols[0]].astype(str)

        # Basic cleaning
        df['text'] = df['text'].str.strip().replace(r'\s+', ' ', regex=True)
        df = df[df['text'].str.len() > 10]  # Remove very short texts

        return df

    except Exception as e:
        print(f"Error processing {filepath}: {str(e)}")
        return pd.DataFrame()

# Load datasets (change filenames as needed)
df = load_and_preprocess('reply_pools_augmented copy.txt')

# The loader signals failure by returning an empty frame. Stop here with a
# clear message instead of letting train_test_split fail on zero samples.
if df.empty:
    raise ValueError("Could not load the dataset")

# Hold out 10% of the rows for validation (fixed seed for reproducibility)
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# Convert to Dataset
dataset = DatasetDict({
    'train': Dataset.from_pandas(train_df),
    'validation': Dataset.from_pandas(val_df)
})

print(f"Training samples: {len(train_df)}")
print(f"Validation samples: {len(val_df)}")


from transformers import GPT2Tokenizer

# GPT-2 ships without a padding token; reuse end-of-text for padding.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch with the module-level tokenizer.

    Sequences are truncated and padded to exactly 256 tokens and the
    encodings are returned as NumPy arrays.
    """
    encode_options = {
        'truncation': True,
        'max_length': 256,
        'padding': 'max_length',
        'return_tensors': 'np',
    }
    batch_texts = examples['text']
    return tokenizer(batch_texts, **encode_options)

# Tokenize both splits in batches and drop the raw text column afterwards.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=['text'],
    num_proc=2
)


from transformers import GPT2LMHeadModel
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Keep the weights in FP32. Mixed precision is requested from the Trainer via
# fp16=(device == 'cuda'); casting the model itself with .half() on top of
# that leaves FP16 parameters for the gradient scaler, which aborts training
# ("Attempting to unscale FP16 gradients").
model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)


from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Training configuration.
# Evaluation, checkpointing and logging run once per epoch. The previous
# step-based schedule (eval/save every 500 steps, log every 100) is longer
# than a whole run on a corpus of a few hundred rows with an effective batch
# of 4 * 4 = 16, so nothing was ever evaluated or saved and
# load_best_model_at_end had no checkpoint to restore.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    eval_strategy='epoch',
    save_strategy='epoch',  # must match eval_strategy for load_best_model_at_end
    logging_strategy='epoch',
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=(device == 'cuda'),  # mixed precision only when a GPU is present
    load_best_model_at_end=True,
    report_to='none',
    save_total_limit=2
)

# Causal-LM collation (mlm=False): labels are derived from the input ids.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator
)

# Start Training
print("Starting training...")
trainer.train()

# Save the final model
model.save_pretrained('./gpt2_finetuned2')
tokenizer.save_pretrained('./gpt2_finetuned2')

# Test the Model
def generate_text(prompt, max_length=100):
    """Sample a continuation of ``prompt`` from the module-level model.

    Args:
        prompt: Text the generation starts from.
        max_length: Value forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The decoded first generated sequence with special tokens removed.
    """
    # Switch off dropout; the model may still be in training mode.
    model.eval()
    # Calling the tokenizer (instead of .encode) also yields the attention
    # mask, which generate() needs because pad and EOS share one token id.
    encoded = tokenizer(prompt, return_tensors='pt').to(device)
    outputs = model.generate(
        **encoded,
        max_length=max_length,
        temperature=0.7,
        top_k=50,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

print("\nGenerated Text Examples:")
print("1:", generate_text("The future of AI is sexy isn't it?"))
print("2:", generate_text("In my opinion,"))
print("3:", generate_text("Once upon a time"))



Training samples: 911
Validation samples: 102


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Map (num_proc=2):   0%|          | 0/911 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/102 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Starting training...


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


In [None]:
import json
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

def load_and_preprocess_json(filepath):
    """Load prompt/completion records from a JSON file into training text.

    Each record becomes
    ``<|startoftext|>Question: {prompt} Answer: {completion}<|endoftext|>``;
    the text is stripped, every whitespace run (including the newline before
    ``Answer:``) is collapsed to a single space, and rows whose text is 10
    characters or shorter are dropped.

    Args:
        filepath: Path to a UTF-8 JSON file holding records with ``prompt``
            and ``completion`` keys.

    Returns:
        A DataFrame with the record columns plus ``text``. On any failure the
        reason is printed and an empty DataFrame is returned.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Convert to dataframe and format for training
        df = pd.DataFrame(data)

        # Fail with an explicit message rather than the opaque pandas error
        # that assigning a column from an empty/mis-keyed frame produces.
        missing = {'prompt', 'completion'} - set(df.columns)
        if df.empty or missing:
            raise ValueError(
                "expected a non-empty list of records with 'prompt' and "
                "'completion' keys"
            )

        # Combine prompt and completion with special tokens
        df['text'] = [
            f"<|startoftext|>Question: {prompt}\nAnswer: {completion}<|endoftext|>"
            for prompt, completion in zip(df['prompt'], df['completion'])
        ]

        # Basic cleaning
        df['text'] = df['text'].str.strip().replace(r'\s+', ' ', regex=True)
        df = df[df['text'].str.len() > 10]  # Remove very short texts

        return df

    except Exception as e:
        print(f"Error processing {filepath}: {str(e)}")
        return pd.DataFrame()

# Load dataset
df = load_and_preprocess_json('date.json')  # Replace with your JSON filename

# The loader signals failure by returning an empty frame. Stop here with a
# clear message instead of letting train_test_split fail on zero samples.
if df.empty:
    raise ValueError("Could not load the dataset")

# Split dataset
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# Convert to Dataset
dataset = DatasetDict({
    'train': Dataset.from_pandas(train_df),
    'validation': Dataset.from_pandas(val_df)
})

print(f"Training samples: {len(train_df)}")
print(f"Validation samples: {len(val_df)}")

from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Use a dedicated padding token. The language-modeling collator ignores every
# label equal to the pad id, so padding with EOS would also hide the real
# <|endoftext|> that closes each example and the model would never learn
# where an answer ends. The model's embeddings are resized to len(tokenizer)
# after loading, which covers the added tokens.
special_tokens_dict = {
    'pad_token': '<|pad|>',
    'additional_special_tokens': ['<|startoftext|>', '<|endoftext|>'],
}
tokenizer.add_special_tokens(special_tokens_dict)

def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch with the module-level tokenizer.

    Sequences are truncated and padded to exactly 256 tokens and the
    encodings are returned as NumPy arrays.
    """
    encode_options = {
        'truncation': True,
        'max_length': 256,
        'padding': 'max_length',
        'return_tensors': 'np',
    }
    batch_texts = examples['text']
    return tokenizer(batch_texts, **encode_options)

# Tokenize both splits in batches and drop the raw text column afterwards.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=['text'],
    num_proc=2
)

from transformers import GPT2LMHeadModel
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Keep the weights in FP32. Mixed precision is requested from the Trainer via
# fp16=(device == 'cuda'); casting the model itself with .half() on top of
# that leaves FP16 parameters for the gradient scaler, which aborts training
# ("Attempting to unscale FP16 gradients").
model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)
# Resize model for new tokens
model.resize_token_embeddings(len(tokenizer))

from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Training configuration for Q&A fine-tuning.
# Evaluation, checkpointing and logging run once per epoch. The previous
# step-based schedule (eval/save every 500 steps, log every 100) is far
# longer than a whole run on a few dozen examples with an effective batch of
# 4 * 4 = 16, so nothing was ever evaluated, saved or logged and
# load_best_model_at_end had no checkpoint to restore.
training_args = TrainingArguments(
    output_dir='./dating_gpt2_results',
    num_train_epochs=5,  # Slightly more epochs for conversational fine-tuning
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    eval_strategy='epoch',
    save_strategy='epoch',  # must match eval_strategy for load_best_model_at_end
    logging_strategy='epoch',
    learning_rate=3e-5,  # Slightly lower learning rate
    weight_decay=0.01,
    fp16=(device == 'cuda'),  # mixed precision only when a GPU is present
    load_best_model_at_end=True,
    report_to='none',
    save_total_limit=2,
    prediction_loss_only=True
)

# Causal-LM collation (mlm=False): labels are derived from the input ids.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator
)

# Start Training
print("Starting training...")
trainer.train()

# Save the final model
model.save_pretrained('./gpt2_dating_qa')
tokenizer.save_pretrained('./gpt2_dating_qa')

# Test the Model with dating-specific prompts
def generate_dating_response(prompt, max_length=150):
    """Sample an answer to ``prompt`` from the module-level fine-tuned model.

    Args:
        prompt: The question text.
        max_length: Value forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The decoded text that follows the first "Answer:" marker, stripped.
        If the marker is absent, the whole decoded text is returned.
    """
    # The training text has every whitespace run collapsed to one space, so
    # "Answer:" is preceded by a space there, not a newline. Use the same
    # layout at inference time.
    formatted_prompt = f"<|startoftext|>Question: {prompt} Answer:"
    # Switch off dropout; the model may still be in training mode.
    model.eval()
    # Calling the tokenizer (instead of .encode) also yields the attention
    # mask that generate() otherwise warns about.
    encoded = tokenizer(formatted_prompt, return_tensors='pt').to(device)
    outputs = model.generate(
        **encoded,
        max_length=max_length,
        temperature=0.7,
        top_k=50,
        top_p=0.9,
        do_sample=True,
        no_repeat_ngram_size=2,
        # early_stopping is a beam-search option and has no effect on
        # sampling, so it is not passed.
        pad_token_id=tokenizer.pad_token_id
    )
    # Decode and clean up the output
    full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Keep everything after the first marker only, so an "Answer:" produced
    # inside the generated text does not truncate the result.
    answer = full_text.split("Answer:", 1)[-1].strip()
    return answer

print("\nGenerated Dating Advice Examples:")
print("1:", generate_dating_response("How can I make a good first impression on a date?"))
print("2:", generate_dating_response("What are some red flags to watch for when dating?"))
print("3:", generate_dating_response("How do I know if I'm ready to start dating again after a breakup?"))

Training samples: 54
Validation samples: 6


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Map (num_proc=2):   0%|          | 0/54 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/6 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Starting training...


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



Generated Dating Advice Examples:


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


1: Don't be afraid to say something like, "I'm going to have a great time with you." Don' t be shy. Just make sure to be honest. It's okay to tell people you're going out, and it's ok to just say it. If you want to get together, tell them you didn't really go out together. Don ' t make up to go anywhere else. And don't try to pretend to do everything together because they're not. They're just friends. You're already making a lot of friends, right? Just be sure you don 't make it up.' Don t ask them if you feel like you


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


2: The red flag of attraction is always the attraction. Keep your eyes peeled for what's going on around you and don't try to be overly emotional or brash. Be open-minded and friendly. When you're alone, be willing to give it a chance. Don't just say, "I love you so much" or "don't pretend I don. It's okay to have a little teasing. I know that you like me so I want to talk about it. Just don' tuck in a lot. If you feel like you've been teased, just don 't be too emotional' and just be open with it, and be honest.
3: When you're ready, you'll always want to have a good relationship. But if you feel like you might end up dating someone else, it may be important to make sure you've already had a great time. Don't try to figure out what's important for you alone, just stay focused on each other. That means you won't feel lonely or angry, and you know you can get along with someone if they don't like your style.
5. Talk about your feelings with other people for a few minutes. Maybe you just

In [None]:
import json
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

def load_and_preprocess_txt(filepath, delimiter='\t'):
    """Load ``prompt<delimiter>completion`` lines into training text.

    Lines that do not split into exactly two fields are skipped. Each kept
    pair becomes
    ``<|startoftext|>Question: {prompt} Answer: {completion}<|endoftext|>``
    with every whitespace run collapsed to a single space; rows whose text is
    10 characters or shorter are dropped.

    Args:
        filepath: Path to a UTF-8 text file.
        delimiter: Separator between prompt and completion (tab by default).

    Returns:
        A DataFrame with ``prompt``, ``completion`` and ``text`` columns. On
        any failure, including a file in which no line matches the expected
        format, the reason is printed and an empty DataFrame is returned.
    """
    try:
        data = []
        with open(filepath, 'r', encoding='utf-8') as f:
            for line in f:
                # Split line into prompt and completion
                parts = line.strip().split(delimiter)
                if len(parts) == 2:
                    data.append({
                        'prompt': parts[0].strip(),
                        'completion': parts[1].strip()
                    })

        # Without this check an empty frame reaches the column assignment
        # below and pandas fails with an unrelated-looking message.
        if not data:
            raise ValueError(
                f"no line splits into exactly two fields on delimiter {delimiter!r}"
            )

        df = pd.DataFrame(data)

        # Format text with special tokens
        df['text'] = (
            "<|startoftext|>Question: " + df['prompt']
            + "\nAnswer: " + df['completion'] + "<|endoftext|>"
        )

        # Basic cleaning
        df['text'] = df['text'].str.replace(r'\s+', ' ', regex=True).str.strip()
        df = df[df['text'].str.len() > 10]  # Remove very short texts

        return df

    except Exception as e:
        print(f"Error processing {filepath}: {str(e)}")
        return pd.DataFrame()

# Load dataset from a tab-delimited "<prompt>\t<completion>" text file
# (change the filename as needed)
df = load_and_preprocess_txt('reply_pools_augmented copy.txt')

# The loader signals failure by returning an empty frame. Stop here with a
# clear message instead of letting train_test_split fail on zero samples.
if df.empty:
    raise ValueError("Could not load the dataset")

# Split dataset
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# Convert to Dataset
dataset = DatasetDict({
    'train': Dataset.from_pandas(train_df[['text']]),  # Keep only 'text' column
    'validation': Dataset.from_pandas(val_df[['text']])
})

print(f"Training samples: {len(train_df)}")
print(f"Validation samples: {len(val_df)}")

from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Use a dedicated padding token. The language-modeling collator ignores every
# label equal to the pad id, so padding with EOS would also hide the real
# <|endoftext|> that closes each example and the model would never learn
# where an answer ends. The model's embeddings are resized to len(tokenizer)
# after loading, which covers the added tokens.
special_tokens_dict = {
    'pad_token': '<|pad|>',
    'additional_special_tokens': ['<|startoftext|>', '<|endoftext|>'],
}
tokenizer.add_special_tokens(special_tokens_dict)

def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch with the module-level tokenizer.

    Sequences are truncated and padded to exactly 256 tokens and the
    encodings are returned as NumPy arrays.
    """
    encode_options = {
        'truncation': True,
        'max_length': 256,
        'padding': 'max_length',
        'return_tensors': 'np',
    }
    batch_texts = examples['text']
    return tokenizer(batch_texts, **encode_options)

# Tokenize both splits in batches and drop the raw text column afterwards.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=['text'],
    num_proc=2
)

from transformers import GPT2LMHeadModel
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Keep the weights in FP32. Mixed precision is requested from the Trainer via
# fp16=(device == 'cuda'); casting the model itself with .half() on top of
# that leaves FP16 parameters for the gradient scaler, which aborts training
# ("Attempting to unscale FP16 gradients").
model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)
# Resize model for new tokens
model.resize_token_embeddings(len(tokenizer))

from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Training configuration for Q&A fine-tuning.
# Evaluation, checkpointing and logging run once per epoch. A step-based
# schedule (eval/save every 500 steps, log every 100) silently does nothing
# when the whole run is shorter than those intervals, which is the case for
# a small corpus with an effective batch of 4 * 4 = 16; load_best_model_at_end
# then has no checkpoint to restore.
training_args = TrainingArguments(
    output_dir='./dating_gpt2_results',
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    eval_strategy='epoch',
    save_strategy='epoch',  # must match eval_strategy for load_best_model_at_end
    logging_strategy='epoch',
    learning_rate=3e-5,
    weight_decay=0.01,
    fp16=(device == 'cuda'),  # mixed precision only when a GPU is present
    load_best_model_at_end=True,
    report_to='none',
    save_total_limit=2,
    prediction_loss_only=True
)

# Causal-LM collation (mlm=False): labels are derived from the input ids.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator
)

# Start Training
print("Starting training...")
trainer.train()

# Save the final model
model.save_pretrained('./gpt2_dating_qa')
tokenizer.save_pretrained('./gpt2_dating_qa')

# Test the Model
def generate_dating_response(prompt, max_length=150):
    """Sample an answer to ``prompt`` from the module-level fine-tuned model.

    Args:
        prompt: The question text.
        max_length: Value forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The decoded text that follows the first "Answer:" marker, stripped.
        If the marker is absent, the whole decoded text is returned.
    """
    # The training text has every whitespace run collapsed to one space, so
    # "Answer:" is preceded by a space there, not a newline. Use the same
    # layout at inference time.
    formatted_prompt = f"<|startoftext|>Question: {prompt} Answer:"
    # Switch off dropout; the model may still be in training mode.
    model.eval()
    # Calling the tokenizer (instead of .encode) also yields the attention
    # mask that generate() otherwise warns about.
    encoded = tokenizer(formatted_prompt, return_tensors='pt').to(device)
    outputs = model.generate(
        **encoded,
        max_length=max_length,
        temperature=0.7,
        top_k=50,
        top_p=0.9,
        do_sample=True,
        no_repeat_ngram_size=2,
        # early_stopping is a beam-search option and has no effect on
        # sampling, so it is not passed.
        pad_token_id=tokenizer.pad_token_id
    )
    full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Keep everything after the first marker only, so an "Answer:" produced
    # inside the generated text does not truncate the result.
    answer = full_text.split("Answer:", 1)[-1].strip()
    return answer

print("\nGenerated Dating Advice Examples:")
print("1:", generate_dating_response("How can I make a good first impression on a date?"))
print("2:", generate_dating_response("What are some red flags to watch for when dating?"))
print("3:", generate_dating_response("How do I know if I'm ready to start dating again after a breakup?"))

Error processing reply_pools_augmented copy.txt: Cannot set a DataFrame without columns to the column text


ValueError: With n_samples=0, test_size=0.1 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [None]:
# Install
!pip install transformers huggingface_hub

# Login
from huggingface_hub import notebook_login
notebook_login()

# Load your fine-tuned model
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "/content/gpt2_finetuned02"
model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Push to Hub
repo_name = "Xara2west/gpt2-finetuned-cone02"  # Customize this!
model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)

print(f"Model uploaded to: https://huggingface.co/{repo_name}")



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

HfHubHTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/api/repos/create (Request ID: Root=1-6840b237-0a8d74183a64dbda1a8817d6;09433ccb-02b4-48bf-9454-7e52b863dcf0)

Invalid username or password.

In [None]:
# Run after training has finished: zips the saved model directory and hands
# the archive to the browser through Colab's download helper.
from google.colab import files
import os
import shutil

model_dir = './gpt2_finetuned2'
if not os.path.exists(model_dir):
    print("Error: Model directory not found. Please train the model first.")
else:
    # Step 1: pack the model directory into gpt2_finetuned.zip
    shutil.make_archive('gpt2_finetuned', 'zip', model_dir)

    # Step 2: trigger the browser download of that archive
    files.download('gpt2_finetuned.zip')

    print("Model downloaded successfully!")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Model downloaded successfully!


In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

def load_and_preprocess_txt(filepath):
    """Read a text file of blank-line-separated conversations into a frame.

    Consecutive non-blank lines are stripped and joined with single spaces to
    form one conversation. The resulting text is stripped, whitespace runs
    are collapsed to one space, and entries of 10 characters or fewer are
    dropped. On any error the message is printed and an empty DataFrame is
    returned.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as handle:
            raw_lines = handle.readlines()

        conversations = []
        pending = []
        # The trailing '' acts as a sentinel blank line so the final
        # conversation is flushed even when the file lacks one.
        for raw in raw_lines + ['']:
            stripped = raw.strip()
            if stripped:
                pending.append(stripped)
            elif pending:
                conversations.append(" ".join(pending))
                pending = []

        frame = pd.DataFrame({'text': conversations})

        # Normalise whitespace, then filter out very short entries.
        cleaned = frame['text'].str.strip().replace(r'\s+', ' ', regex=True)
        frame['text'] = cleaned
        long_enough = frame['text'].str.len() > 10
        return frame[long_enough]

    except Exception as err:
        print(f"Error processing {filepath}: {str(err)}")
        return pd.DataFrame()

# Read the conversation corpus; the loader returns an empty frame on failure.
df = load_and_preprocess_txt('conversations.txt')

if df.empty:
    raise ValueError("Could not load the dataset")

# Hold out 10% of the rows for validation (fixed seed).
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# Wrap each pandas split in a Hugging Face Dataset.
dataset = DatasetDict({
    split_name: Dataset.from_pandas(split_frame)
    for split_name, split_frame in (('train', train_df), ('validation', val_df))
})

print(f"Training samples: {len(train_df)}")
print(f"Validation samples: {len(val_df)}")

from transformers import GPT2Tokenizer

# GPT-2 ships without a padding token; reuse end-of-text for padding.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch with the module-level tokenizer.

    Sequences are truncated and padded to exactly 256 tokens and the
    encodings are returned as NumPy arrays.
    """
    encode_options = {
        'truncation': True,
        'max_length': 256,
        'padding': 'max_length',
        'return_tensors': 'np',
    }
    batch_texts = examples['text']
    return tokenizer(batch_texts, **encode_options)

# Tokenize both splits in batches and drop the raw text column afterwards.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=['text'],
    num_proc=2
)

from transformers import GPT2LMHeadModel
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Keep the weights in FP32. Mixed precision is requested from the Trainer via
# fp16=(device == 'cuda'); casting the model itself with .half() on top of
# that leaves FP16 parameters for the gradient scaler, which aborts training
# ("Attempting to unscale FP16 gradients").
model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)

from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Training configuration.
# Evaluation, checkpointing and logging run once per epoch. A step-based
# schedule (eval/save every 500 steps, log every 100) silently does nothing
# when the whole run is shorter than those intervals, which happens on a
# small corpus with an effective batch of 4 * 4 = 16; load_best_model_at_end
# then has no checkpoint to restore.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    eval_strategy='epoch',
    save_strategy='epoch',  # must match eval_strategy for load_best_model_at_end
    logging_strategy='epoch',
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=(device == 'cuda'),  # mixed precision only when a GPU is present
    load_best_model_at_end=True,
    report_to='none',
    save_total_limit=2
)

# Causal-LM collation (mlm=False): labels are derived from the input ids.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator
)

# Start Training
print("Starting training...")
trainer.train()

# Save the final model
model.save_pretrained('./gpt2_finetuned')
tokenizer.save_pretrained('./gpt2_finetuned')

# Test the Model with repetition control
def generate_text(prompt, max_length=100):
    """Sample a continuation of ``prompt`` from the module-level model.

    Args:
        prompt: Text the generation starts from.
        max_length: Value forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The decoded first generated sequence with special tokens removed.
    """
    # Switch off dropout; the model may still be in training mode.
    model.eval()
    # Calling the tokenizer (instead of .encode) also yields the attention
    # mask, which generate() needs because pad and EOS share one token id.
    encoded = tokenizer(prompt, return_tensors='pt').to(device)
    outputs = model.generate(
        **encoded,
        max_length=max_length,
        temperature=0.7,
        top_k=50,
        top_p=0.9,
        do_sample=True,
        repetition_penalty=1.5,  # Added to prevent repetition
        pad_token_id=tokenizer.eos_token_id
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

print("\nGenerated Text Examples:")
print("1:", generate_text("User: Hey there\nAI:"))
print("2:", generate_text("User: What do you like to do for fun?\nAI:"))
print("3:", generate_text("User: Tell me something flirty\nAI:"))

Error processing conversations.txt: [Errno 2] No such file or directory: 'conversations.txt'


ValueError: Could not load the dataset