In [1]:
!pip install transformers torch
!pip install pandas
!pip install transformers[torch]
!pip install accelerate -U



In [2]:
import pandas as pd
import re
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Load dataset
file_path = '/content/drive/MyDrive/quotes.csv'  # Adjust the path accordingly
df = pd.read_csv(file_path)

df.head()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,quote,author,category
0,"I'm selfish, impatient and a little insecure. ...",Marilyn Monroe,"attributed-no-source, best, life, love, mistak..."
1,You've gotta dance like there's nobody watchin...,William W. Purkey,"dance, heaven, hurt, inspirational, life, love..."
2,You know you're in love when you can't fall as...,Dr. Seuss,"attributed-no-source, dreams, love, reality, s..."
3,A friend is someone who knows all about you an...,Elbert Hubbard,"friend, friendship, knowledge, love"
4,Darkness cannot drive out darkness: only light...,"Martin Luther King Jr., A Testament of Hope: T...","darkness, drive-out, hate, inspirational, ligh..."


In [3]:
#Reading the data

# Display the shape of the dataset
print("\nShape of the dataset:")
print(df.shape)

# Display the columns and their data types
print("\nColumns and their data types:")
print(df.dtypes)

# Display the number of missing values in each column
print("\nNumber of missing values in each column:")
print(df.isnull().sum())

# Display basic statistics for numerical columns
print("\nBasic statistics for numerical columns:")
print(df.describe())

# Display basic statistics for categorical columns
print("\nBasic statistics for categorical columns:")
print(df.describe(include=['O']))

# Display the number of unique values in each column
print("\nNumber of unique values in each column:")
print(df.nunique())


Shape of the dataset:
(499709, 3)

Columns and their data types:
quote       object
author      object
category    object
dtype: object

Number of missing values in each column:
quote          1
author      1753
category      63
dtype: int64

Basic statistics for numerical columns:
                                                    quote           author  \
count                                              499708           497956   
unique                                             493789           117296   
top     The best way to remember your wife's birthday ...  Debasish Mridha   
freq                                                   18             6601   

                                                 category  
count                                              499646  
unique                                             367918  
top     education, happiness, hope, inspirational, int...  
freq                                                 1648  

Basic statistics for cat

In [4]:
# Function to clean text
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    text = re.sub(r'\W', ' ', text)   # Remove all non-word characters
    text = text.lower()               # Convert text to lowercase
    return text.strip()

# Apply cleaning to the quote column
df['cleaned_quote'] = df['quote'].apply(lambda x: clean_text(str(x)))

# Display the cleaned data
df.head()


Unnamed: 0,quote,author,category,cleaned_quote
0,"I'm selfish, impatient and a little insecure. ...",Marilyn Monroe,"attributed-no-source, best, life, love, mistak...",i m selfish impatient and a little insecure ...
1,You've gotta dance like there's nobody watchin...,William W. Purkey,"dance, heaven, hurt, inspirational, life, love...",you ve gotta dance like there s nobody watchin...
2,You know you're in love when you can't fall as...,Dr. Seuss,"attributed-no-source, dreams, love, reality, s...",you know you re in love when you can t fall as...
3,A friend is someone who knows all about you an...,Elbert Hubbard,"friend, friendship, knowledge, love",a friend is someone who knows all about you an...
4,Darkness cannot drive out darkness: only light...,"Martin Luther King Jr., A Testament of Hope: T...","darkness, drive-out, hate, inspirational, ligh...",darkness cannot drive out darkness only light...


In [5]:
# Sample 3,000 random rows
df_sampled = df.sample(n=3000, random_state=42)

# Save cleaned quotes to a text file
with open('quotes_sampled.txt', 'w') as f:
    for quote in df_sampled['cleaned_quote']:
        f.write(f"{quote}\n")


In [6]:
# Display sampled data statistics
print("\nSampled Dataset Statistics:")
print(f"Number of rows: {len(df_sampled)}")
print(df_sampled.describe(include='all'))
print(df_sampled.head())


Sampled Dataset Statistics:
Number of rows: 3000
                                                    quote              author  \
count                                                3000                2989   
unique                                               2996                2554   
top     I'm a great believer in luck. I find the harde...  Lailah Gifty Akita   
freq                                                    2                  23   

       category                                      cleaned_quote  
count      3000                                               3000  
unique     2791                                               2996  
top      poetry  i m a great believer in luck  i find the harde...  
freq          8                                                  2  
                                                    quote  \
179178  The sting of her abandonment had not lessened ...   
183253  Everything that falls upon the eye is an appar...   
84139   I don't 

In [8]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset, load_metric
import numpy as np
import torch
import matplotlib.pyplot as plt

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # Set pad_token to eos_token for GPT2

model = GPT2LMHeadModel.from_pretrained('gpt2')

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=1,  # Reduced batch size
    gradient_accumulation_steps=2,  # Gradient accumulation
    save_steps=500,
    save_total_limit=2,
    evaluation_strategy="epoch",  # Evaluate every epoch
)

# Load and tokenize dataset
def load_and_tokenize_dataset(df_sampled):
    dataset = load_dataset('text', data_files={'train': df_sampled, 'validation': df_sampled})

    # Tokenize the dataset
    def tokenize_function(examples):
        return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)

    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    return tokenized_datasets

tokenized_datasets = load_and_tokenize_dataset('quotes_sampled.txt')
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Load accuracy metric
accuracy_metric = load_metric('accuracy')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = labels.flatten()
    predictions = predictions.flatten()
    return accuracy_metric.compute(predictions=predictions, references=labels)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Save the model
trainer.save_model('./fine_tuned_gpt2')




Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

  accuracy_metric = load_metric('accuracy')


Epoch,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 5.37 GiB. GPU 

In [None]:
# Plotting the evaluation accuracy
plt.figure(figsize=(10, 6))
plt.plot(eval_accuracy, label='Evaluation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.title('Evaluation Accuracy')
plt.legend()
plt.show()

# Plotting training loss and evaluation accuracy
train_logs = trainer.state.log_history
train_loss = [metric['loss'] for metric in train_logs if 'loss' in metric]
eval_accuracy = [metric['eval_accuracy'] for metric in train_logs if 'eval_accuracy' in metric]

# Plotting the training loss
plt.figure(figsize=(10, 6))
plt.plot(train_loss, label='Training Loss')
plt.xlabel('Steps')
plt.ylabel('Loss')
plt.title('Training Loss')
plt.legend()
plt.show()

In [None]:
# Load the fine-tuned model
model = GPT2LMHeadModel.from_pretrained('./fine_tuned_gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Function to generate a quote based on a category
def generate_quote(category):
    prompt = f"Motivational quote about {category}: "
    inputs = tokenizer.encode(prompt, return_tensors='pt')
    outputs = model.generate(inputs, max_length=50, num_return_sequences=1)
    quote = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return quote

# Example usage
category = input("Enter a category: ")
quote = generate_quote(category)
print(f"Motivational quote about {category}: {quote}")
