In [1]:
import pandas as pd

# Assuming 'data.csv' is your original dataset file
data = pd.read_csv('/kaggle/input/amazon/Reviews.csv')

# Define the number of rows to sample
sample_size = 5000

# Assuming you want to sample the first 100,000 rows
sampled_data = data[:sample_size]

# Save the sampled data to a new CSV file
sampled_data.to_csv('sampled_amazon_data.csv', index=False)


In [2]:
sampled_data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [3]:
import pandas as pd

# Assuming 'data.csv' is your original dataset file
data = pd.read_csv('/kaggle/working/sampled_amazon_data.csv')

# Keep only the 'Text' and 'Summary' columns
data_subset = data[['Summary', 'Text']]

# Save the subset data to a new CSV file
data_subset.to_csv('amazon_text_summary_subset.csv', index=False)


In [4]:
data_subset.head()

Unnamed: 0,Summary,Text
0,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,"""Delight"" says it all",This is a confection that has been around a fe...
3,Cough Medicine,If you are looking for the secret ingredient i...
4,Great taffy,Great taffy at a great price. There was a wid...


In [5]:
import pandas as pd
import spacy
import string
from spacy.lang.en.stop_words import STOP_WORDS
from concurrent.futures import ProcessPoolExecutor
import multiprocessing

# Load the English language model in spaCy
nlp = spacy.load("en_core_web_sm")

# Load the dataset
data = pd.read_csv('amazon_text_summary_subset.csv')  # Assuming you've saved the data subset as a CSV file


# Define a function for text preprocessing using spaCy
def preprocess_text(text):
    # Check if the input is a string
    if isinstance(text, str):
        # Convert text to lowercase
        text = text.lower()
        
        # Remove punctuation
        text = text.translate(str.maketrans('', '', string.punctuation))
        
        # Tokenize text
        doc = nlp(text)
        
        # Remove stopwords and lemmatize
        tokens = [token.lemma_ for token in doc if token.text.lower() not in STOP_WORDS]
        
        # Join tokens back into a string
        preprocessed_text = ' '.join(tokens)
        
        return preprocessed_text
    else:
        # If the input is not a string, return an empty string
        return ""

# Define a parallel processing function
def parallel_preprocess(chunk):
    processed_rows = 0
    total_processed = 0  # Track total processed rows
    for i, row in chunk.iterrows():
        chunk.at[i, 'Text'] = preprocess_text(row['Text'])
        chunk.at[i, 'Summary'] = preprocess_text(row['Summary'])
        processed_rows += 1
        total_processed += 1
        if processed_rows % 2000 == 0:
            print(f"Processed {total_processed} rows...")
            processed_rows = 0  # Reset processed_rows counter
    print(f"Processed {total_processed} rows...")
    return chunk

# Split the dataset into chunks
num_cores = multiprocessing.cpu_count()
chunk_size = len(data) // num_cores
data_chunks = [data[i:i + chunk_size] for i in range(0, len(data), chunk_size)]

# Process data chunks in parallel
with ProcessPoolExecutor() as executor:
    preprocessed_chunks = list(executor.map(parallel_preprocess, data_chunks))

# Concatenate preprocessed chunks into a single DataFrame
preprocessed_data = pd.concat(preprocessed_chunks)

# Save preprocessed data to a new CSV file
preprocessed_data.to_csv('preprocessed_amazon_text_summary.csv', index=False)

print("Preprocessing completed!")



Processed 1250 rows...
Processed 1250 rows...
Processed 1250 rows...
Processed 1250 rows...
Preprocessing completed!


In [6]:
preprocessed_data.head()

Unnamed: 0,Summary,Text
0,good quality dog food,buy vitality can dog food product find good qu...
1,advertise,product arrive label jumbo salt peanutsthe pea...
2,delight say,confection century light pillowy citrus gela...
3,cough medicine,look secret ingredient robitussin believe find...
4,great taffy,great taffy great price wide assortment yumm...


In [7]:
import pandas as pd

# Read the CSV file
df = pd.read_csv('/kaggle/working/preprocessed_amazon_text_summary.csv')

# Drop rows with empty text in the 'Summary' column
df = df.dropna(subset=['Summary'])

# Reset index after dropping rows
df = df.reset_index(drop=True)

# Save the modified DataFrame back to a CSV file
df.to_csv('modified_file.csv', index=False)


In [8]:
import csv
import random

def split_csv(input_file, output_file1, output_file2, split_ratio=0.75):
    # Read CSV file
    with open(input_file, 'r', newline='') as csv_file:
        csv_reader = csv.reader(csv_file)
        header = next(csv_reader)  # Assuming the first row is header
        data = list(csv_reader)

    # Shuffle data randomly
    random.shuffle(data)

    # Calculate split indices
    split_index = int(len(data) * split_ratio)

    # Split data
    data1 = data[:split_index]
    data2 = data[split_index:]

    # Write to CSV files
    with open(output_file1, 'w', newline='') as csv_file1:
        csv_writer1 = csv.writer(csv_file1)
        csv_writer1.writerow(header)
        csv_writer1.writerows(data1)

    with open(output_file2, 'w', newline='') as csv_file2:
        csv_writer2 = csv.writer(csv_file2)
        csv_writer2.writerow(header)
        csv_writer2.writerows(data2)

# Example usage:
input_file = '/kaggle/working/modified_file.csv'
output_file1 = 'train.csv'  # 75% data
output_file2 = 'test.csv'  # 25% data
split_csv(input_file, output_file1, output_file2)


In [9]:

!pip install transformers



In [10]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

2024-04-23 04:42:50.042383: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-23 04:42:50.042517: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-23 04:42:50.176016: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [11]:
import csv

def csv_to_text_file(csv_file, output_file):
    with open(output_file, 'w', encoding='utf-8') as output:
        with open(csv_file, 'r', newline='', encoding='utf-8') as file:
            reader = csv.reader(file)
            next(reader)  # Skip the header row
            for row in reader:
                text = row[1]
                summary = row[0]
                output.write("[T] " + text + "\n[S] " + summary + "\n\n")

# Example usage:
csv_file = '/kaggle/working/train.csv'        # Replace 'your_file.csv' with the path to your CSV file
output_file = 'reviews_train.txt'   # Replace 'output_text.txt' with the desired output text file path
csv_to_text_file(csv_file, output_file)


In [12]:
import csv

def csv_to_text_file(csv_file, output_file):
    with open(output_file, 'w', encoding='utf-8') as output:
        with open(csv_file, 'r', newline='', encoding='utf-8') as file:
            reader = csv.reader(file)
            next(reader)  # Skip the header row
            for row in reader:
                text = row[1]

                output.write("[T] " + text  + "\n\n")

# Example usage:
csv_file = '/kaggle/working/test.csv'        # Replace 'your_file.csv' with the path to your CSV file
output_file = 'reviews_test.txt'   # Replace 'output_text.txt' with the desired output text file path
csv_to_text_file(csv_file, output_file)


In [13]:
def load_dataset(file_path, tokenizer, block_size=128):
    dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )
    return dataset


def load_data_collator(tokenizer, mlm=False):
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=mlm,
    )
    return data_collator


def train(train_file_path, model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    tokenizer.save_pretrained(output_dir)

    model = GPT2LMHeadModel.from_pretrained(model_name)
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()


In [14]:
train_file_path = "/kaggle/working/reviews_train.txt"
model_name = 'gpt2'
output_dir = '/kaggle/working/result'
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 15
save_steps = 500

In [15]:
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
500,4.6363
1000,4.1947
1500,3.9246
2000,3.7473
2500,3.6025
3000,3.5171


In [16]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer

In [17]:
def load_model(model_path):
    model = GPT2LMHeadModel.from_pretrained(model_path)
    return model


def load_tokenizer(tokenizer_path):
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
    return tokenizer


def generate_text(sequence, max_length):
    model_path = "/kaggle/working/result"
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)
    ids = tokenizer.encode(f'{sequence}', return_tensors='pt')
    final_outputs = model.generate(
        ids,
        do_sample=True,
        max_length=max_length,
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    print(tokenizer.decode(final_outputs[0], skip_special_tokens=True))

In [24]:
sequence = input() # oil price
max_len = int(input()) # 20
generate_text(sequence, max_len)

 try different brand hot cocoa dark milk grove square milk chocolate kcup far good   big critic family give thumb   recommend enjoy hot cocoa purchase brand
 100


try different brand hot cocoa dark milk grove square milk chocolate kcup far good   big critic family give thumb   recommend enjoy hot cocoa purchase brand kcup
[S] great kcup

[T] love popchip like taste crunchy flavor fresh little crunchy lot flavor small bag buy bag
[S] popchip

[T] purchase item online like item online grocery store try product good   little pricey not know ill order againbr br product taste great 


In [22]:
!pip install rouge


Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [25]:
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge

# Example generated and reference texts
generated_text = "great kcup"
reference_text = "hot cocoa kcup" 

# Initialize ROUGE
rouge = Rouge()

# Calculate ROUGE scores
scores = rouge.get_scores(generated_text, reference_text)

# Print ROUGE scores
print(scores)


[{'rouge-1': {'r': 0.3333333333333333, 'p': 0.5, 'f': 0.39999999520000007}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 0.3333333333333333, 'p': 0.5, 'f': 0.39999999520000007}}]
