In [None]:
import json
import csv

# Input and output file paths
input_file = "humarin-chatgpt-paraphrases.txt"
output_file = "output.csv"

# Convert the JSON-lines input (one JSON object per line) into a two-column CSV.
# Both files are opened as UTF-8 explicitly: without `encoding=` the input would
# be decoded with the platform default, which can corrupt or reject non-ASCII
# text on systems whose default is not UTF-8.
with open(input_file, 'r', encoding='utf-8') as infile, \
        open(output_file, 'w', newline='', encoding='utf-8') as outfile:
    # newline='' lets the csv module control line endings itself
    csv_writer = csv.writer(outfile)

    # Write the header row
    csv_writer.writerow(["Prompt", "Completion"])

    # Process each line in the input file
    for line in infile:
        record = line.strip()
        if not record:
            # Blank lines (e.g. a trailing newline) are not records; skip them
            # quietly instead of reporting them as invalid JSON.
            continue
        try:
            # Parse the JSON object
            data = json.loads(record)
            # Write the values to the CSV
            csv_writer.writerow([data["prompt"], data["completion"]])
        except json.JSONDecodeError:
            print(f"Skipping invalid JSON line: {line}")
        except (KeyError, TypeError):
            # TypeError covers valid JSON that is not an object (list, number, ...),
            # which cannot be indexed by the expected keys.
            print(f"Skipping line with missing keys: {line}")


In [None]:
import pandas as pd
import numpy as np

# Load the Prompt/Completion table written by the conversion step
df = pd.read_csv(filepath_or_buffer='output.csv')

In [None]:
# Tag every run of 5 consecutive rows with a shared integer block id.
# Integer floor-division keeps the ids as ints rather than the floats that
# np.floor(index / 5) produced.
# NOTE(review): presumably each block of 5 rows belongs to one source prompt —
# confirm against the dataset layout.
df['group'] = df.index // 5

# Keep one randomly chosen row per block. GroupBy.sample with a fixed seed makes
# the selection reproducible across kernel restarts, and it keeps the 'group'
# column in the result (a later cell drops it explicitly).
filtered_df = df.groupby('group').sample(n=1, random_state=42).reset_index(drop=True)

filtered_df.head()

In [None]:
# Produce a seeded random permutation of the sampled rows, then renumber them 0..n-1
shuffled_df = filtered_df.sample(frac=1, random_state=42)
shuffled_df = shuffled_df.reset_index(drop=True)
shuffled_df.head()

In [None]:
# Build negative (label 0) pairs: each prompt is paired with the completion of
# the NEXT row of the shuffled frame, so a prompt is never paired with the
# completion from its own row.
#
# Both columns are taken from `shuffled_df`. Taking 'Prompt' from `filtered_df`
# (as before) re-aligned on the index and paired each prompt with the completion
# of an arbitrary shuffled row, which could be its own, yielding a true pair
# mislabelled as 0.
#
# The shift is applied to a temporary Series instead of being written back into
# `shuffled_df`, so re-running this cell gives the same result.
# NOTE(review): rows are guaranteed to differ, texts are not — duplicate
# completions in the data could still produce a matching pair.
new_df = pd.DataFrame({
    'Prompt': shuffled_df['Prompt'],
    'Completion': shuffled_df['Completion'].shift(-1),
    'label': 0
}).dropna(subset=['Prompt', 'Completion']).reset_index(drop=True)
# dropna removes the last row (no successor after the shift) and any pair with a
# missing text.

In [None]:
# Display the label-0 (mismatched) Prompt/Completion pairs for a visual check
new_df

In [None]:
# Mark the sampled genuine pairs as positives (label 1) and remove the helper
# 'group' column. Rebinding the name with errors='ignore' (instead of an in-place
# drop) keeps the cell re-runnable: a second run no longer raises KeyError
# because 'group' is already gone.
filtered_df = filtered_df.drop(columns=['group'], errors='ignore').assign(label=1)
filtered_df

In [None]:
# Stack the positive pairs on top of the negative pairs under a fresh 0..n-1 index
df_extended = pd.concat((filtered_df, new_df), axis=0, ignore_index=True)
df_extended

In [None]:
# Print pandas' summary of `df` (columns, non-null counts, dtypes)
df.info()

In [None]:
# Load the second dataset from its parquet shard and preview the first rows
df2 = pd.read_parquet(path='train-00000-of-00001.parquet')
df2.head(n=5)

In [None]:
# Remove the unused 'text' column. errors='ignore' plus rebinding (rather than an
# in-place drop) keeps this cell re-runnable: a second run no longer raises
# KeyError because 'text' is already gone.
df2 = df2.drop(columns=['text'], errors='ignore')

# Flatten each 'claims' value to plain text: take its string representation and
# strip the bracket, quote and newline characters that representation contains.
# NOTE(review): presumably 'claims' holds a list/array of strings per row —
# confirm against the parquet schema.
df2['claims'] = df2['claims'].apply(
    lambda claim: str(claim)
    .replace('[', '')
    .replace(']', '')
    .replace("'", "")
    .replace('"', '')
    .replace('\n', '')
)
df2

In [None]:
import matplotlib.pyplot as plt

# Pie chart of how many rows carry each label in the first pair set
df_extended.groupby('label').size().plot.pie(autopct='%.2f')

In [None]:
# Tag every run of 2 consecutive rows with a shared integer block id
# (the earlier comment said 5, but the divisor has always been 2).
# A later cell drops this 'group' column from df2 again.
df2['group'] = df2.index // 2

# Keep one randomly chosen row per block; the fixed seed makes the selection
# reproducible across kernel restarts.
filtered_df2 = df2.groupby('group').sample(n=1, random_state=42).reset_index(drop=True)

# Seeded random permutation of the sampled rows
shuffled_df2 = filtered_df2.sample(frac=1, random_state=42).reset_index(drop=True)

# Build negative (label 0) pairs: each claim is paired with the paraphrase of the
# NEXT row of the shuffled frame, so a claim is never paired with the paraphrase
# from its own row.
#
# Both columns come from `shuffled_df2`. Taking the prompt from `filtered_df2`
# (as before) re-aligned on the index and paired each claim with the paraphrase
# of an arbitrary shuffled row, which could be its own.
#
# The shift is applied to a temporary Series, so `shuffled_df2` is not mutated
# and the cell is re-runnable. Only the two text columns are checked for missing
# values, so a NaN in an unrelated column no longer discards a pair.
new_df2 = pd.DataFrame({
    'Prompt': shuffled_df2['claims'],
    'Completion': shuffled_df2['paraphrase'].shift(-1),
    'label': 0
}).dropna(subset=['Prompt', 'Completion']).reset_index(drop=True)

new_df2

In [None]:
# Turn df2 into the positive (label 1) pair set with the shared
# Prompt/Completion/label schema. Rebinding with errors='ignore' instead of
# in-place drops keeps the cell re-runnable: on a second run the columns are
# already removed/renamed and every step is a no-op instead of a KeyError.
df2 = (
    df2.drop(columns=['title', 'group'], errors='ignore')
    .rename(columns={'claims': 'Prompt', 'paraphrase': 'Completion'})
    .assign(label=1)
)
df2

In [None]:
# Stack the positive df2 pairs on top of their negative pairs under a fresh 0..n-1 index
df_extended2 = pd.concat((df2, new_df2), axis=0, ignore_index=True)
df_extended2

In [None]:
# Pie chart of how many rows carry each label in the second pair set
df_extended2.groupby('label').size().plot.pie(autopct='%.2f')

In [None]:
# Merge both labelled pair sets into one frame, then discard every row that
# still contains a missing value in any column
combined_data = pd.concat((df_extended, df_extended2), ignore_index=True).dropna()
combined_data

In [None]:
# Pie chart of how many rows carry each label in the combined training set
combined_data.groupby('label').size().plot.pie(autopct='%.2f')

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader

# Step 1: Load Your Dataset
# CSV of labelled sentence pairs; point this at your own file if it lives elsewhere
dataset_path = 'combined_data.csv'
df = pd.read_csv(dataset_path)

# The frames built earlier in this notebook use 'Prompt'/'Completion' column
# names; accept that schema too by mapping it onto the names used below.
# Files that already have text_1/text_2 are left untouched.
if 'text_1' not in df.columns and 'text_2' not in df.columns:
    df = df.rename(columns={'Prompt': 'text_1', 'Completion': 'text_2'})

# Check if the dataset has the required columns
assert all(col in df.columns for col in ['text_1', 'text_2', 'label']), "Dataset must have 'text_1', 'text_2', and 'label' columns."

# read_csv turns empty cells and literal strings such as "NA", "null" or "None"
# into NaN. A NaN text would reach the tokenizer as a float and a NaN label
# would poison the loss, so drop those rows before building examples.
df = df.dropna(subset=['text_1', 'text_2', 'label'])

# Step 2: Convert Dataset into InputExample Format
# Zipping the columns avoids iterrows(), which builds a Series per row and is
# very slow on large frames; str() guards against non-string text values.
train_data = [
    InputExample(texts=[str(text_1), str(text_2)], label=float(label))
    for text_1, text_2, label in zip(df['text_1'], df['text_2'], df['label'])
]

# Step 3: Load Pretrained Sentence Transformer
model = SentenceTransformer('paraphrase-mpnet-base-v2')  # You can choose any suitable pre-trained model

# Step 4: Prepare the DataLoader
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=16)  # Adjust batch_size as needed

# Step 5: Define the Loss Function
# The float label (1.0 for matching pairs, 0.0 for mismatched pairs) is the
# target value passed to the cosine-similarity loss
train_loss = losses.CosineSimilarityLoss(model)

# Step 6: Fine-Tune the Model
output_path = './output/sentence-transformer-plagiarism-model'  # Path to save the fine-tuned model
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=4,  # Adjust epochs as needed
    warmup_steps=100,  # Adjust warmup steps as needed
    output_path=output_path
)

print(f"Fine-tuned model saved to: {output_path}")


  0%|          | 0/234884 [00:00<?, ?it/s]