In [1]:
# Install pandas into the running kernel's environment (%pip, unlike !pip, targets the kernel).
# numpy, imported in the next cell, is pulled in as a pandas dependency.
%pip install pandas




In [2]:
import pandas as pd
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
def read_ROCstories(csv_path):
    """
    Read a ROCStories CSV file and merge each story's sentences into one string.

    Every row of the file is one story whose text is spread across the columns
    ``sentence1`` .. ``sentence5``. The sentences of a row are joined, in column
    order, with a single space between them.

    Parameters:
    - csv_path: Path to the CSV file containing the stories.

    Returns:
    - A pandas Series with one concatenated story string per CSV row.

    Raises:
    - KeyError: If any of the ``sentence1`` .. ``sentence5`` columns is missing.
    """
    sentence_columns = ['sentence1', 'sentence2', 'sentence3', 'sentence4', 'sentence5']

    df = pd.read_csv(csv_path)

    def join_sentences(row):
        # Empty CSV cells are parsed as NaN (a float), which str.join rejects
        # with a TypeError; skip them so one incomplete row cannot abort the
        # whole load. str() guards against cells parsed as numbers.
        return ' '.join(str(sentence) for sentence in row if pd.notna(sentence))

    stories = df[sentence_columns].agg(join_sentences, axis=1)

    return stories

In [4]:
def format_and_save(stories_array, output_path, delimiter="<|endoftext|>"):
    """
    Write all stories to a single UTF-8 text file, separated by a delimiter.

    Consecutive stories are separated by ``delimiter`` followed by a newline.
    The separator is only placed between stories, so nothing is appended
    after the last one. An existing file at ``output_path`` is overwritten.

    Parameters:
    - stories_array: Iterable of story strings.
    - output_path: Path of the text file to write.
    - delimiter: String placed (before the newline) between stories.
      Defaults to "<|endoftext|>".
    """
    separator = f"{delimiter}\n"
    joined_text = separator.join(stories_array)

    with open(output_path, mode='w', encoding='utf-8') as out_file:
        out_file.write(joined_text)

In [5]:
def format_wrap_and_save(stories_array, output_path, delimiter="", start_token="", end_token=""):
    """
    Wrap each story in optional start/end tokens and write them to a text file.

    Each story becomes ``start_token + story + end_token``; the wrapped stories
    are then joined with ``delimiter`` followed by a newline. The separator is
    only placed between stories, so nothing is appended after the last one.
    The file is written as UTF-8 and an existing file is overwritten.

    Parameters:
    - stories_array: Iterable of story strings.
    - output_path: Path of the text file to write.
    - delimiter: String placed (before the newline) between stories.
      Defaults to "" (stories are separated by a bare newline).
    - start_token: String prepended to every story, e.g. "<s>". Defaults to "".
    - end_token: String appended to every story, e.g. "</s>". Defaults to "".
    """
    formatted_stories = [f"{start_token}{story}{end_token}" for story in stories_array]
    all_stories = (delimiter + "\n").join(formatted_stories)

    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(all_stories)

In [6]:
# Load the ROCStories spring 2016 and winter 2017 releases and pool their stories in one list
datasets = ['ROCStories_spring2016', 'ROCStories_winter2017']

all_stories = []
for dataset_name in datasets:
    csv_path = f'./datasets/raw/{dataset_name}.csv'
    # Per-dataset output location; assigned here but not read by any visible cell.
    output_path = f'./datasets/formatted/{dataset_name}.txt'
    read_stories = read_ROCstories(csv_path)
    # Convert the returned stories to a plain Python list and append them to the pool.
    all_stories.extend(np.asarray(read_stories).tolist())

In [7]:
# Shuffling stories and splitting into training and testing data
train_test_ratio = 0.8
random_seed = 42  # fixed seed so the train/test split is identical on every run

# dtype=object stores references to the story strings; the default would build
# a fixed-width unicode array in which every element is padded to the longest story.
stories_array = np.array(all_stories, dtype=object)
np.random.default_rng(random_seed).shuffle(stories_array)

split_index = int(len(stories_array) * train_test_ratio)

train_data = stories_array[:split_index].tolist()
test_data = stories_array[split_index:].tolist()

# Sanity check: the split must neither lose nor duplicate stories.
assert len(train_data) + len(test_data) == len(all_stories)

In [8]:
import os

directory_path = "./datasets/formatted/ROCStories"

# exist_ok=True makes the creation safe to re-run and avoids the
# check-then-create race of testing for existence before calling makedirs.
directory_existed = os.path.isdir(directory_path)
os.makedirs(directory_path, exist_ok=True)
if not directory_existed:
    print("Directory created:", directory_path)

# Saving training and testing dataset
format_wrap_and_save(train_data, os.path.join(directory_path, "train_uf.txt"))
format_wrap_and_save(test_data, os.path.join(directory_path, "test_uf.txt"))