In [2]:
from google.colab import drive
drive.mount('/content/drive')

%cd "/content/drive/MyDrive/DataEngineering"
%ls

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/DataEngineering
[0m[01;34m'API Keys'[0m/   [01;34mDatasets[0m/   [01;34mMergedDataset[0m/   [01;34mNotebooks[0m/  [01;34m'OpenAI Samples'[0m/


In [7]:
import glob
import os
import random
import json
from pprint import pprint

import pandas as pd
import numpy as np

# local module
import Notebooks.helperFunctions as helperFunctions

In [4]:
SEED = 512

In [5]:
get_max_category = lambda d: list(sorted(d.items(), reverse=True))[0][0]

def get_classification(path_to_history, df, dtype='U60'):
    topics = np.array(["unclassified"]*df.shape[0], dtype=dtype)

    if os.path.exists(path_to_history):
        history = helperFunctions.read_json(path_to_history)
        classified_from_history = np.array(list(history.keys())).astype(int)
        classified_from_history_values = np.array(list(map(get_max_category, 
                                                           history.values())))
        topics[classified_from_history] = classified_from_history_values

    return topics

In [6]:
i = 1
df = pd.read_csv(f"Datasets/dataset{i}/df{i}.csv")
path_to_history = f"Datasets/dataset{i}/classification_history{i}.json"
df['Google Classification'] = get_classification(path_to_history, df)


mask  = df['Google Classification']!='unknown'
mask &= df['Google Classification']!='error'
mask &= df['Google Classification']!='unclassified'

df[mask]['Google Classification'].apply(lambda x: x.split("/")[1]).value_counts()

Arts & Entertainment       1633
Food & Drink                741
Jobs & Education            475
Travel                      442
Shopping                    351
Business & Industrial       304
Sports                      289
People & Society            288
Finance                     210
Health                      207
Beauty & Fitness            176
Home & Garden               171
Hobbies & Leisure           146
Autos & Vehicles            127
Online Communities          117
Law & Government            111
News                        100
Real Estate                  96
Computers & Electronics      88
Books & Literature           79
Pets & Animals               77
Reference                    75
Sensitive Subjects           64
Games                        60
Internet & Telecom           55
Science                      49
Name: Google Classification, dtype: int64

In [24]:
random.seed(SEED)

datasets_info = [helperFunctions.read_json(f) for f in glob.glob("Datasets/dataset*/info*.json")]
datasets_info.sort(key=lambda x: x['order'])

dataset_names = [datasets_info[i]['name'] for i in range(len(datasets_info))]

dataset_names = ['DialogSum', 
                 'DailyDialog', 
                 'Cornell Movie--Dialogs Corpus',
                 'Commonsense-Dialogues',
                 'EmpatheticDialogues',
                 'MultiWOZ 2.2',
                 'Taskmaster-2']

openAI_sample = []

lookup_category = 'Arts & Entertainment'
lookup_category = 'Food & Drink'

for i, name in enumerate(dataset_names, 1):
    print(f"Getting samples from '{name}'... ")
    df = pd.read_csv(f"Datasets/dataset{i}/df{i}.csv")
    path_to_history = f"Datasets/dataset{i}/classification_history{i}.json"
    df['Google Classification'] = get_classification(path_to_history, df)

    df = df[['dialogue', 'Google Classification']]

    mask = df['Google Classification']!='unknown'
    mask &= df['Google Classification']!='error'
    mask &= df['Google Classification']!='unclassified'

    classes = df[mask]['Google Classification'].apply(lambda x: str(x).split("/")[1])

    print(f"Found {len(classes)} classified results!")

    category_classes = classes[classes==lookup_category]
    category_dialogues = df.iloc[category_classes.index]
    n_classified = len(category_classes)
    print(f"Found {len(category_classes)} results classified with '{lookup_category}'!")

    sample_size = 80

    if sample_size>n_classified:
        print(f"The sample size {sample_size} is bigger than the # classified results!")
        print(f"The sample size will be limited down to {n_classified} from this dataset.")

        sample_size = n_classified

    sample_rows = random.sample(range(n_classified), sample_size)
    sample_dialogues = category_dialogues.iloc[sample_rows]

    openAI_sample += sample_dialogues['dialogue'].to_list()
    print()

Getting samples from 'DialogSum'... 
Found 6531 classified results!
Found 741 results classified with 'Food & Drink'!

Getting samples from 'DailyDialog'... 
Found 5310 classified results!
Found 619 results classified with 'Food & Drink'!

Getting samples from 'Cornell Movie--Dialogs Corpus'... 
Found 22676 classified results!
Found 385 results classified with 'Food & Drink'!

Getting samples from 'Commonsense-Dialogues'... 
Found 3234 classified results!
Found 256 results classified with 'Food & Drink'!

Getting samples from 'EmpatheticDialogues'... 
Found 7512 classified results!
Found 475 results classified with 'Food & Drink'!

Getting samples from 'MultiWOZ 2.2'... 
Found 7369 classified results!
Found 2854 results classified with 'Food & Drink'!

Getting samples from 'Taskmaster-2'... 
Found 14966 classified results!
Found 3535 results classified with 'Food & Drink'!



In [25]:
len(openAI_sample)

560

In [26]:
random.seed(SEED)
def preprocess_dialogue(dialogue):
    """
        Splits each dialouge into a prompt and a completion samples.
        Returns a list of dictionaries.
        
        # {"prompt": "<prompt text>", "completion": "<ideal generated text>"}

    """
    turns = dialogue.split("\n")

    jsonl = []
    prev = ""
    for turn in turns:
        # remove '#Person1#: ' and '#Person2#: ' 
        # turn = turn[11:]
        # r = random.randrange(10, 50)
        r = 1
        for i in range(1, len(turn), r):
            turn_dict = {"prompt": prev + turn[:i], 
                         "completion": turn}

            jsonl.append(turn_dict)

        prev += turn + "\n"
    return jsonl

preprocessed_openAI_sample = sum([preprocess_dialogue(d) for d in openAI_sample], [])
len(preprocessed_openAI_sample)

337607

In [27]:
openAI_sample[0].split("\n")

['#Person1#: I have noticed that a lot of people around here are very healthy and hard working. I thought all Americans just ate hamburgers and pizza and sat in front of the TV all day long.',
 '#Person2#: Huh... well, our culture and society has become a lot more health conscious than it was 10 years ago.',
 '#Person1#: I have heard a lot of people talking about organic foods and even growing their own vegetables.',
 '#Person2#: Yes, things are different now. I think most people are just more aware than before whether it is about themselves or the world. There is so much information out there. People are less ignorant than they were before.',
 '#Person1#: More attention has been drawn toward protecting the environment.',
 "#Person2#: I agree, but there are also many problems that we still need to overcome. Let's not get into that.",
 '#Person1#: Focus on the positive, right? So what do you do to play your part in a healthy society?',
 '#Person2#: I try to avoid driving my car as much 

In [28]:
preprocess_dialogue(openAI_sample[0])[:10]

[{'completion': '#Person1#: I have noticed that a lot of people around here are very healthy and hard working. I thought all Americans just ate hamburgers and pizza and sat in front of the TV all day long.',
  'prompt': '#'},
 {'completion': '#Person1#: I have noticed that a lot of people around here are very healthy and hard working. I thought all Americans just ate hamburgers and pizza and sat in front of the TV all day long.',
  'prompt': '#P'},
 {'completion': '#Person1#: I have noticed that a lot of people around here are very healthy and hard working. I thought all Americans just ate hamburgers and pizza and sat in front of the TV all day long.',
  'prompt': '#Pe'},
 {'completion': '#Person1#: I have noticed that a lot of people around here are very healthy and hard working. I thought all Americans just ate hamburgers and pizza and sat in front of the TV all day long.',
  'prompt': '#Per'},
 {'completion': '#Person1#: I have noticed that a lot of people around here are very healt

In [29]:
final_sample = preprocessed_openAI_sample

final_sample = random.sample(preprocessed_openAI_sample, 10_000)

obj = {"sample":final_sample}

os.makedirs("OpenAI Samples", exist_ok=True)

with open(f"OpenAI Samples/OpenAI_sample[{lookup_category}].json", "w+") as fp:
    json.dump(obj, fp)