In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%cd "/content/drive/MyDrive/Graduation Project/salah datasets/"
%ls

/content/drive/MyDrive/Graduation Project/salah datasets
'Copy of GP_Data.ipynb'   dialogsum.train.jsonl     [0m[01;36m'Graduation Project'[0m@
 dataset_train.json       dialogues_test.txt         Untitled0.ipynb
 dialogsum.dev.jsonl      dialogues_train.txt
 dialogsum.test.jsonl     dialogues_validation.txt


# Setting up the environment

In [3]:
import collections
import json
import os

import numpy as np
import pandas as pd
import requests

In [4]:
def get_filename_from_headers(url, headers):
    try:
        filename = headers["content-disposition"]
        idx = filename.find("=")
        filename = filename[idx+1:]
    except Exception as E:
        filename = url.split("/")[-1]

    return filename

def download_file(url, dest, override=False):
    res = requests.get(url)

    if (res.status_code != 200):
        print(f"Error! Couldn't download from this url={url}")
        return

    filename = get_filename_from_headers(url, res.headers)    
    filepath = os.path.join(dest, filename)

    if (not os.path.exists(filepath)) or (override):
        if (override):
            print(f"File '{filename}' exists! Overriding.. ", end="")
        else:
            print(f"Downloading '{filename}'.. ", end="")

        with open(filepath, "wb+") as fh:
            fh.write(res.content)
        print("Done!")

    else:
        print(f"File '{filename}' exists! Enable override to override it.")

    return filename

# Loading the first dataset

In [5]:
urls = ['https://raw.githubusercontent.com/cylnlp/dialogsum/main/DialogSum_Data/dialogsum.train.jsonl',
        'https://raw.githubusercontent.com/cylnlp/dialogsum/main/DialogSum_Data/dialogsum.test.jsonl',
        'https://raw.githubusercontent.com/cylnlp/dialogsum/main/DialogSum_Data/dialogsum.dev.jsonl',
]

files = []

for url in urls:
    files.append(download_file(url, "."))

%ls

File 'dialogsum.train.jsonl' exists! Enable override to override it.
File 'dialogsum.test.jsonl' exists! Enable override to override it.
File 'dialogsum.dev.jsonl' exists! Enable override to override it.
'Copy of GP_Data.ipynb'   dialogsum.train.jsonl     [0m[01;36m'Graduation Project'[0m@
 dataset_train.json       dialogues_test.txt         Untitled0.ipynb
 dialogsum.dev.jsonl      dialogues_train.txt
 dialogsum.test.jsonl     dialogues_validation.txt


In [6]:
def load_data(filename):
    data = []
    with open(filename, "r") as fh:
        lines = fh.read().splitlines()
        for line in lines:
            data.append(json.loads(line))

    return data

data = []
for filename in files:
    data += load_data(filename)

# Inspecting & cleaning the first dataset

In [7]:
len(data)

13460

In [8]:
keys = set()
for row in data:
    keys |= set(list(row.keys()))

keys = list(keys)
keys

['topic3',
 'summary',
 'summary2',
 'topic1',
 'fname',
 'dialogue',
 'summary3',
 'topic',
 'topic2',
 'summary1']

In [9]:
# just reordering
keys = ['topic', 
        'topic1',
        'topic2', 
        'topic3',
        'summary', 
        'summary1', 
        'summary2', 
        'summary3', 
        'dialogue', 
        'fname'
]

In [10]:
data[0]

{'dialogue': "#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. Why are you here today?\n#Person2#: I found it would be a good idea to get a check-up.\n#Person1#: Yes, well, you haven't had one for 5 years. You should have one every year.\n#Person2#: I know. I figure as long as there is nothing wrong, why go see the doctor?\n#Person1#: Well, the best way to avoid serious illnesses is to find out about them early. So try to come at least once a year for your own good.\n#Person2#: Ok.\n#Person1#: Let me see here. Your eyes and ears look fine. Take a deep breath, please. Do you smoke, Mr. Smith?\n#Person2#: Yes.\n#Person1#: Smoking is the leading cause of lung cancer and heart disease, you know. You really should quit.\n#Person2#: I've tried hundreds of times, but I just can't seem to kick the habit.\n#Person1#: Well, we have classes and some medications that might help. I'll give you more information before you leave.\n#Person2#: Ok, thanks doctor.",
 'fname': 'train_0',
 'summary': "Mr. Smi

In [11]:
data[-1000]

 'fname': 'test_0',
 'summary1': 'Ms. Dawson helps #Person1# to write a memo to inform every employee that they have to change the communication method and should not use Instant Messaging anymore.',
 'summary2': 'In order to prevent employees from wasting time on Instant Message programs, #Person1# decides to terminate the use of those programs and asks Ms. Dawson to send out a memo to all employees by the afternoon.',
 'summary3': 'Ms. Dawson takes a dictation for #Person1# about prohibiting the use of Instant Message programs in the office. They argue about its reasonability but #Person1# still insists.',
 'topic1': 'communication method',
 'topic2': 'company policy',
 'topic3': 'dictation'}

In [12]:
data[-1]

{'dialogue': "#Person1#: I can't believe it's almost summer.\n#Person2#: Yeah, I know. The year went really fast.\n#Person1#: What are you going to do this summer vacation?\n#Person2#: I'm going to work for a company.\n#Person1#: A company? What's that? What do you do?\n#Person2#: We help at parties. Our company prepares and serves the food, and usually another company provides the music.\n#Person1#: I didn't know you could cook.\n#Person2#: I don't have to cook. I'm only an assistant.\n#Person1#: When do you start?\n#Person2#: Tomorrow. We're helping a birthday party. Then a big family get together.\n#Person1#: So, what exactly do you do?\n#Person2#: Before the party starts, I help set everything up. You know, bring the food in and arrange the tables, make sure it looks nice.\n#Person1#: Sounds pretty easy.\n#Person2#: That's only the first part. During the party, I have to serve food and drinks to the guests.\n#Person1#: Well, at least you get to meet people.\n#Person2#: Yeah, and af

In [13]:
list_of_data = [[d.get(key, np.nan) for key in keys] for d in data]

dataset1_df = pd.DataFrame(data=list_of_data)
dataset1_df.columns = keys
dataset1_df

Unnamed: 0,topic,topic1,topic2,topic3,summary,summary1,summary2,summary3,dialogue,fname
0,get a check-up,,,,"Mr. Smith's getting a check-up, and Doctor Haw...",,,,"#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. ...",train_0
1,vaccines,,,,Mrs Parker takes Ricky for his vaccines. Dr. P...,,,,"#Person1#: Hello Mrs. Parker, how have you bee...",train_1
2,find keys,,,,#Person1#'s looking for a set of keys and asks...,,,,"#Person1#: Excuse me, did you see a set of key...",train_2
3,have a girlfriend,,,,#Person1#'s angry because #Person2# didn't tel...,,,,#Person1#: Why didn't you tell me you had a gi...,train_3
4,dance,,,,Malik invites Nikki to dance. Nikki agrees if ...,,,,"#Person1#: Watsup, ladies! Y'll looking'fine t...",train_4
...,...,...,...,...,...,...,...,...,...,...
13455,the new year,,,,#Person1# decides to stop smoking and come out...,,,,"#Person1#: Now that it's the new year, I've de...",dev_495
13456,fall in love,,,,#Person1# thought #Person2# married Joe. #Pers...,,,,"#Person1#: You married Joe, didn't you? \n#Per...",dev_496
13457,noises,,,,#Person2#'s car makes noises. #Person1# thinks...,,,,#Person1#: How can I help you mam?\n#Person2#:...,dev_497
13458,a missing page,,,,#Person2# calls Amazon's customer service beca...,,,,"#Person1#: Hello, Amazon's customer service. H...",dev_498


In [14]:
dataset1_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13460 entries, 0 to 13459
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   topic     12960 non-null  object
 1   topic1    500 non-null    object
 2   topic2    500 non-null    object
 3   topic3    500 non-null    object
 4   summary   12960 non-null  object
 5   summary1  500 non-null    object
 6   summary2  500 non-null    object
 7   summary3  500 non-null    object
 8   dialogue  13460 non-null  object
 9   fname     13460 non-null  object
dtypes: object(10)
memory usage: 1.0+ MB


In [15]:
dataset1_df.describe()

Unnamed: 0,topic,topic1,topic2,topic3,summary,summary1,summary2,summary3,dialogue,fname
count,12960,500,500,500,12960,500,500,500,13460,13460
unique,7695,449,450,457,12936,499,499,499,13458,13460
top,shopping,job interview,shopping,shopping,#Person1# thinks Tom is too old to be trick-or...,#Person1# and Mike are discussing what kind of...,#Person1# and Mike have a disagreement on how ...,"#Person1# wants Mike to act more angry, but Mi...",#Person1#: Any plans tonight? \n#Person2#: Not...,train_3064
freq,178,6,9,6,2,2,2,2,2,1


In [16]:
dataset1_df[dataset1_df['topic']=='shopping'].iloc[0]['dialogue']

'#Person1#: Ten sheets of rice paper, 25 brushes, two boxes of oil color and two boxes of water color. All these come up to $ 35. 50, sir.\n#Person2#: Ok, here is $ 50. Oh, can you make out an invoice for me?\n#Person1#: Sure, just a minute. Are you an artist, sir?\n#Person2#: No, I am a teacher. I teach art.\n#Person1#: That must be a very interesting job.\n#Person2#: It is. You must be new here. I do my shopping here regularly, once a week.\n#Person1#: Do you? Nice to meet you! And here is the invoice and your change.\n#Person2#: Thank you. Nice to meet you, too.'

In [17]:
dataset1_df[dataset1_df['topic']=='shopping'].iloc[1]['dialogue'].replace("#Person1#:", "").replace("#Person2#:", "")

" What is your policy on returns?\n If you bring them back with your receipt within seven days, you'll get a full refund. Also, if you need alterations to the pants, just bring them in and we'll do them for free.\n Sounds good.\n That'll be $ 70, please. Will you be paying by cash or charge?\n Charge. Here's my card.\n Okay, sir, here you are. Thank you very much.\n Thank you."

# Loading the second dataset

In [18]:
files2 = ["dialogues_test.txt", "dialogues_train.txt", "dialogues_validation.txt"]

data2 = []
for filename in files2:
    with open(filename, "r") as fh:
          lines = fh.read().splitlines()
          for line in lines:
            data2.append(line)

len(data)

13460

In [19]:
data2[5]

'how long will it take us to drive to London ? __eou__ I think it ’ s a distance of 180 kilometers from here to London , so it should be a two-hour drive on the motorway . __eou__ that ’ s unless there is a traffic jam . It could take three hours . __eou__ you ’ re right . We will be able to travel at high speeds at the beginning and end of the journey , because we will be in built-up areas . __eou__ so , shall we allow three hours to cover the distance ? __eou__ ok . You haven ’ t seen my company car , have you ? __eou__ no . let me take a look ... it ’ s longer than my car . __eou__ I think it ’ s over five meters long . I can ’ t remember exactly . It has a maximum speed of over 200 kilometers an hour . __eou__ wow ! That ’ s fast ! I don ’ t think we will be traveling that fast on the motorway . __eou__ we can ’ t . if we went that fast , we would break the speed limit . __eou__'

In [20]:
lines = [line.strip() for line in data2[5].split('__eou__')]
lines

['how long will it take us to drive to London ?',
 'I think it ’ s a distance of 180 kilometers from here to London , so it should be a two-hour drive on the motorway .',
 'that ’ s unless there is a traffic jam . It could take three hours .',
 'you ’ re right . We will be able to travel at high speeds at the beginning and end of the journey , because we will be in built-up areas .',
 'so , shall we allow three hours to cover the distance ?',
 'ok . You haven ’ t seen my company car , have you ?',
 'no . let me take a look ... it ’ s longer than my car .',
 'I think it ’ s over five meters long . I can ’ t remember exactly . It has a maximum speed of over 200 kilometers an hour .',
 'wow ! That ’ s fast ! I don ’ t think we will be traveling that fast on the motorway .',
 'we can ’ t . if we went that fast , we would break the speed limit .',
 '']

In [21]:
lines = [line for line in lines if line]
lines

['how long will it take us to drive to London ?',
 'I think it ’ s a distance of 180 kilometers from here to London , so it should be a two-hour drive on the motorway .',
 'that ’ s unless there is a traffic jam . It could take three hours .',
 'you ’ re right . We will be able to travel at high speeds at the beginning and end of the journey , because we will be in built-up areas .',
 'so , shall we allow three hours to cover the distance ?',
 'ok . You haven ’ t seen my company car , have you ?',
 'no . let me take a look ... it ’ s longer than my car .',
 'I think it ’ s over five meters long . I can ’ t remember exactly . It has a maximum speed of over 200 kilometers an hour .',
 'wow ! That ’ s fast ! I don ’ t think we will be traveling that fast on the motorway .',
 'we can ’ t . if we went that fast , we would break the speed limit .']

In [22]:
data2 = [[line.strip() for line in dialogue.split('__eou__') if line] for dialogue in data2]
data2 = [[f"#Person{i%2+1}#:{line}" for i, line in enumerate(dialogue)] for dialogue in data2]
data2 = ['\n'.join(dialogue) for dialogue in data2]
data2[:4]

['#Person1#:Hey man , you wanna buy some weed ?\n#Person2#:Some what ?\n#Person1#:Weed ! You know ? Pot , Ganja , Mary Jane some chronic !\n#Person2#:Oh , umm , no thanks .\n#Person1#:I also have blow if you prefer to do a few lines .\n#Person2#:No , I am ok , really .\n#Person1#:Come on man ! I even got dope and acid ! Try some !\n#Person2#:Do you really have all of these drugs ? Where do you get them from ?\n#Person1#:I got my connections ! Just tell me what you want and I ’ ll even give you one ounce for free .\n#Person2#:Sounds good ! Let ’ s see , I want .\n#Person1#:Yeah ?\n#Person2#:I want you to put your hands behind your head ! You are under arrest !',
 '#Person1#:The taxi drivers are on strike again .\n#Person2#:What for ?\n#Person1#:They want the government to reduce the price of the gasoline .\n#Person2#:It is really a hot potato .',
 "#Person1#:We've managed to reduce our energy consumption in our factory by about 15 per cent in the last two years .\n#Person2#:That's excel

In [23]:
dataset2_df = pd.DataFrame(data=data2)
dataset2_df.columns = ['dialogue']

dataset2_df

Unnamed: 0,dialogue
0,"#Person1#:Hey man , you wanna buy some weed ?\..."
1,#Person1#:The taxi drivers are on strike again...
2,#Person1#:We've managed to reduce our energy c...
3,"#Person1#:Believe it or not , tea is the most ..."
4,#Person1#:What are your personal weaknesses ?\...
...,...
13113,"#Person1#:Hello , who is speaking ?\n#Person2#..."
13114,#Person1#:Ahh ... What a fine day ! I do feel ...
13115,"#Person1#:I'm so sorry about your brother , Mr..."
13116,"#Person1#:Hi , Jeny.Are still working ?\n#Pers..."


# Merging the two datasets

In [24]:
dataset_df = dataset1_df.append(dataset2_df)

dataset_df

Unnamed: 0,topic,topic1,topic2,topic3,summary,summary1,summary2,summary3,dialogue,fname
0,get a check-up,,,,"Mr. Smith's getting a check-up, and Doctor Haw...",,,,"#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. ...",train_0
1,vaccines,,,,Mrs Parker takes Ricky for his vaccines. Dr. P...,,,,"#Person1#: Hello Mrs. Parker, how have you bee...",train_1
2,find keys,,,,#Person1#'s looking for a set of keys and asks...,,,,"#Person1#: Excuse me, did you see a set of key...",train_2
3,have a girlfriend,,,,#Person1#'s angry because #Person2# didn't tel...,,,,#Person1#: Why didn't you tell me you had a gi...,train_3
4,dance,,,,Malik invites Nikki to dance. Nikki agrees if ...,,,,"#Person1#: Watsup, ladies! Y'll looking'fine t...",train_4
...,...,...,...,...,...,...,...,...,...,...
13113,,,,,,,,,"#Person1#:Hello , who is speaking ?\n#Person2#...",
13114,,,,,,,,,#Person1#:Ahh ... What a fine day ! I do feel ...,
13115,,,,,,,,,"#Person1#:I'm so sorry about your brother , Mr...",
13116,,,,,,,,,"#Person1#:Hi , Jeny.Are still working ?\n#Pers...",


In [26]:
dataset_df.to_csv("merged_dataset.csv")
%ls

'Copy of GP_Data.ipynb'   dialogsum.train.jsonl     [0m[01;36m'Graduation Project'[0m@
 dataset_train.json       dialogues_test.txt         merged_dataset.csv
 dialogsum.dev.jsonl      dialogues_train.txt        Untitled0.ipynb
 dialogsum.test.jsonl     dialogues_validation.txt
