In [1]:
#@title # Setting up the environment { vertical-output: true, display-mode: "form" }

###################
#####  SETUP  #####
###################

#@title Setting up project paths
import os

# Colab form fields: toggle Google Drive mounting and set the project root directory.
colab_setup = False #@param {type:"boolean"}
PROJECT_PATH = "/content/drive/MyDrive/TWM/Graduation-Project/" #@param {"type":"string"}

# When running on Colab, mount Google Drive so that PROJECT_PATH (located under
# /content/drive) becomes reachable.
if colab_setup:
    from google.colab import drive
    print("Mounting Google Drive...", end="", flush=True)
    drive.mount('/content/drive')
    print("Done")

# NOTE(review): for a local (non-Colab) run PROJECT_PATH must point at the project's
# root directory; the disabled branch below is one way to set it on Windows.
# else:
#     # set this to the parent directory of the whole project
#     PROJECT_PATH = rf"C:\Users\{os.environ['USERNAME']}\Graduation-Project"

# Work from the project root so that relative paths (e.g. PATH_TO_DATA) resolve.
print("PROJECT_PATH:", PROJECT_PATH)
os.chdir(PROJECT_PATH)
# NOTE(review): the return value is neither stored nor the cell's last expression,
# so this listing is not displayed.
os.listdir()

PATH_TO_DATA = "DataEngineering/FinalDataset/large/"

import pandas as pd

# Paths of the three dataset splits, relative to the current working directory.
splits = [PATH_TO_DATA + split_name for split_name in ('train.csv', 'dev.csv', 'test.csv')]

# Read every split and stack the rows into one frame, then drop the
# "Unnamed: 0" column that comes with the CSV files.
df = pd.concat(
    [pd.read_csv(split_path) for split_path in splits],
    axis=0,
).drop(columns=["Unnamed: 0"])

# Topic labels ordered by their number of dialogues, most frequent first.
TOPICS = (
    df.groupby('topic')['dialogue']
    .count()
    .sort_values(ascending=False)
    .index
)
# Grouping keys used when counting dialogues per category.
COLS = ['topic', 'subtopic1', 'subtopic2']

def print_dialogues(df, categorized_df, n_topics_per_row=3):
    """Print sample dialogues for every category listed in ``categorized_df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``topic``, ``subtopic1``, ``subtopic2`` and ``dialogue``
        columns.
    categorized_df : pandas.DataFrame
        Frame whose index entries unpack to ``(topic, subtopic1, subtopic2)``.
        Categories are visited in index order.
    n_topics_per_row : int, default 3
        Upper bound on the number of dialogues printed per category (despite
        the name, it limits dialogues, not topics).

    Returns
    -------
    None
        Everything is written to standard output.
    """
    dialogue_rule = "--" * 20   # printed after each dialogue
    category_rule = "++" * 60   # printed after each category

    for topic, subtopic1, subtopic2 in categorized_df.index:
        # Rows of ``df`` that carry exactly this (topic, subtopic1, subtopic2) label.
        in_category = (
            df["topic"].eq(topic)
            & df["subtopic1"].eq(subtopic1)
            & df["subtopic2"].eq(subtopic2)
        )
        header = f"Topic    : {topic}, \nSubtopic1: {subtopic1}, \nSubtopic2: {subtopic2}\n"

        # The header is repeated above every dialogue so each sample is self-describing.
        for dialogue in df.loc[in_category, "dialogue"].iloc[:n_topics_per_row]:
            print(header)
            print(dialogue)
            print(dialogue_rule)

        print(category_rule)

# Names of the source corpora present in the combined frame.
print(df["dataset"].unique())
# Last expression of the cell: rich preview of the combined frame.
df

PROJECT_PATH: C:\Users\LAPTOP\Graduation-Project
['Schema-Guided Dialogue' 'MultiWOZ 2.2' 'Taskmaster-2'
 'Cornell Movie--Dialogs Corpus' 'MetaLWOZ' 'Taskmaster-3' 'Taskmaster-1'
 'MSR-E2E' 'EmpatheticDialogues' 'DialogSum' 'DailyDialog'
 'Commonsense-Dialogues']


Unnamed: 0,dialogue,dataset,topic,subtopic1,subtopic2
0,#Person1#:Find a general practitioner.\r\n#Per...,Schema-Guided Dialogue,Food & Drink,Restaurants,unknown
1,#Person1#:I'm looking for a restaurant in the ...,MultiWOZ 2.2,Food & Drink,Restaurants,unknown
2,#Person1#:I'd like help finding a hotel in Nas...,Taskmaster-2,Travel,Hotels & Accommodations,unknown
3,#Person1#: Do you know somebody called the The...,Cornell Movie--Dialogs Corpus,Arts & Entertainment,Music & Audio,unknown
4,#Person1#:I'm looking for a British restaurant...,MultiWOZ 2.2,Food & Drink,Restaurants,unknown
...,...,...,...,...,...
18499,#Person1#: Rangers... I'm afraid you're too l...,Cornell Movie--Dialogs Corpus,Arts & Entertainment,unknown,unknown
18500,"#Person1#:What are the movie times for ""No Tim...",Taskmaster-3,Arts & Entertainment,Movies,unknown
18501,#Person1#: It isn't a secret. She plays with m...,Cornell Movie--Dialogs Corpus,Arts & Entertainment,unknown,unknown
18502,#Person1#:Hey there. I would like to see a mov...,Taskmaster-3,Arts & Entertainment,Movies,unknown


In [2]:
#@title Environment Watermark
# Record who ran the notebook and in which environment, next to the results.
%load_ext watermark
# Author / contact metadata.
%watermark --author "Mohamed Hisham" --email "Mohamed00Hisham@gmail.com" --github_username "Mhmd-Hisham"
# No arguments: timestamp plus Python and machine details (see the cell output).
%watermark
# Versions of the modules imported in this session.
%watermark --iversions

Author: Mohamed Hisham

Github username: Mhmd-Hisham

Email: Mohamed00Hisham@gmail.com

Last updated: 2022-09-28T14:53:57.478483+02:00

Python implementation: CPython
Python version       : 3.9.5
IPython version      : 8.5.0

Compiler    : MSC v.1928 64 bit (AMD64)
OS          : Windows
Release     : 10
Machine     : AMD64
Processor   : Intel64 Family 6 Model 165 Stepping 2, GenuineIntel
CPU cores   : 12
Architecture: 64bit

pandas: 1.5.0
sys   : 3.9.5 (tags/v3.9.5:0a7dcbd, May  3 2021, 17:27:52) [MSC v.1928 64 bit (AMD64)]



# Arts & Entertainment

In [3]:
#@markdown Subtopics
idx = 0  # position in TOPICS, which is ordered by dialogue count (0 = largest topic)

# Dialogue counts per (topic, subtopic1, subtopic2) combination inside the
# selected topic, largest group first.
categorized_df = (
    df.loc[df["topic"] == TOPICS[idx]]
    .groupby(COLS)[["dialogue"]]
    .count()
    .sort_values(by="dialogue", ascending=False)
)
categorized_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,dialogue
topic,subtopic1,subtopic2,Unnamed: 3_level_1
Arts & Entertainment,Movies,unknown,30758
Arts & Entertainment,unknown,unknown,26684
Arts & Entertainment,Music & Audio,unknown,5364
Arts & Entertainment,Humor,unknown,1013
Arts & Entertainment,Music & Audio,Rock Music,888
Arts & Entertainment,Events & Listings,unknown,769
Arts & Entertainment,TV & Video,TV Shows & Programs,399
Arts & Entertainment,Performing Arts,unknown,283
Arts & Entertainment,Music & Audio,Urban & Hip-Hop,259
Arts & Entertainment,TV & Video,unknown,226


In [4]:
#@markdown Dialogues

# Print up to three sample dialogues for each category row of categorized_df.
print_dialogues(df, categorized_df, n_topics_per_row=3)

Topic    : Arts & Entertainment, 
Subtopic1: Movies, 
Subtopic2: unknown

#Person1#:I would really like to go to the movies tonight
#Person2#:Sure. What movie would you like to see?
#Person1#:Not sure what I want to watch, do you know what movies are currently playing in my city?
#Person2#:No problem. Can you tell me what city you're in?
#Person1#:Sure, New York
#Person2#:Ok. There are a couple of popular films out in your area: No Time To Die and Emma. Do either of those sound interesting?
#Person1#:I've watched the movie No Time To Die, but I did not watch Emma, can you tell me more about the film?
#Person2#:OK. Here's the description of Emma: In this adaptation of Jane Austen's classic novel, pretty socialite Emma Woodhouse, played by Gwyneth Paltrow, entertains herself by playing matchmaker for those around her.
#Person1#:The movie sounds interesting, I want to watch it, can you please give me two tickets to watch Emma/
#Person2#:OK. Emma is playing at AMC Mercado 24 today at 5:00p

# Food & Drink

In [5]:
#@markdown Subtopics
idx = 1  # position in TOPICS, which is ordered by dialogue count (0 = largest topic)

# Dialogue counts per (topic, subtopic1, subtopic2) combination inside the
# selected topic, largest group first.
categorized_df = (
    df.loc[df["topic"] == TOPICS[idx]]
    .groupby(COLS)[["dialogue"]]
    .count()
    .sort_values(by="dialogue", ascending=False)
)
categorized_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,dialogue
topic,subtopic1,subtopic2,Unnamed: 3_level_1
Food & Drink,Restaurants,unknown,10885
Food & Drink,unknown,unknown,3893
Food & Drink,Restaurants,Pizzerias,2156
Food & Drink,Beverages,Coffee & Tea,1660
Food & Drink,Cooking & Recipes,unknown,1488
Food & Drink,Food,unknown,400
Food & Drink,Restaurants,Fast Food,322
Food & Drink,Beverages,unknown,319
Food & Drink,Food,Meat & Seafood,214
Food & Drink,Beverages,Alcoholic Beverages,184


In [6]:
#@markdown Dialogues

# Print up to three sample dialogues for each category row of categorized_df.
print_dialogues(df, categorized_df, n_topics_per_row=3)


Topic    : Food & Drink, 
Subtopic1: Restaurants, 
Subtopic2: unknown

#Person1#:Find a general practitioner.
#Person2#:Where?
#Person1#:In Napa.
#Person2#:I found 9. Alex De Moraes, MD is in Napa and a general practitioner.
#Person1#:Ok.
#Person2#:Do you want to visit them?
#Person1#:No, get a cab there now for 1, to 935 Trancas Street.
#Person2#:Ride type?
#Person1#:Regular.
#Person2#:Please confirm: regular ride for 1 to 935 Trancas Street.
#Person1#:Yes.
#Person2#:Your ride is on its way.
#Person1#:What's the cost and when will it arrive?
#Person2#:It's $23.54 and will arrive in 7 minutes.
#Person1#:Thanks, find a restaurant near there.
#Person2#:Cuisine? Do you like Mexican, Chinese, something else?
#Person1#:Mexican is good.
#Person2#:Don Perico Mexican Restaurant is in Napa.
#Person1#:Good, make a reservation for 17:15 on the 13th of March.
#Person2#:Please confirm: table for 2 at Don Perico Mexican restaurant in Napa at 5:15 pm on March 13th.
#Person1#:Make it 7 pm for 1.
#Pers

# Travel

In [7]:
#@markdown Subtopics
idx = 2  # position in TOPICS, which is ordered by dialogue count (0 = largest topic)

# Dialogue counts per (topic, subtopic1, subtopic2) combination inside the
# selected topic, largest group first.
categorized_df = (
    df.loc[df["topic"] == TOPICS[idx]]
    .groupby(COLS)[["dialogue"]]
    .count()
    .sort_values(by="dialogue", ascending=False)
)
categorized_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,dialogue
topic,subtopic1,subtopic2,Unnamed: 3_level_1
Travel,unknown,unknown,7440
Travel,Air Travel,unknown,5919
Travel,Hotels & Accommodations,unknown,4449
Travel,Bus & Rail,unknown,983
Travel,Car Rental & Taxi Services,unknown,924
Travel,Tourist Destinations,unknown,458
Travel,Tourist Destinations,Theme Parks,172
Travel,Tourist Destinations,Beaches & Islands,80
Travel,Tourist Destinations,Mountain & Ski Resorts,31
Travel,Tourist Destinations,Zoos-Aquariums-Preserves,20


In [8]:
#@markdown Dialogues

# Print up to three sample dialogues for each category row of categorized_df.
print_dialogues(df, categorized_df, n_topics_per_row=3)


Topic    : Travel, 
Subtopic1: unknown, 
Subtopic2: unknown

#Person1#:I would like to find a bus.
#Person2#:Where are you leaving from and going? How many tickets would you like?
#Person1#:I'm leaving from Anaheim, CA going to Sacramento, we are four people.
#Person2#:When do you want to leave and at what time?
#Person1#:I would like to leave on the 2nd of March at a quarter to 2 in the afternoon.
#Person2#:Booking a ticket on bus leaving from Anaheim to Sacramento on March 2nd at 1:45 pm for 4 people.
#Person1#:Yes, that is correct.
#Person2#:I was unable to book it for that time, would you like to leave at 1:50 pm for the cost of $58?
#Person1#:Yes, that works for me.
#Person2#:Your ticket has been purchased.
#Person1#:How many transfers does the trip have and what is the station I am leaving from?
#Person2#:There is 1 transfer and you are leaving from Anaheim Intermodal Center.
#Person1#:Can you find me a compact car in Sacramento?
#Person2#:Do you want the car on March 2nd and at 

# Sports

In [9]:
#@markdown Subtopics
idx = 3  # position in TOPICS, which is ordered by dialogue count (0 = largest topic)

# Dialogue counts per (topic, subtopic1, subtopic2) combination inside the
# selected topic, largest group first.
categorized_df = (
    df.loc[df["topic"] == TOPICS[idx]]
    .groupby(COLS)[["dialogue"]]
    .count()
    .sort_values(by="dialogue", ascending=False)
)
categorized_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,dialogue
topic,subtopic1,subtopic2,Unnamed: 3_level_1
Sports,Team Sports,Baseball,1346
Sports,Team Sports,Soccer,1072
Sports,Team Sports,unknown,1024
Sports,unknown,unknown,879
Sports,Team Sports,Basketball,560
Sports,Team Sports,American Football,495
Sports,Sporting Goods,unknown,196
Sports,Individual Sports,unknown,167
Sports,Individual Sports,Cycling,101
Sports,Winter Sports,Skiing & Snowboarding,68


In [10]:
#@markdown Dialogues

# Print up to three sample dialogues for each category row of categorized_df.
print_dialogues(df, categorized_df, n_topics_per_row=3)


Topic    : Sports, 
Subtopic1: Team Sports, 
Subtopic2: Baseball

#Person1#:Hi.
#Person2#:What can I do for you?
#Person1#:I just want you to know that I'm an English Premier soccer fan and my favorite team is Crystal Palace. I would like to know how is Crystal Palace doing? What place are they in?
#Person2#:Unfortunately, Crystal Palace is in 19th Place, they are in the second last place in the Premier League standings.
#Person1#:Oh, no. That is not good news. Can you tell me if they're playing right now?
#Person2#:Sure, they are not playing right now but the next game they will be playing against the Burnley F.C on September 10th at 5:30 AM.
#Person1#:Okay, great. How did they do in their last game?
#Person2#:In their last game, they actually lost against Swansea City FC.
#Person1#:I'm happy to hear about that. I would like to know, who is their striker?
#Person2#:Their Striker is Christian benteke.
#Person1#:Great, thank you very much.
#Person2#:Have a great day.
#Person1#:You, too.

# People & Society

In [11]:
#@markdown Subtopics
idx = 4  # position in TOPICS, which is ordered by dialogue count (0 = largest topic)

# Dialogue counts per (topic, subtopic1, subtopic2) combination inside the
# selected topic, largest group first.
categorized_df = (
    df.loc[df["topic"] == TOPICS[idx]]
    .groupby(COLS)[["dialogue"]]
    .count()
    .sort_values(by="dialogue", ascending=False)
)
categorized_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,dialogue
topic,subtopic1,subtopic2,Unnamed: 3_level_1
People & Society,Religion & Belief,unknown,1163
People & Society,unknown,unknown,691
People & Society,Family & Relationships,Marriage,486
People & Society,Family & Relationships,unknown,481
People & Society,Family & Relationships,Family,203
People & Society,Kids & Teens,Children's Interests,160
People & Society,Family & Relationships,Troubled Relationsh,101
People & Society,Social Issues & Advocacy,unknown,44
People & Society,Subcultures & Niche Interests,unknown,28
People & Society,Social Issues & Advocacy,Charity & Philant,26


In [12]:
#@markdown Dialogues

# Print up to three sample dialogues for each category row of categorized_df.
print_dialogues(df, categorized_df, n_topics_per_row=3)


Topic    : People & Society, 
Subtopic1: Religion & Belief, 
Subtopic2: unknown

#Person1#:I was joyful because my grandad shared the gospel with me when I was 11.
#Person2#:You mean nostalgic?
#Person1#:I guess I am nostalgic now.
#Person2#:What your grand father did to you was very special.
#Person1#:He read a tract to me. He explained God's love for me with pictures in a way I could understand.
----------------------------------------
Topic    : People & Society, 
Subtopic1: Religion & Belief, 
Subtopic2: unknown

#Person1#: I'm out of prac...  ... practice.
#Person2#: All right?
#Person1#: Fine thanks...
#Person2#: Do you suppose the saints would have smoked if tobacco had been popular back then?
#Person1#: Undoubtedly. Not the ascetics of course but, well Saint Thomas More...
#Person2#: Long, thin and filtered.
#Person1#: Saint Ignatius would smoke cigars and stub them out on the soles of his bare feet.  And of course
#Person2#: Hand rolled.
#Person1#: Even Christ would partake so

# Autos & Vehicles

In [13]:
#@markdown Subtopics
idx = 5  # position in TOPICS, which is ordered by dialogue count (0 = largest topic)

# Dialogue counts per (topic, subtopic1, subtopic2) combination inside the
# selected topic, largest group first.
categorized_df = (
    df.loc[df["topic"] == TOPICS[idx]]
    .groupby(COLS)[["dialogue"]]
    .count()
    .sort_values(by="dialogue", ascending=False)
)
categorized_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,dialogue
topic,subtopic1,subtopic2,Unnamed: 3_level_1
Autos & Vehicles,unknown,unknown,1942
Autos & Vehicles,Vehicle Parts & Services,unknown,403
Autos & Vehicles,Vehicle Shopping,unknown,297
Autos & Vehicles,Motor Vehicles (By Type),unknown,267
Autos & Vehicles,Vehicle Parts & Services,Vehicle Repair &,154
Autos & Vehicles,Motor Vehicles (By Type),Trucks & SUVs,78
Autos & Vehicles,Vehicle Parts & Services,Vehicle Parts & A,62
Autos & Vehicles,Motor Vehicles (By Type),Motorcycles,11
Autos & Vehicles,Vehicle Codes & Driving Laws,unknown,2
Autos & Vehicles,Campers & RVs,unknown,1


In [14]:
#@markdown Dialogues

# Print up to three sample dialogues for each category row of categorized_df.
print_dialogues(df, categorized_df, n_topics_per_row=3)


Topic    : Autos & Vehicles, 
Subtopic1: unknown, 
Subtopic2: unknown

#Person1#:Hi, I need to get a taxi out of kohinoor.
#Person2#:And when would you like to leave or arrive by?
#Person1#:I need to leave after 07:45.
#Person2#:I'd love to help! Where are you going?
#Person1#:I am going to allenbell
#Person2#:Your booking is complete, a black toyota will pick you up. The contact number is 07768938666.
#Person1#:Thats perfect that you for your help
#Person2#:You're welcome. Do you have any other concerns?
#Person1#:I don't. I appreciate your help. Goodbye for now.
#Person2#:OK, I'm glad I was able to assist you. If there ever is anything else don't hesitate to contact us.
----------------------------------------
Topic    : Autos & Vehicles, 
Subtopic1: unknown, 
Subtopic2: unknown

#Person1#:Yes. Yeah. Good evening. I want to book an appointment with Intelligent Auto Imports for my 2010 Subaru Outback.
#Person2#:hello.
#Person2#:how can i help you?
#Person2#:sure, i can help you with t