## 04 - Hotel Information Extraction - All slots
In this notebook, we try using simple classifier to extract all information on hotel like price-range, location, stars etc

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
import pandas as pd

In [3]:
# filter for hotels
category = 'find_hotel'
slot_names = [#'hotel-name', 
              'hotel-stars', 'hotel-area', 
              'hotel-internet', 'hotel-pricerange', 'hotel-parking', 
              'hotel-type']

# category = 'find_restaurant'
# slot_names = [
#               'restaurant-food', 
#               # 'restaurant-name', 
#               'restaurant-pricerange', 
#               'restaurant-area']

In [4]:
train_file = "./data/train.csv"
train_df = pd.read_csv(train_file)

train_df.head()

Unnamed: 0,text,answer,intent,slots
0,"Guten Tag, I am staying overnight in Cambridge...","['find_hotel', 'hotel-area=centre', 'hotel-int...",find_hotel,"{'hotel-area': 'centre', 'hotel-internet': 'ye..."
1,Hi there! Can you give me some info on Cityroomz?,"['find_hotel', 'hotel-name=cityroomz']",find_hotel,{'hotel-name': 'cityroomz'}
2,I am looking for a hotel named alyesbray lodge...,"['find_hotel', 'hotel-name=alyesbray lodge gue...",find_hotel,{'hotel-name': 'alyesbray lodge guest house'}
3,I am looking for a restaurant. I would like so...,"['find_restaurant', 'restaurant-food=chinese',...",find_restaurant,"{'restaurant-food': 'chinese', 'restaurant-pri..."
4,I'm looking for an expensive restaurant in the...,"['find_restaurant', 'restaurant-area=centre', ...",find_restaurant,"{'restaurant-area': 'centre', 'restaurant-pric..."


In [5]:
train_df["text_lower"] = train_df["text"].str.lower()

In [6]:
# convert slots to json objects
train_df["slots"] = train_df["slots"].apply(lambda s: eval(s))
train_df.head()

Unnamed: 0,text,answer,intent,slots,text_lower
0,"Guten Tag, I am staying overnight in Cambridge...","['find_hotel', 'hotel-area=centre', 'hotel-int...",find_hotel,"{'hotel-area': 'centre', 'hotel-internet': 'ye...","guten tag, i am staying overnight in cambridge..."
1,Hi there! Can you give me some info on Cityroomz?,"['find_hotel', 'hotel-name=cityroomz']",find_hotel,{'hotel-name': 'cityroomz'},hi there! can you give me some info on cityroomz?
2,I am looking for a hotel named alyesbray lodge...,"['find_hotel', 'hotel-name=alyesbray lodge gue...",find_hotel,{'hotel-name': 'alyesbray lodge guest house'},i am looking for a hotel named alyesbray lodge...
3,I am looking for a restaurant. I would like so...,"['find_restaurant', 'restaurant-food=chinese',...",find_restaurant,"{'restaurant-food': 'chinese', 'restaurant-pri...",i am looking for a restaurant. i would like so...
4,I'm looking for an expensive restaurant in the...,"['find_restaurant', 'restaurant-area=centre', ...",find_restaurant,"{'restaurant-area': 'centre', 'restaurant-pric...",i'm looking for an expensive restaurant in the...


In [7]:
# filter to hotel only
train_df = train_df.query("intent==@category")
print(len(train_df))
train_df.reset_index(inplace=True)
train_df.drop(columns=["index"], inplace=True)
train_df.head()

1609


Unnamed: 0,text,answer,intent,slots,text_lower
0,"Guten Tag, I am staying overnight in Cambridge...","['find_hotel', 'hotel-area=centre', 'hotel-int...",find_hotel,"{'hotel-area': 'centre', 'hotel-internet': 'ye...","guten tag, i am staying overnight in cambridge..."
1,Hi there! Can you give me some info on Cityroomz?,"['find_hotel', 'hotel-name=cityroomz']",find_hotel,{'hotel-name': 'cityroomz'},hi there! can you give me some info on cityroomz?
2,I am looking for a hotel named alyesbray lodge...,"['find_hotel', 'hotel-name=alyesbray lodge gue...",find_hotel,{'hotel-name': 'alyesbray lodge guest house'},i am looking for a hotel named alyesbray lodge...
3,I'm looking for a places to go and see during ...,['find_hotel'],find_hotel,{},i'm looking for a places to go and see during ...
4,I need a place to stay that has free wifi.,"['find_hotel', 'hotel-internet=yes']",find_hotel,{'hotel-internet': 'yes'},i need a place to stay that has free wifi.


In [8]:
# kw = 'hotel-pricerange'
# kw = 'hotel-stars'

# train_df[kw] = train_df["slots"].apply(lambda slots:slots.get(kw, ""))

In [9]:
# create Tfidf vectorizer

vectorizer = TfidfVectorizer()
x_train = train_df["text_lower"]
x_train_tfidf = vectorizer.fit_transform(x_train)
print(x_train_tfidf.shape)

(1609, 568)


In [10]:
# train using logistic regression

In [11]:
# kw = 'hotel-internet'
# create a master classifier, which contains classifier for each slot
master_clf = dict()

for slot in slot_names:
    print("Training for slot =", slot)
    clf = LogisticRegression(random_state=42)
    y_train = train_df["slots"].apply(lambda slots:slots.get(slot, ""))
    # print(y_train.shape)
    clf.fit(x_train_tfidf, y_train)
    print(f"Train score = {clf.score(x_train_tfidf, y_train) * 100:.2f} %")
    master_clf[slot] = clf
    print("------------")

print("Completed")

Training for slot = hotel-stars
Train score = 90.62 %
------------
Training for slot = hotel-area
Train score = 97.33 %
------------
Training for slot = hotel-internet
Train score = 96.58 %
------------
Training for slot = hotel-pricerange
Train score = 97.51 %
------------
Training for slot = hotel-parking
Train score = 96.95 %
------------
Training for slot = hotel-type
Train score = 98.38 %
------------
Completed


In [12]:
# create predicted dictionary for each item
train_df["pred_slots"] = train_df["text"].apply(lambda x: dict())
print(train_df.iloc[0]["pred_slots"])
train_df.head()

{}


Unnamed: 0,text,answer,intent,slots,text_lower,pred_slots
0,"Guten Tag, I am staying overnight in Cambridge...","['find_hotel', 'hotel-area=centre', 'hotel-int...",find_hotel,"{'hotel-area': 'centre', 'hotel-internet': 'ye...","guten tag, i am staying overnight in cambridge...",{}
1,Hi there! Can you give me some info on Cityroomz?,"['find_hotel', 'hotel-name=cityroomz']",find_hotel,{'hotel-name': 'cityroomz'},hi there! can you give me some info on cityroomz?,{}
2,I am looking for a hotel named alyesbray lodge...,"['find_hotel', 'hotel-name=alyesbray lodge gue...",find_hotel,{'hotel-name': 'alyesbray lodge guest house'},i am looking for a hotel named alyesbray lodge...,{}
3,I'm looking for a places to go and see during ...,['find_hotel'],find_hotel,{},i'm looking for a places to go and see during ...,{}
4,I need a place to stay that has free wifi.,"['find_hotel', 'hotel-internet=yes']",find_hotel,{'hotel-internet': 'yes'},i need a place to stay that has free wifi.,{}


In [13]:
# double check train scores
for slot in slot_names:
    y_pred = master_clf[slot].predict(x_train_tfidf)
    # print("Train score =", master_clf[slot].score(x_train_tfidf, y_train))
    y_train = train_df["slots"].apply(lambda slots:slots.get(slot, ""))
    print("Accuracy score =", accuracy_score(y_train, y_pred) * 100) # should be the same

    # go through non-empty result and add to pred_slots
    for i, item in enumerate(y_pred):
        # print(i)
        if item is not None and item != "":
            item_slot = train_df.iloc[i]["pred_slots"]
            # print(item_slot)
            item_slot.update({slot:item})
            train_df.at[i, "pred_slots"] = item_slot

Accuracy score = 90.61528899937849
Accuracy score = 97.32753262896209
Accuracy score = 96.58172778123057
Accuracy score = 97.51398384089497
Accuracy score = 96.95463020509634
Accuracy score = 98.38408949658172


In [14]:
train_df.head()

Unnamed: 0,text,answer,intent,slots,text_lower,pred_slots
0,"Guten Tag, I am staying overnight in Cambridge...","['find_hotel', 'hotel-area=centre', 'hotel-int...",find_hotel,"{'hotel-area': 'centre', 'hotel-internet': 'ye...","guten tag, i am staying overnight in cambridge...",{'hotel-parking': 'yes'}
1,Hi there! Can you give me some info on Cityroomz?,"['find_hotel', 'hotel-name=cityroomz']",find_hotel,{'hotel-name': 'cityroomz'},hi there! can you give me some info on cityroomz?,{}
2,I am looking for a hotel named alyesbray lodge...,"['find_hotel', 'hotel-name=alyesbray lodge gue...",find_hotel,{'hotel-name': 'alyesbray lodge guest house'},i am looking for a hotel named alyesbray lodge...,{}
3,I'm looking for a places to go and see during ...,['find_hotel'],find_hotel,{},i'm looking for a places to go and see during ...,{}
4,I need a place to stay that has free wifi.,"['find_hotel', 'hotel-internet=yes']",find_hotel,{'hotel-internet': 'yes'},i need a place to stay that has free wifi.,{'hotel-internet': 'yes'}


In [15]:
# get accuracy on train set
def get_accuracy(gold_slots, pred_slots, slot_lists=slot_names):
    """
        return accuracy of predicted slots vs gold slots in dictionary form
    """
    correct_count = 0
    for gold_slot, pred_slot in zip(gold_slots, pred_slots):
        gold = {k:v 
                for k, v in gold_slot.items()
                if k in slot_names
               }
        # print(gold)
        sys = {k:v 
                for k, v in pred_slot.items()
                if k in slot_names
               }
        # print(sys)
        # if gold_slot == pred_slot:
        if gold == sys:
            correct_count += 1
    return correct_count / len(gold_slots)

get_accuracy(train_df["slots"], train_df["pred_slots"])

0.800497203231821

In [16]:
# test on dev set
dev_filename = "./data/dev.csv"
dev_df = pd.read_csv(dev_filename)
print(dev_df.columns)
dev_df.head()

Index(['text', 'answer_raw', 'answer', 'intent', 'slots'], dtype='object')


Unnamed: 0,text,answer_raw,answer,intent,slots
0,I'm looking for a local place to dine in the c...,find_restaurant|restaurant-area=centre|restaur...,"['find_restaurant', 'restaurant-area=centre', ...",find_restaurant,"{'restaurant-area': 'centre', 'restaurant-food..."
1,My husband and I are celebrating our anniversa...,find_hotel,['find_hotel'],find_hotel,{}
2,I'm looking for an expensive restaurant in the...,find_restaurant|restaurant-area=centre|restaur...,"['find_restaurant', 'restaurant-area=centre', ...",find_restaurant,"{'restaurant-area': 'centre', 'restaurant-pric..."
3,Are there any accommodations in the east part ...,find_hotel|hotel-area=east|hotel-parking=yes,"['find_hotel', 'hotel-area=east', 'hotel-parki...",find_hotel,"{'hotel-area': 'east', 'hotel-parking': 'yes'}"
4,"I'm looking for a nice place to stay, somewher...",find_hotel|hotel-internet=yes|hotel-pricerange...,"['find_hotel', 'hotel-internet=yes', 'hotel-pr...",find_hotel,"{'hotel-internet': 'yes', 'hotel-pricerange': ..."


In [17]:
# filter to hotel only
# category = 'find_hotel'
dev_df = dev_df.query("intent==@category")
print(len(dev_df))
dev_df.reset_index(inplace=True)
dev_df.drop(columns=["index"], inplace=True)
dev_df.head()

196


Unnamed: 0,text,answer_raw,answer,intent,slots
0,My husband and I are celebrating our anniversa...,find_hotel,['find_hotel'],find_hotel,{}
1,Are there any accommodations in the east part ...,find_hotel|hotel-area=east|hotel-parking=yes,"['find_hotel', 'hotel-area=east', 'hotel-parki...",find_hotel,"{'hotel-area': 'east', 'hotel-parking': 'yes'}"
2,"I'm looking for a nice place to stay, somewher...",find_hotel|hotel-internet=yes|hotel-pricerange...,"['find_hotel', 'hotel-internet=yes', 'hotel-pr...",find_hotel,"{'hotel-internet': 'yes', 'hotel-pricerange': ..."
3,I'm looking for a 4 star hotel in the south.,find_hotel|hotel-area=south|hotel-stars=4,"['find_hotel', 'hotel-area=south', 'hotel-star...",find_hotel,"{'hotel-area': 'south', 'hotel-stars': '4'}"
4,I am looking to get some information on gonvil...,find_hotel|hotel-name=gonville hotel,"['find_hotel', 'hotel-name=gonville hotel']",find_hotel,{'hotel-name': 'gonville hotel'}


In [18]:
# apply some pre-processing
dev_df["text_lower"] = dev_df["text"].str.lower()
# convert slots to json objects
dev_df["slots"] = dev_df["slots"].apply(lambda s: eval(s))
dev_df.head()

Unnamed: 0,text,answer_raw,answer,intent,slots,text_lower
0,My husband and I are celebrating our anniversa...,find_hotel,['find_hotel'],find_hotel,{},my husband and i are celebrating our anniversa...
1,Are there any accommodations in the east part ...,find_hotel|hotel-area=east|hotel-parking=yes,"['find_hotel', 'hotel-area=east', 'hotel-parki...",find_hotel,"{'hotel-area': 'east', 'hotel-parking': 'yes'}",are there any accommodations in the east part ...
2,"I'm looking for a nice place to stay, somewher...",find_hotel|hotel-internet=yes|hotel-pricerange...,"['find_hotel', 'hotel-internet=yes', 'hotel-pr...",find_hotel,"{'hotel-internet': 'yes', 'hotel-pricerange': ...","i'm looking for a nice place to stay, somewher..."
3,I'm looking for a 4 star hotel in the south.,find_hotel|hotel-area=south|hotel-stars=4,"['find_hotel', 'hotel-area=south', 'hotel-star...",find_hotel,"{'hotel-area': 'south', 'hotel-stars': '4'}",i'm looking for a 4 star hotel in the south.
4,I am looking to get some information on gonvil...,find_hotel|hotel-name=gonville hotel,"['find_hotel', 'hotel-name=gonville hotel']",find_hotel,{'hotel-name': 'gonville hotel'},i am looking to get some information on gonvil...


In [19]:
# extract price-range
# kw = 'hotel-pricerange'

# dev_df["target"] = dev_df["slots"].apply(lambda slots:slots.get(kw, ""))
# dev_df.head()

In [20]:
x_dev = dev_df["text"]
x_dev_tfidf = vectorizer.transform(x_dev)
print(x_dev_tfidf.shape)

(196, 568)


In [21]:
# create predicted dictionary for each item
dev_df["pred_slots"] = dev_df["text"].apply(lambda x: dict())
dev_df.head()

Unnamed: 0,text,answer_raw,answer,intent,slots,text_lower,pred_slots
0,My husband and I are celebrating our anniversa...,find_hotel,['find_hotel'],find_hotel,{},my husband and i are celebrating our anniversa...,{}
1,Are there any accommodations in the east part ...,find_hotel|hotel-area=east|hotel-parking=yes,"['find_hotel', 'hotel-area=east', 'hotel-parki...",find_hotel,"{'hotel-area': 'east', 'hotel-parking': 'yes'}",are there any accommodations in the east part ...,{}
2,"I'm looking for a nice place to stay, somewher...",find_hotel|hotel-internet=yes|hotel-pricerange...,"['find_hotel', 'hotel-internet=yes', 'hotel-pr...",find_hotel,"{'hotel-internet': 'yes', 'hotel-pricerange': ...","i'm looking for a nice place to stay, somewher...",{}
3,I'm looking for a 4 star hotel in the south.,find_hotel|hotel-area=south|hotel-stars=4,"['find_hotel', 'hotel-area=south', 'hotel-star...",find_hotel,"{'hotel-area': 'south', 'hotel-stars': '4'}",i'm looking for a 4 star hotel in the south.,{}
4,I am looking to get some information on gonvil...,find_hotel|hotel-name=gonville hotel,"['find_hotel', 'hotel-name=gonville hotel']",find_hotel,{'hotel-name': 'gonville hotel'},i am looking to get some information on gonvil...,{}


In [22]:
for slot in slot_names:
    # y_dev = dev_df["target"]
    y_dev = dev_df["slots"].apply(lambda slots:slots.get(slot, ""))
    
    dev_pred = master_clf[slot].predict(x_dev_tfidf)
    # dev_score = clf.score(x_dev_tfidf, y_dev)
    print("Slot prediction =", slot)
    print(f"Dev accuracy = {accuracy_score(y_dev, dev_pred) * 100:.2f} %")

    # go through non-empty result and add to pred_slots
    for i, item in enumerate(dev_pred):
    # print(i)
        if item is not None and item != "":
            item_slot = dev_df.iloc[i]["pred_slots"]
            # print(item_slot)
            item_slot.update({slot:item})
            dev_df.at[i, "pred_slots"] = item_slot
            
    # print("Dev accuracy =", dev_score * 100)

Slot prediction = hotel-stars
Dev accuracy = 91.84 %
Slot prediction = hotel-area
Dev accuracy = 95.92 %
Slot prediction = hotel-internet
Dev accuracy = 98.47 %
Slot prediction = hotel-pricerange
Dev accuracy = 97.96 %
Slot prediction = hotel-parking
Dev accuracy = 95.92 %
Slot prediction = hotel-type
Dev accuracy = 96.43 %


In [23]:
dev_df.head()

Unnamed: 0,text,answer_raw,answer,intent,slots,text_lower,pred_slots
0,My husband and I are celebrating our anniversa...,find_hotel,['find_hotel'],find_hotel,{},my husband and i are celebrating our anniversa...,{}
1,Are there any accommodations in the east part ...,find_hotel|hotel-area=east|hotel-parking=yes,"['find_hotel', 'hotel-area=east', 'hotel-parki...",find_hotel,"{'hotel-area': 'east', 'hotel-parking': 'yes'}",are there any accommodations in the east part ...,"{'hotel-area': 'east', 'hotel-parking': 'yes'}"
2,"I'm looking for a nice place to stay, somewher...",find_hotel|hotel-internet=yes|hotel-pricerange...,"['find_hotel', 'hotel-internet=yes', 'hotel-pr...",find_hotel,"{'hotel-internet': 'yes', 'hotel-pricerange': ...","i'm looking for a nice place to stay, somewher...",{'hotel-internet': 'yes'}
3,I'm looking for a 4 star hotel in the south.,find_hotel|hotel-area=south|hotel-stars=4,"['find_hotel', 'hotel-area=south', 'hotel-star...",find_hotel,"{'hotel-area': 'south', 'hotel-stars': '4'}",i'm looking for a 4 star hotel in the south.,"{'hotel-stars': '4', 'hotel-area': 'south'}"
4,I am looking to get some information on gonvil...,find_hotel|hotel-name=gonville hotel,"['find_hotel', 'hotel-name=gonville hotel']",find_hotel,{'hotel-name': 'gonville hotel'},i am looking to get some information on gonvil...,{}


In [24]:
# calculate overall accuracy 
score = get_accuracy(dev_df["slots"], dev_df["pred_slots"])
print(f"Overall accuracy = {score:.2f} %")

Overall accuracy = 0.80 %
