## 03 - Hotel Information Extraction
In this notebook, we try using a simple classifier to extract hotel information such as price range, location, etc.

In [1]:
import ast
from collections import Counter

import pandas as pd

In [2]:
# Load the training split from CSV and preview the first rows.
train_file = "./data/train.csv"
train_df = pd.read_csv(filepath_or_buffer=train_file)
train_df.head(n=5)

Unnamed: 0,text,answer,intent,slots
0,"Guten Tag, I am staying overnight in Cambridge...","['find_hotel', 'hotel-area=centre', 'hotel-int...",find_hotel,"{'hotel-area': 'centre', 'hotel-internet': 'ye..."
1,Hi there! Can you give me some info on Cityroomz?,"['find_hotel', 'hotel-name=cityroomz']",find_hotel,{'hotel-name': 'cityroomz'}
2,I am looking for a hotel named alyesbray lodge...,"['find_hotel', 'hotel-name=alyesbray lodge gue...",find_hotel,{'hotel-name': 'alyesbray lodge guest house'}
3,I am looking for a restaurant. I would like so...,"['find_restaurant', 'restaurant-food=chinese',...",find_restaurant,"{'restaurant-food': 'chinese', 'restaurant-pri..."
4,I'm looking for an expensive restaurant in the...,"['find_restaurant', 'restaurant-area=centre', ...",find_restaurant,"{'restaurant-area': 'centre', 'restaurant-pric..."


In [3]:
# Add a lower-cased version of every utterance as a new "text_lower" column.
train_df = train_df.assign(text_lower=train_df["text"].str.lower())

In [4]:
# Parse the stringified slot dictionaries read from the CSV into real Python dicts.
# ast.literal_eval only accepts Python literals, so unlike eval() it cannot execute
# code embedded in the data file. Values that are already parsed are passed through
# unchanged, which keeps this cell safe to re-run.
train_df["slots"] = train_df["slots"].apply(
    lambda s: ast.literal_eval(s) if isinstance(s, str) else s
)
train_df.head()

Unnamed: 0,text,answer,intent,slots,text_lower
0,"Guten Tag, I am staying overnight in Cambridge...","['find_hotel', 'hotel-area=centre', 'hotel-int...",find_hotel,"{'hotel-area': 'centre', 'hotel-internet': 'ye...","guten tag, i am staying overnight in cambridge..."
1,Hi there! Can you give me some info on Cityroomz?,"['find_hotel', 'hotel-name=cityroomz']",find_hotel,{'hotel-name': 'cityroomz'},hi there! can you give me some info on cityroomz?
2,I am looking for a hotel named alyesbray lodge...,"['find_hotel', 'hotel-name=alyesbray lodge gue...",find_hotel,{'hotel-name': 'alyesbray lodge guest house'},i am looking for a hotel named alyesbray lodge...
3,I am looking for a restaurant. I would like so...,"['find_restaurant', 'restaurant-food=chinese',...",find_restaurant,"{'restaurant-food': 'chinese', 'restaurant-pri...",i am looking for a restaurant. i would like so...
4,I'm looking for an expensive restaurant in the...,"['find_restaurant', 'restaurant-area=centre', ...",find_restaurant,"{'restaurant-area': 'centre', 'restaurant-pric...",i'm looking for an expensive restaurant in the...


In [5]:
# Keep only the rows whose intent is `category`.
# The explicit .copy() makes the filtered result an independent frame, so columns
# assigned to it afterwards are written to this frame itself instead of to a
# selection of the full training set (which raises SettingWithCopyWarning).
category = 'find_hotel'
train_df = train_df.query("intent==@category").copy()
print(len(train_df))
train_df.head()

1609


Unnamed: 0,text,answer,intent,slots,text_lower
0,"Guten Tag, I am staying overnight in Cambridge...","['find_hotel', 'hotel-area=centre', 'hotel-int...",find_hotel,"{'hotel-area': 'centre', 'hotel-internet': 'ye...","guten tag, i am staying overnight in cambridge..."
1,Hi there! Can you give me some info on Cityroomz?,"['find_hotel', 'hotel-name=cityroomz']",find_hotel,{'hotel-name': 'cityroomz'},hi there! can you give me some info on cityroomz?
2,I am looking for a hotel named alyesbray lodge...,"['find_hotel', 'hotel-name=alyesbray lodge gue...",find_hotel,{'hotel-name': 'alyesbray lodge guest house'},i am looking for a hotel named alyesbray lodge...
5,I'm looking for a places to go and see during ...,['find_hotel'],find_hotel,{},i'm looking for a places to go and see during ...
9,I need a place to stay that has free wifi.,"['find_hotel', 'hotel-internet=yes']",find_hotel,{'hotel-internet': 'yes'},i need a place to stay that has free wifi.


In [6]:
# Inspect the parsed slots of a single example (row position k) and list its keys.
k = 60
item = train_df["slots"].iloc[k]
print(item)
item.keys()

{'hotel-pricerange': 'moderate', 'hotel-type': 'guesthouse'}


dict_keys(['hotel-pricerange', 'hotel-type'])

In [7]:
# Choose the slot to predict. Rows whose slots dict lacks this key get the empty
# string as their label.
kw = 'hotel-stars'
train_df["target"] = [slots.get(kw, "") for slots in train_df["slots"]]
train_df.head()

Unnamed: 0,text,answer,intent,slots,text_lower,target
0,"Guten Tag, I am staying overnight in Cambridge...","['find_hotel', 'hotel-area=centre', 'hotel-int...",find_hotel,"{'hotel-area': 'centre', 'hotel-internet': 'ye...","guten tag, i am staying overnight in cambridge...",
1,Hi there! Can you give me some info on Cityroomz?,"['find_hotel', 'hotel-name=cityroomz']",find_hotel,{'hotel-name': 'cityroomz'},hi there! can you give me some info on cityroomz?,
2,I am looking for a hotel named alyesbray lodge...,"['find_hotel', 'hotel-name=alyesbray lodge gue...",find_hotel,{'hotel-name': 'alyesbray lodge guest house'},i am looking for a hotel named alyesbray lodge...,
5,I'm looking for a places to go and see during ...,['find_hotel'],find_hotel,{},i'm looking for a places to go and see during ...,
9,I need a place to stay that has free wifi.,"['find_hotel', 'hotel-internet=yes']",find_hotel,{'hotel-internet': 'yes'},i need a place to stay that has free wifi.,


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [9]:
# Learn the TF-IDF vocabulary from the lower-cased training utterances, then
# turn those same utterances into a sparse feature matrix.
x_train = train_df["text_lower"]
vectorizer = TfidfVectorizer()
x_train_tfidf = vectorizer.fit(x_train).transform(x_train)
print(x_train_tfidf.shape)

(1609, 568)


In [10]:
# train using logistic regression

In [11]:
# Fit a logistic-regression classifier on the TF-IDF features, using the
# "target" column as labels.
y_train = train_df["target"]
print(y_train.shape)
clf = LogisticRegression(random_state=42).fit(x_train_tfidf, y_train)
print("Completed")

(1609,)
Completed


In [12]:
# Accuracy on the training data, computed two ways as a sanity check:
# clf.score() and accuracy_score() on explicit predictions should agree.
y_pred = clf.predict(x_train_tfidf)
train_score = clf.score(x_train_tfidf, y_train)
print("Train score =", train_score)
train_accuracy = accuracy_score(y_train, y_pred)
print("Accuracy score =", train_accuracy)

Train score = 0.906152889993785
Accuracy score = 0.906152889993785


In [13]:
# test on dev set
dev_filename = "./data/dev.csv"
dev_df = pd.read_csv(dev_filename)
print(dev_df.columns)
dev_df.head()

Index(['text', 'answer_raw', 'answer', 'intent', 'slots'], dtype='object')


Unnamed: 0,text,answer_raw,answer,intent,slots
0,I'm looking for a local place to dine in the c...,find_restaurant|restaurant-area=centre|restaur...,"['find_restaurant', 'restaurant-area=centre', ...",find_restaurant,"{'restaurant-area': 'centre', 'restaurant-food..."
1,My husband and I are celebrating our anniversa...,find_hotel,['find_hotel'],find_hotel,{}
2,I'm looking for an expensive restaurant in the...,find_restaurant|restaurant-area=centre|restaur...,"['find_restaurant', 'restaurant-area=centre', ...",find_restaurant,"{'restaurant-area': 'centre', 'restaurant-pric..."
3,Are there any accommodations in the east part ...,find_hotel|hotel-area=east|hotel-parking=yes,"['find_hotel', 'hotel-area=east', 'hotel-parki...",find_hotel,"{'hotel-area': 'east', 'hotel-parking': 'yes'}"
4,"I'm looking for a nice place to stay, somewher...",find_hotel|hotel-internet=yes|hotel-pricerange...,"['find_hotel', 'hotel-internet=yes', 'hotel-pr...",find_hotel,"{'hotel-internet': 'yes', 'hotel-pricerange': ..."


In [14]:
# Apply the same pre-processing as for the training data.
# 1) Lower-cased copy of every utterance.
dev_df["text_lower"] = dev_df["text"].str.lower()
# 2) Parse the stringified slot dictionaries into real Python dicts.
#    ast.literal_eval only accepts Python literals, so unlike eval() it cannot
#    execute code embedded in the data file. Already-parsed values are passed
#    through unchanged, which keeps this cell safe to re-run.
dev_df["slots"] = dev_df["slots"].apply(
    lambda s: ast.literal_eval(s) if isinstance(s, str) else s
)
dev_df.head()

Unnamed: 0,text,answer_raw,answer,intent,slots,text_lower
0,I'm looking for a local place to dine in the c...,find_restaurant|restaurant-area=centre|restaur...,"['find_restaurant', 'restaurant-area=centre', ...",find_restaurant,"{'restaurant-area': 'centre', 'restaurant-food...",i'm looking for a local place to dine in the c...
1,My husband and I are celebrating our anniversa...,find_hotel,['find_hotel'],find_hotel,{},my husband and i are celebrating our anniversa...
2,I'm looking for an expensive restaurant in the...,find_restaurant|restaurant-area=centre|restaur...,"['find_restaurant', 'restaurant-area=centre', ...",find_restaurant,"{'restaurant-area': 'centre', 'restaurant-pric...",i'm looking for an expensive restaurant in the...
3,Are there any accommodations in the east part ...,find_hotel|hotel-area=east|hotel-parking=yes,"['find_hotel', 'hotel-area=east', 'hotel-parki...",find_hotel,"{'hotel-area': 'east', 'hotel-parking': 'yes'}",are there any accommodations in the east part ...
4,"I'm looking for a nice place to stay, somewher...",find_hotel|hotel-internet=yes|hotel-pricerange...,"['find_hotel', 'hotel-internet=yes', 'hotel-pr...",find_hotel,"{'hotel-internet': 'yes', 'hotel-pricerange': ...","i'm looking for a nice place to stay, somewher..."


In [15]:
# Build the dev labels for the slot currently held in `kw`: the slot's value,
# or the empty string when the slots dict does not contain that key.
dev_df["target"] = [slots.get(kw, "") for slots in dev_df["slots"]]
dev_df.head()

Unnamed: 0,text,answer_raw,answer,intent,slots,text_lower,target
0,I'm looking for a local place to dine in the c...,find_restaurant|restaurant-area=centre|restaur...,"['find_restaurant', 'restaurant-area=centre', ...",find_restaurant,"{'restaurant-area': 'centre', 'restaurant-food...",i'm looking for a local place to dine in the c...,
1,My husband and I are celebrating our anniversa...,find_hotel,['find_hotel'],find_hotel,{},my husband and i are celebrating our anniversa...,
2,I'm looking for an expensive restaurant in the...,find_restaurant|restaurant-area=centre|restaur...,"['find_restaurant', 'restaurant-area=centre', ...",find_restaurant,"{'restaurant-area': 'centre', 'restaurant-pric...",i'm looking for an expensive restaurant in the...,
3,Are there any accommodations in the east part ...,find_hotel|hotel-area=east|hotel-parking=yes,"['find_hotel', 'hotel-area=east', 'hotel-parki...",find_hotel,"{'hotel-area': 'east', 'hotel-parking': 'yes'}",are there any accommodations in the east part ...,
4,"I'm looking for a nice place to stay, somewher...",find_hotel|hotel-internet=yes|hotel-pricerange...,"['find_hotel', 'hotel-internet=yes', 'hotel-pr...",find_hotel,"{'hotel-internet': 'yes', 'hotel-pricerange': ...","i'm looking for a nice place to stay, somewher...",


In [16]:
# Evaluate the classifier on the dev set.
# Use the lower-cased column so the dev features come from the same column the
# vectorizer was fit on (training uses "text_lower").
# NOTE(review): dev_df is not filtered by intent here, whereas the training frame
# was restricted to find_hotel rows — confirm that scoring on all intents is intended.
x_dev = dev_df["text_lower"]
x_dev_tfidf = vectorizer.transform(x_dev)
print(x_dev_tfidf.shape)
y_dev = dev_df["target"]
dev_pred = clf.predict(x_dev_tfidf)
dev_score = clf.score(x_dev_tfidf, y_dev)
print("Slot prediction =", kw)
# Both lines report the same metric, computed via accuracy_score() and clf.score().
print("Dev accuracy =", accuracy_score(y_dev, dev_pred) * 100)
print("Dev accuracy =", dev_score * 100)

(413, 568)
Slot prediction = hotel-stars
Dev accuracy = 95.88377723970945
Dev accuracy = 95.88377723970945


In [17]:
# Count how often each slot value was predicted on the dev set.
# Counter is imported in the notebook's top import cell.
Counter(dev_pred)

Counter({'': 385, '4': 26, '3': 2})

In [18]:
# Collect the dev rows whose predicted slot value differs from the label.
dev_df["predicted"] = dev_pred
is_wrong = dev_df["predicted"] != dev_df["target"]
wrong_df = dev_df[is_wrong]
print(f"Number of wrong counts = {len(wrong_df)} / {len(dev_df)}")
wrong_df

Number of wrong counts = 17 / 413


Unnamed: 0,text,answer_raw,answer,intent,slots,text_lower,target,predicted
18,"Yes, hello. I need a place to crash so I'm thi...",find_hotel|hotel-stars=0|hotel-type=guesthouse,"['find_hotel', 'hotel-stars=0', 'hotel-type=gu...",find_hotel,"{'hotel-stars': '0', 'hotel-type': 'guesthouse'}","yes, hello. i need a place to crash so i'm thi...",0.0,4.0
47,"Hello, I am trying to find a place to stay tha...",find_hotel|hotel-internet=yes|hotel-stars=3,"['find_hotel', 'hotel-internet=yes', 'hotel-st...",find_hotel,"{'hotel-internet': 'yes', 'hotel-stars': '3'}","hello, i am trying to find a place to stay tha...",3.0,
87,I am looking for a place to stay in Cambridge....,find_hotel|hotel-parking=yes|hotel-stars=4,"['find_hotel', 'hotel-parking=yes', 'hotel-sta...",find_hotel,"{'hotel-parking': 'yes', 'hotel-stars': '4'}",i am looking for a place to stay in cambridge....,4.0,
94,I am looking for a 3 star hotel on the south s...,find_hotel|hotel-area=south|hotel-stars=3,"['find_hotel', 'hotel-area=south', 'hotel-star...",find_hotel,"{'hotel-area': 'south', 'hotel-stars': '3'}",i am looking for a 3 star hotel on the south s...,3.0,4.0
98,I need to find a guesthouse with a 0 star rating.,find_hotel|hotel-stars=0|hotel-type=guesthouse,"['find_hotel', 'hotel-stars=0', 'hotel-type=gu...",find_hotel,"{'hotel-stars': '0', 'hotel-type': 'guesthouse'}",i need to find a guesthouse with a 0 star rating.,0.0,4.0
109,Hello! I am looking for a place to stay. I wou...,find_hotel|hotel-pricerange=moderate|hotel-sta...,"['find_hotel', 'hotel-pricerange=moderate', 'h...",find_hotel,"{'hotel-pricerange': 'moderate', 'hotel-stars'...",hello! i am looking for a place to stay. i wou...,2.0,
155,Can you give me any info on whether you have a...,find_hotel|hotel-stars=2,"['find_hotel', 'hotel-stars=2']",find_hotel,{'hotel-stars': '2'},can you give me any info on whether you have a...,2.0,
193,Hi there. I'm going on vacation and need a 3-s...,find_hotel|hotel-stars=3,"['find_hotel', 'hotel-stars=3']",find_hotel,{'hotel-stars': '3'},hi there. i'm going on vacation and need a 3-s...,3.0,
240,"Hello, could you help me with finding a restau...",find_restaurant|restaurant-name=the lucky star,"['find_restaurant', 'restaurant-name=the lucky...",find_restaurant,{'restaurant-name': 'the lucky star'},"hello, could you help me with finding a restau...",,4.0
243,I'm looking for a 0 star hotel that is expensive.,find_hotel|hotel-pricerange=expensive|hotel-st...,"['find_hotel', 'hotel-pricerange=expensive', '...",find_hotel,"{'hotel-pricerange': 'expensive', 'hotel-stars...",i'm looking for a 0 star hotel that is expensive.,0.0,4.0


In [19]:
# Dev rows where a non-empty slot value was predicted and it matches the label,
# reported against the number of dev rows whose label is non-empty.
dev_df["predicted"] = dev_pred
has_target = dev_df["target"] != ''
non_null_count = int(has_target.sum())
is_hit = (dev_df["predicted"] != '') & (dev_df["predicted"] == dev_df["target"])
correct_df = dev_df[is_hit]
print(f"Number of correct counts = {len(correct_df)} / {non_null_count}")
correct_df

Number of correct counts = 18 / 34


Unnamed: 0,text,answer_raw,answer,intent,slots,text_lower,target,predicted
5,I'm looking for a 4 star hotel in the south.,find_hotel|hotel-area=south|hotel-stars=4,"['find_hotel', 'hotel-area=south', 'hotel-star...",find_hotel,"{'hotel-area': 'south', 'hotel-stars': '4'}",i'm looking for a 4 star hotel in the south.,4,4
33,I need to book a hotel in the east that has 4 ...,find_hotel|hotel-area=east|hotel-stars=4,"['find_hotel', 'hotel-area=east', 'hotel-stars...",find_hotel,"{'hotel-area': 'east', 'hotel-stars': '4'}",i need to book a hotel in the east that has 4 ...,4,4
40,Hello. I'm looking for a hotel on the north en...,find_hotel|hotel-area=north|hotel-stars=4,"['find_hotel', 'hotel-area=north', 'hotel-star...",find_hotel,"{'hotel-area': 'north', 'hotel-stars': '4'}",hello. i'm looking for a hotel on the north en...,4,4
61,I'm looking for a place to stay. I would like ...,find_hotel|hotel-stars=4|hotel-type=guesthouse,"['find_hotel', 'hotel-stars=4', 'hotel-type=gu...",find_hotel,"{'hotel-stars': '4', 'hotel-type': 'guesthouse'}",i'm looking for a place to stay. i would like ...,4,4
62,i am looking for a place to stay. The hotel sh...,find_hotel|hotel-stars=4|hotel-type=guesthouse,"['find_hotel', 'hotel-stars=4', 'hotel-type=gu...",find_hotel,"{'hotel-stars': '4', 'hotel-type': 'guesthouse'}",i am looking for a place to stay. the hotel sh...,4,4
91,Hi I am looking for a 4 star guesthouse. Can y...,find_hotel|hotel-stars=4|hotel-type=guesthouse,"['find_hotel', 'hotel-stars=4', 'hotel-type=gu...",find_hotel,"{'hotel-stars': '4', 'hotel-type': 'guesthouse'}",hi i am looking for a 4 star guesthouse. can y...,4,4
127,I need a 4 star hotel on the north side.,find_hotel|hotel-area=north|hotel-stars=4,"['find_hotel', 'hotel-area=north', 'hotel-star...",find_hotel,"{'hotel-area': 'north', 'hotel-stars': '4'}",i need a 4 star hotel on the north side.,4,4
130,I'd like to find a 4 star hotel with free wifi...,find_hotel|hotel-internet=yes|hotel-stars=4,"['find_hotel', 'hotel-internet=yes', 'hotel-st...",find_hotel,"{'hotel-internet': 'yes', 'hotel-stars': '4'}",i'd like to find a 4 star hotel with free wifi...,4,4
131,"I'd like a 4 star hotel in the west, please.",find_hotel|hotel-area=west|hotel-stars=4,"['find_hotel', 'hotel-area=west', 'hotel-stars...",find_hotel,"{'hotel-area': 'west', 'hotel-stars': '4'}","i'd like a 4 star hotel in the west, please.",4,4
151,"I am looking for a place to stay. Ideally, it ...",find_hotel|hotel-stars=4|hotel-type=guesthouse,"['find_hotel', 'hotel-stars=4', 'hotel-type=gu...",find_hotel,"{'hotel-stars': '4', 'hotel-type': 'guesthouse'}","i am looking for a place to stay. ideally, it ...",4,4
