## 02 - Intent Classifier
In this notebook, we classify each utterance's intent as `find_hotel` or `find_restaurant` using a lexical approach (the words most indicative of each intent), implemented as TF-IDF features fed to a logistic regression classifier.

In [1]:
import pandas as pd

In [2]:
# Load the labelled training set into a DataFrame and preview the first rows.
train_file = "./data/train.csv"
train_df = pd.read_csv(train_file)
train_df.head()

Unnamed: 0,text,answer,intent,slots
0,"Guten Tag, I am staying overnight in Cambridge...","['find_hotel', 'hotel-area=centre', 'hotel-int...",find_hotel,"{'hotel-area': 'centre', 'hotel-internet': 'ye..."
1,Hi there! Can you give me some info on Cityroomz?,"['find_hotel', 'hotel-name=cityroomz']",find_hotel,{'hotel-name': 'cityroomz'}
2,I am looking for a hotel named alyesbray lodge...,"['find_hotel', 'hotel-name=alyesbray lodge gue...",find_hotel,{'hotel-name': 'alyesbray lodge guest house'}
3,I am looking for a restaurant. I would like so...,"['find_restaurant', 'restaurant-food=chinese',...",find_restaurant,"{'restaurant-food': 'chinese', 'restaurant-pri..."
4,I'm looking for an expensive restaurant in the...,"['find_restaurant', 'restaurant-area=centre', ...",find_restaurant,"{'restaurant-area': 'centre', 'restaurant-pric..."


In [3]:
# Add a lowercased copy of each utterance; this column is the text the
# vectorizer is fitted on below.
train_df["text_lower"] = train_df["text"].str.lower()
train_df.head()

Unnamed: 0,text,answer,intent,slots,text_lower
0,"Guten Tag, I am staying overnight in Cambridge...","['find_hotel', 'hotel-area=centre', 'hotel-int...",find_hotel,"{'hotel-area': 'centre', 'hotel-internet': 'ye...","guten tag, i am staying overnight in cambridge..."
1,Hi there! Can you give me some info on Cityroomz?,"['find_hotel', 'hotel-name=cityroomz']",find_hotel,{'hotel-name': 'cityroomz'},hi there! can you give me some info on cityroomz?
2,I am looking for a hotel named alyesbray lodge...,"['find_hotel', 'hotel-name=alyesbray lodge gue...",find_hotel,{'hotel-name': 'alyesbray lodge guest house'},i am looking for a hotel named alyesbray lodge...
3,I am looking for a restaurant. I would like so...,"['find_restaurant', 'restaurant-food=chinese',...",find_restaurant,"{'restaurant-food': 'chinese', 'restaurant-pri...",i am looking for a restaurant. i would like so...
4,I'm looking for an expensive restaurant in the...,"['find_restaurant', 'restaurant-area=centre', ...",find_restaurant,"{'restaurant-area': 'centre', 'restaurant-pric...",i'm looking for an expensive restaurant in the...


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [5]:
# Fit a TF-IDF vectorizer (default settings) on the lowercased training text.
# The fitted `vectorizer` is reused later to transform the dev and test text,
# so it must not be re-fitted on those sets.
vectorizer = TfidfVectorizer()
x_train = train_df["text_lower"]
x_train_tfidf = vectorizer.fit_transform(x_train)
print(x_train_tfidf.shape)

(3760, 989)


In [6]:
# train using logistic regression

In [7]:
# Train a logistic-regression classifier on the TF-IDF features, using the
# `intent` column as the label. random_state is fixed so reruns are repeatable.
clf = LogisticRegression(random_state=42)
y_train = train_df["intent"]
print(y_train.shape)
clf.fit(x_train_tfidf, y_train)
print("Completed")

(3760,)
Completed


In [8]:
# Accuracy on the training data itself, computed two ways as a cross-check.
# This measures how well the model fits the data it was trained on; the dev
# set is scored separately further below.
y_pred = clf.predict(x_train_tfidf)
print("Train score =", clf.score(x_train_tfidf, y_train))
print("Accuracy score =", accuracy_score(y_train, y_pred)) # should be the same

Train score = 0.9973404255319149
Accuracy score = 0.9973404255319149


In [9]:
# Load the dev-set utterances, one utterance per line of the file.
dev_utt = "./data/MultiWOZ/WOZ_dev_utt.txt"
with open(dev_utt, encoding="utf-8") as f:
    # Drop the line terminator so the utterance text does not carry a
    # trailing "\n"; blank lines are kept so the row count is unchanged.
    lines = [line.rstrip("\n") for line in f]
print("Length =", len(lines))

Length = 413


In [10]:
# Build the dev DataFrame: a single "text" column with one utterance per row.
dev_df = pd.DataFrame(lines, columns=["text"])
dev_df.head()

Unnamed: 0,text
0,I'm looking for a local place to dine in the c...
1,My husband and I are celebrating our anniversa...
2,I'm looking for an expensive restaurant in the...
3,Are there any accommodations in the east part ...
4,"I'm looking for a nice place to stay, somewher..."


In [11]:
# Load the dev-set answers (intent plus slot annotations), one per line.
# Line terminators are kept as-is here; the intent-extraction step strips them.
dev_ans = "./data/MultiWOZ/WOZ_dev_ans.txt"
with open(dev_ans, encoding="utf-8") as f:
    lines = list(f)
print("Length =", len(lines))

Length = 413


In [12]:
# Attach the raw answer lines to the utterances, row by row.
# NOTE(review): relies on `lines` currently holding the answer-file lines (the
# name is reused by several cells) in the same order as the utterances.
dev_df["answer_raw"] = lines
dev_df.head()

Unnamed: 0,text,answer_raw
0,I'm looking for a local place to dine in the c...,find_restaurant|restaurant-area=centre|restaur...
1,My husband and I are celebrating our anniversa...,find_hotel\n
2,I'm looking for an expensive restaurant in the...,find_restaurant|restaurant-area=centre|restaur...
3,Are there any accommodations in the east part ...,find_hotel|hotel-area=east|hotel-parking=yes\n
4,"I'm looking for a nice place to stay, somewher...",find_hotel|hotel-internet=yes|hotel-pricerange...


In [13]:
# Sanity check on the first row: the intent is the first "|"-separated field
# of the raw answer string.
s = dev_df["answer_raw"].iloc[0]
print(s)
print(s.partition("|")[0])

find_restaurant|restaurant-area=centre|restaurant-food=chinese

find_restaurant


In [14]:
# Gold intent = first "|"-separated field of the raw answer, with surrounding
# whitespace (including the trailing newline) removed.
dev_df["intent"] = dev_df["answer_raw"].str.split("|").str[0].str.strip()
dev_df.head()

Unnamed: 0,text,answer_raw,intent
0,I'm looking for a local place to dine in the c...,find_restaurant|restaurant-area=centre|restaur...,find_restaurant
1,My husband and I are celebrating our anniversa...,find_hotel\n,find_hotel
2,I'm looking for an expensive restaurant in the...,find_restaurant|restaurant-area=centre|restaur...,find_restaurant
3,Are there any accommodations in the east part ...,find_hotel|hotel-area=east|hotel-parking=yes\n,find_hotel
4,"I'm looking for a nice place to stay, somewher...",find_hotel|hotel-internet=yes|hotel-pricerange...,find_hotel


In [15]:
# Evaluate on the dev set with the vectorizer and classifier fitted on the
# training data.
# Lowercase explicitly so the dev text gets the same preprocessing as the
# training text ("text_lower") and the test text, rather than relying on the
# vectorizer's own default lowercasing.
x_dev = dev_df["text"].str.lower()
x_dev_tfidf = vectorizer.transform(x_dev)
print(x_dev_tfidf.shape)
y_dev = dev_df["intent"]
dev_score = clf.score(x_dev_tfidf, y_dev)
dev_pred = clf.predict(x_dev_tfidf)
# Both lines report the same metric, computed two ways as a cross-check.
print("Dev accuracy =", accuracy_score(y_dev, dev_pred) * 100)
print("Dev accuracy =", dev_score * 100)

(413, 989)
Dev accuracy = 99.27360774818402
Dev accuracy = 99.27360774818402


In [16]:
# Count how many dev utterances were predicted as each intent.
from collections import Counter
Counter(dev_pred)

Counter({'find_restaurant': 218, 'find_hotel': 195})

In [17]:
# Collect the dev rows whose predicted intent differs from the gold intent.
dev_df["predicted"] = dev_pred
wrong_df = dev_df[dev_df["predicted"] != dev_df["intent"]]
wrong_df

Unnamed: 0,text,answer_raw,intent,predicted
135,I would like to have an authentic British meal...,find_restaurant|restaurant-food=british|restau...,find_restaurant,find_hotel
329,Hi. I will be attending a conference in the ea...,find_hotel|hotel-area=east\n,find_hotel,find_restaurant
394,A friend recommended the City Centre North B&B...,find_hotel|hotel-name=city centre north b and b\n,find_hotel,find_restaurant


#### Predict intent on test set

In [18]:
# Load the test-set utterances, one utterance per line of the file.
test_utt = "./data/MultiWOZ/WOZ_test_utt.txt"
with open(test_utt, encoding="utf-8") as f:
    # Drop the line terminator: this text is later written to CSV, where a
    # trailing "\n" would become an embedded newline inside every text field.
    # Blank lines are kept so the row count is unchanged.
    lines = [line.rstrip("\n") for line in f]
print("Length =", len(lines))

Length = 400


In [19]:
# Build the test DataFrame: a single "text" column with one utterance per row.
test_df = pd.DataFrame(lines, columns=["text"])
print(test_df.shape[0])
test_df.head()

400


Unnamed: 0,text
0,"Hello, I am looking for a restaurant in Cambri..."
1,"Hi, I'm looking for a hotel to stay in that in..."
2,I am looking for a place to stay in the north ...
3,"I need a place to dine, and I'd like to know w..."
4,I need a five starts hotel close to a mall and...


In [20]:
# Pre-processing: add a lowercased copy of each test utterance, mirroring the
# "text_lower" column built for the training data.
test_df["text_lower"] = test_df["text"].str.lower()
test_df.head()

Unnamed: 0,text,text_lower
0,"Hello, I am looking for a restaurant in Cambri...","hello, i am looking for a restaurant in cambri..."
1,"Hi, I'm looking for a hotel to stay in that in...","hi, i'm looking for a hotel to stay in that in..."
2,I am looking for a place to stay in the north ...,i am looking for a place to stay in the north ...
3,"I need a place to dine, and I'd like to know w...","i need a place to dine, and i'd like to know w..."
4,I need a five starts hotel close to a mall and...,i need a five starts hotel close to a mall and...


In [21]:
# Predict intents for the test utterances, reusing the vectorizer and the
# classifier fitted on the training data. test_df has no intent column at this
# point, so no accuracy is computed here.
x_test = test_df["text_lower"]
x_test_tfidf = vectorizer.transform(x_test)
print(x_test_tfidf.shape)
test_pred = clf.predict(x_test_tfidf)

(400, 989)


In [22]:
# Store the predicted intent next to each test utterance and display the frame.
test_df["predicted"] = test_pred
test_df

Unnamed: 0,text,text_lower,predicted
0,"Hello, I am looking for a restaurant in Cambri...","hello, i am looking for a restaurant in cambri...",find_restaurant
1,"Hi, I'm looking for a hotel to stay in that in...","hi, i'm looking for a hotel to stay in that in...",find_hotel
2,I am looking for a place to stay in the north ...,i am looking for a place to stay in the north ...,find_hotel
3,"I need a place to dine, and I'd like to know w...","i need a place to dine, and i'd like to know w...",find_restaurant
4,I need a five starts hotel close to a mall and...,i need a five starts hotel close to a mall and...,find_hotel
...,...,...,...
395,I am looking for a place to stay. The hotel sh...,i am looking for a place to stay. the hotel sh...,find_hotel
396,I am looking to book a hotel in the Cambridge ...,i am looking to book a hotel in the cambridge ...,find_hotel
397,I would like to go to an Indian restaurant in ...,i would like to go to an indian restaurant in ...,find_restaurant
398,I'm looking for a place to eat in the centre t...,i'm looking for a place to eat in the centre t...,find_restaurant


In [23]:
# Write the test utterances together with their predicted intents to CSV.
# NOTE(review): to_csv is called with its defaults, so the DataFrame index is
# written as an extra first column — confirm the consumer of this file expects it.
test_output_fn = "./data/test_step1.csv"
test_df.to_csv(test_output_fn)
print("Output completed")

Output completed
