Clone the repo:
git clone https://github.com/NafisSadeq/nlu-with-bert.git


install
```
cd nlu-with-bert/
mkvirtualenv intuit_corpus -p python3.6
pip install -e .
python -m spacy download en_core_web_sm
```

Corpus prep
```
mkdir new_corpus
cd new_corpus/
```

Inside the 'new_corpus' directory:
- make the 'raw' directory and copy the raw corpus into it
- make the 'configs' directory and prepare the config JSON file for training and evaluation
- prepare 'preprocess.py'


Run preprocessing
It should read the data from the 'raw' directory and save six files inside a directory called 'data':

train_data.json, val_data.json, test_data.json, intent_vocab.json, slot_vocab.json, tag_vocab.json
```
python preprocess.py all
```

Run training
```
python train.py --config new_corpus/configs/new_corpus.json
```

Run evaluation
```
python test.py --config new_corpus/configs/new_corpus.json
```

In [22]:
import json

# Directory the preprocessed JSON files are read from.
data_dir="new_corpus/data/all_data"

# Load the validation split. The file is opened explicitly as UTF-8 (the JSON
# interchange encoding) so the result does not depend on the platform's
# default locale encoding.
with open(data_dir+"/val_data.json",'r',encoding="utf-8") as file:
    sample_list=json.load(file)

In [23]:
# Show how many samples were loaded and the type of the top-level container.
len(sample_list),type(sample_list)

(14691, list)

In [24]:
# sample_list[25] is one sample: a list of five items
# [tokens, token_tags, intents, slot_info, dialogue_context].

# Let s be one sample:
# s[0] -> list of tokens (words) of the utterance.
# s[1] -> one tag per token: 'O' when the token is not part of a slot value, otherwise a
#         'B-<intent>+<slot>' style tag (e.g. 'B-Attraction-Inform+Area').
# s[2] -> list of intents for this utterance (may be empty, as in this sample).
# s[3] -> list of slots for this utterance, each as [intent, slot, value].
# s[4] -> dialogue context/history: the texts of the previous utterances.
sample_list[25] 

[['I',
  'am',
  'also',
  'looking',
  'for',
  'a',
  'multi',
  'sport',
  'in',
  'the',
  'East',
  '.'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-Attraction-Inform+Area',
  'O'],
 [],
 [['Attraction-Inform', 'Area', 'east']],
 ['The earliest after 18:45 is the TR8658 , leaving Norwich at 19:16 . Can I reserve you one or more seats on this train ?',
  'yeah , i need one ticket',
  'Booking was successful , the total fee is 17.6 GBP payable at the station . Your reference number is AXH1NM1I . Do you need assistance with anything else ?']]

In [25]:
# Token-level tags of the same sample (item 1 of the five-item sample list).
sample_list[25][1]

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-Attraction-Inform+Area',
 'O']

In [26]:
# Load the intent vocabulary. The file is opened explicitly as UTF-8 (the JSON
# interchange encoding) so the result does not depend on the platform's
# default locale encoding.
with open(data_dir+"/intent_vocab.json",'r',encoding="utf-8") as file:
    intent_voc=json.load(file)

In [27]:
# Display the loaded intent vocabulary (a list of intent label strings).
intent_voc

['Taxi-Request',
 'general-thank',
 'general-reqmore',
 'general-bye',
 'Restaurant-Inform',
 'Restaurant-Request',
 'Attraction-Request',
 'Restaurant-Select',
 'Booking-Inform',
 'Booking-NoBook',
 'Booking-Request',
 'Taxi-Inform',
 'Train-Request',
 'general-welcome',
 'Hotel-Inform',
 'Train-OfferBook',
 'Hotel-Request',
 'Train-Inform',
 'general-greet',
 'Attraction-Inform',
 'Hotel-Recommend',
 'Attraction-Select',
 'Hotel-Select',
 'Train-Select',
 'Hotel-NoOffer',
 'Restaurant-NoOffer']

In [28]:
# Example raw turn: the utterance text, its intents, and the annotated spans.
# Each span is [intent, slot, value, start_token_index, end_token_index].
raw_sample = {
    "text": "I am looking to book a train that is leaving from Cambridge to Bishops Stortford on Friday .",
    "intent": ["Train-Inform"],
    "span_info": [
        ["Train-Inform", "Dest", "bishops stortford", 13, 14],
        ["Train-Inform", "Day", "friday", 16, 16],
        ["Train-Inform", "Depart", "cambridge", 11, 11],
    ],
}

# The same utterance and intent, but without any annotated spans.
raw_sample2 = {
    "text": "I am looking to book a train that is leaving from Cambridge to Bishops Stortford on Friday .",
    "intent": ["Train-Inform"],
    "span_info": [],
}

In [29]:
def process_sample(s,context):
    """Convert one raw turn into the five-item training sample format.

    Args:
        s: dict with keys "text" (utterance string), "intent" (list of
            intents) and "span_info" (list of spans, each indexed as
            [intent, slot, value, start_token, end_token]).
        context: dialogue history to attach to the sample; stored as given.

    Returns:
        [tokens, tags, intent, slots, context], where tokens is the
        whitespace-split text, tags holds one tag per token, intent is
        s["intent"] unchanged, and slots holds the first three fields of
        every span.
    """
    words = s["text"].split()
    intents = s["intent"]
    annotations = s["span_info"]

    # Keep only [intent, slot, value] of each span.
    slot_triples = [entry[:3] for entry in annotations]

    def tag_for(position):
        # The first span (in list order) that covers the position decides the tag.
        for entry in annotations:
            start = entry[3]
            if position == start:
                prefix = "B-"
            elif start < position <= entry[4]:
                prefix = "I-"
            else:
                continue
            return prefix + entry[0] + "+" + entry[1]
        # No span covers this token.
        return "O"

    labels = [tag_for(position) for position in range(len(words))]

    return [words, labels, intents, slot_triples, context]

In [30]:
# Convert the example turn with an empty dialogue context. The two printed
# lengths are the token count and the tag count; there is one tag per token,
# so they should match.
sample=process_sample(raw_sample,[])
print(len(sample[0]))
print(len(sample[1]))
sample

18
18


[['I',
  'am',
  'looking',
  'to',
  'book',
  'a',
  'train',
  'that',
  'is',
  'leaving',
  'from',
  'Cambridge',
  'to',
  'Bishops',
  'Stortford',
  'on',
  'Friday',
  '.'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-Train-Inform+Depart',
  'O',
  'B-Train-Inform+Dest',
  'I-Train-Inform+Dest',
  'O',
  'B-Train-Inform+Day',
  'O'],
 ['Train-Inform'],
 [['Train-Inform', 'Dest', 'bishops stortford'],
  ['Train-Inform', 'Day', 'friday'],
  ['Train-Inform', 'Depart', 'cambridge']],
 []]

In [31]:
def process_dialog(dialog):
    """Convert every turn of a dialog into a processed sample.

    Args:
        dialog: iterable of raw turns, each a dict accepted by
            process_sample that also provides a "text" entry.

    Returns:
        A list with one processed sample per turn, in order. Each sample
        receives its own copy of the texts of all turns that preceded it.
    """
    history = []
    results = []

    for turn in dialog:
        # Pass a snapshot so later appends do not change earlier samples' context.
        results.append(process_sample(turn, list(history)))
        history.append(turn["text"])

    return results

In [32]:
# Build a toy three-turn dialog by repeating the example turn (the list holds
# three references to the same dict), convert it turn by turn, and display it.
dialog=[raw_sample]*3
processed_sample_list=process_dialog(dialog)
processed_sample_list

[[['I',
   'am',
   'looking',
   'to',
   'book',
   'a',
   'train',
   'that',
   'is',
   'leaving',
   'from',
   'Cambridge',
   'to',
   'Bishops',
   'Stortford',
   'on',
   'Friday',
   '.'],
  ['O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-Train-Inform+Depart',
   'O',
   'B-Train-Inform+Dest',
   'I-Train-Inform+Dest',
   'O',
   'B-Train-Inform+Day',
   'O'],
  ['Train-Inform'],
  [['Train-Inform', 'Dest', 'bishops stortford'],
   ['Train-Inform', 'Day', 'friday'],
   ['Train-Inform', 'Depart', 'cambridge']],
  []],
 [['I',
   'am',
   'looking',
   'to',
   'book',
   'a',
   'train',
   'that',
   'is',
   'leaving',
   'from',
   'Cambridge',
   'to',
   'Bishops',
   'Stortford',
   'on',
   'Friday',
   '.'],
  ['O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-Train-Inform+Depart',
   'O',
   'B-Train-Inform+Dest',
   'I-Train-Inform+Dest',
   'O',
   'B-Train-Inform+Day',
   'O'],


In [33]:
# Write the processed toy dialog to dummy_data.json inside data_dir as
# pretty-printed JSON (2-space indentation).
with open(data_dir+"/dummy_data.json",'w') as file:
    json.dump(processed_sample_list,file,indent=2)

In [34]:
#Config json format 