# Importing the required modules

In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import spacy
import re
import warnings 
warnings.filterwarnings('ignore')
import random

In [29]:
pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
     -------------------------------------- 79.7/79.7 kB 221.8 kB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting torchvision
  Downloading torchvision-0.12.0-cp39-cp39-win_amd64.whl (1.0 MB)
     ---------------------------------------- 1.0/1.0 MB 1.0 MB/s eta 0:00:00
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py): started
  Building wheel for sentence-transformers (setup.py): finished with status 'done'
  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.0-py3-none-any.whl size=120735 sha256=f61beb28f0900e4ed36e348db670a579326c3b2907b52ff1135e3209ac251875Note: you may need to restart the kernel to use updated packages.

  Stored in directory: c:\users\mitug\appdata\local\pip\cache\wheels\2b\11\3b\32a18fb9f2253b25d3d1a06f0a84

# Question Input

In [2]:
# Read the question to analyse from standard input (the cell blocks until a
# line is entered); later cells refer to it as `qs`.
qs = input()

Is there a train connection between Delhi and Mumbai?


# Intent Classification

In [3]:
# Load the example questions for every intent. The context manager closes the
# file handle as soon as parsing finishes (the bare open() never closed it),
# and the encoding is explicit because JSON text is UTF-8 regardless of the
# platform's default locale encoding.
with open(r"..\data\intent_classification_data.json", encoding="utf-8") as file:
    data = json.load(file)

In [4]:
# Display the loaded mapping of intent name -> list of example questions.
data

{'TrainCheck': ['Is X the train number of Y?', 'Does X have train number Y?'],
 'RouteCheck': ['Are X and Y connected by rail?',
  'Is there a train connecting X and Y?']}

In [5]:
# Maps each intent name to its mean similarity with the user's question;
# filled in by the scoring loop further down.
intent_similiarity = {}

In [10]:
# Loaded spaCy pipelines, keyed by model name, so the model is read from disk
# once instead of on every comparison.
_SPACY_PIPELINE_CACHE = {}


def question_similiarity(sentence, question, nlp=None):
    """Return spaCy's similarity score between two pieces of text.

    Parameters
    ----------
    sentence : str
        First text to compare.
    question : str
        Second text to compare.
    nlp : callable, optional
        A loaded spaCy pipeline (any callable turning text into an object
        with a ``similarity`` method). When omitted, ``en_core_web_sm`` is
        loaded on first use and reused on later calls.

    Returns
    -------
    float
        The value of ``Doc.similarity`` for the two parsed texts.

    Notes
    -----
    NOTE(review): spaCy's small models ship without static word vectors, so
    this score is only a rough signal; a ``_md``/``_lg`` model is recommended
    by the spaCy docs for similarity work -- confirm before relying on it.
    """
    if nlp is None:
        nlp = _SPACY_PIPELINE_CACHE.get("en_core_web_sm")
        if nlp is None:
            # Loading is by far the slowest step; do it a single time.
            nlp = spacy.load("en_core_web_sm")
            _SPACY_PIPELINE_CACHE["en_core_web_sm"] = nlp
    return nlp(sentence).similarity(nlp(question))

In [11]:
# Score every intent by the average similarity between the user's question
# and each of that intent's example questions, then show all scores.
for intent_name, example_questions in data.items():
    example_scores = [
        question_similiarity(example, qs) for example in example_questions
    ]
    intent_similiarity[intent_name] = np.mean(example_scores)
print(intent_similiarity)

{'TrainCheck': 0.5426949245536091, 'RouteCheck': 0.702159006316426}


# Entity Recognition

## Preparing training data

### Reading the dataset

In [90]:
# Raw string: in a normal literal "\d" and "\A" are invalid escape sequences
# (a warning today, an error in future Python versions). The resulting path is
# unchanged and now matches the raw-string paths used by the other cells.
trains = pd.read_csv(r"..\data\All_Indian_Trains.csv")

In [91]:
# Preview the first rows of the freshly loaded table.
trains.head()

Unnamed: 0.1,Unnamed: 0,Train no.,Train name,Starts,Ends
0,0,12723,Andhra Pradesh Express,Hyderabad Decan,New Delhi
1,1,22416,Andhra Pradesh Express,New Delhi,Vishakapatnam
2,2,12724,Andhra Pradesh Express,New Delhi,Hyderabad Decan
3,3,12707,Andhra Pradesh Sampark Kranti,Tirupati,H Nizamuddin
4,4,15909,Abadh Assam Express,New Tinsukia Junction,Darbhanga Junction


In [92]:
# Remove every leftover "Unnamed: ..." column (index artifacts from earlier
# CSV round-trips). Selecting them by prefix keeps this cell safe to re-run:
# once they are gone the list is empty and drop() is a no-op, whereas dropping
# the single hard-coded label raised KeyError on a second run.
unnamed_columns = [col for col in trains.columns if str(col).startswith("Unnamed")]
trains = trains.drop(columns=unnamed_columns)

In [93]:
# Preview the table again to check the result of the column drop.
trains.head()

Unnamed: 0,Train no.,Train name,Starts,Ends
0,12723,Andhra Pradesh Express,Hyderabad Decan,New Delhi
1,22416,Andhra Pradesh Express,New Delhi,Vishakapatnam
2,12724,Andhra Pradesh Express,New Delhi,Hyderabad Decan
3,12707,Andhra Pradesh Sampark Kranti,Tirupati,H Nizamuddin
4,15909,Abadh Assam Express,New Tinsukia Junction,Darbhanga Junction


### Segregating different forms of questions

In [94]:
# Pick the individual question templates out of the intent data. Per the
# `data` displayed earlier, "X"/"Y" are placeholders:
#   num2name    -> 'Is X the train number of Y?'   (X = number, Y = name)
#   name2number -> 'Does X have train number Y?'   (X = name,   Y = number)
# The positions rely on the order of the list stored in the JSON file.
train_check_templates = data["TrainCheck"]
num2name = train_check_templates[0]
name2number = train_check_templates[1]
route_check = data["RouteCheck"]

### All the available unique entities in the dataset

In [95]:
# De-duplicate each entity column through a set; the resulting list order is
# arbitrary. Stations are pooled from both the origin and destination columns.
all_train_nos = list({*trains["Train no."]})
all_train_names = list({*trains["Train name"]})
all_stations = list({*trains["Starts"], *trains["Ends"]})

### Data Augmentation

In [96]:
# Accumulates (sentence, {"entities": [...]}) examples appended by the
# augmentation cells below.
training_data = list()

#### Randomly sampling 20 entries of train numbers and names

In [102]:
# Draw 20 distinct train numbers without replacement. No random seed is set
# in the visible cells, so the selection differs between runs.
train_nos_sampled = random.sample(all_train_nos,20)

In [103]:
# Draw 20 distinct train names without replacement. This draw is independent
# of the train-number sample, so numbers and names are not matched pairs.
train_names_sampled = random.sample(all_train_names,20)

#### Creating training data for TrainCheck

In [106]:
# Build one training example per sampled (number, name) pair from the
# 'number -> name' template: X is replaced by the number, Y by the name.
# NOTE(review): getFirstMatch is not defined in the visible cells; it is
# presumably (value, text, label) -> entity span -- confirm.
for train_no, train_name in zip(train_nos_sampled, train_names_sampled):
    number_text = str(train_no)
    prepared_str = num2name.replace("X", number_text).replace("Y", train_name)
    entities = [
        getFirstMatch(number_text, prepared_str, "CARDINAL"),
        getFirstMatch(train_name, prepared_str, "FAC"),
    ]
    training_data.append((prepared_str, {"entities": entities}))

In [109]:
# Build one training example per sampled (number, name) pair from the
# 'name -> number' template: here Y is replaced by the number, X by the name.
# NOTE(review): getFirstMatch is not defined in the visible cells; it is
# presumably (value, text, label) -> entity span -- confirm.
for train_no, train_name in zip(train_nos_sampled, train_names_sampled):
    number_text = str(train_no)
    prepared_str = name2number.replace("Y", number_text).replace("X", train_name)
    entities = [
        getFirstMatch(number_text, prepared_str, "CARDINAL"),
        getFirstMatch(train_name, prepared_str, "FAC"),
    ]
    training_data.append((prepared_str, {"entities": entities}))

#### Creating training data for RouteCheck

In [112]:
# Generate 10 rounds of RouteCheck examples, one per template per round.
#
# Fixes relative to the previous version of this cell:
#   * the inner loop variable is no longer called `qs`, so it does not
#     overwrite the user's question read at the top of the notebook;
#   * both placeholders are filled in a single pass, so a first station whose
#     name contains a capital "Y" is no longer mangled by the second
#     substitution;
#   * the two stations are drawn without replacement, so a question never
#     connects a station to itself.
#
# NOTE(review): getFirstMatch is not defined in the visible cells. If it
# returns the first occurrence of the value, a second station that is a
# substring of the first (e.g. "Delhi" inside "New Delhi") may still yield an
# overlapping span -- confirm against its definition.
for _round in range(10):
    for template in route_check:
        station1, station2 = random.sample(all_stations, 2)
        placeholder_values = {"X": station1, "Y": station2}
        prepared_str = re.sub(
            r"[XY]",
            lambda match: placeholder_values[match.group()],
            template,
        )
        entities = [
            getFirstMatch(station1, prepared_str, "GPE"),
            getFirstMatch(station2, prepared_str, "GPE"),
        ]
        training_data.append((prepared_str, {"entities": entities}))

#### Storing the generated training data in a JSON document

In [116]:
# Persist the generated examples under a single "training_data" key.
# (json.dump writes Python tuples as JSON arrays.)
ner_payload = {"training_data": training_data}
with open(r'..\data\ner_training_data.json', 'w') as out_file:
    json.dump(ner_payload, out_file)

## Fine-tuning the NER pipeline

In [12]:
# Load the small English spaCy pipeline and keep a handle on its "ner"
# component, which the next cell extends with the labels used in training_data.
nlp = spacy.load("en_core_web_sm")
ner = nlp.get_pipe("ner")

In [None]:
# Register with the NER component every label that occurs in the generated
# examples; the label is read from position 2 of each entity entry.
for _text, annotation in training_data:
    entity_spans = annotation.get("entities")
    for span in entity_spans:
        ner.add_label(span[2])

# Question Answering