# NGB ChatBot

## Setup

In [71]:
# Mount Google Drive under /content/mnt/ so the project files are reachable from this runtime.
# force_remount=True is passed so the mount is redone even when one already exists.
from google.colab import drive
drive.mount('/content/mnt/', force_remount=True)

Mounted at /content/mnt/


In [165]:
# Project folder on the mounted Drive; the dataset and the spaCy model archive are read from here.
ngb_path='/content/mnt/My Drive/ngb'

In [166]:
# Sanity check: print every entry found in the project folder.
from os import listdir
for i in listdir(ngb_path): print(i)

ngb_data.txt
en_core_web_lg-2.3.1.tar.gz
ngb_data.csv


## Libraries required

In [74]:
# pip install wmd

In [75]:
# pip install -U spacy

In [121]:
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
import re
import unicodedata
from collections import Counter
import tarfile
import nltk
import spacy
import wmd

In [None]:
# Unpack the gzip-compressed spaCy model archive into the current working directory.
# NOTE(review): extractall() writes whatever member paths the archive contains;
# only run this on an archive from a trusted source.
with tarfile.open(f'{ngb_path}/en_core_web_lg-2.3.1.tar.gz', 'r:gz') as tref:
    tref.extractall()

In [239]:
# Load the spaCy pipeline from a path relative to the working directory
# (presumably the folder produced by the extraction cell above).
nlp = spacy.load('en_core_web_lg-2.3.1/en_core_web_lg/en_core_web_lg-2.3.1')
# nlp.add_pipe(wmd.WMD.SpacySimilarityHook(nlp), last=True)
# Default stop-word collection of the loaded language; used by remove_stopwords().
stop_words = nlp.Defaults.stop_words

## Data Preprocessing

In [167]:
# Read the dataset CSV from the project folder as UTF-8.
df = pd.read_csv(f'{ngb_path}/ngb_data.csv', encoding='utf8')

In [168]:
# Keep the raw rows as a NumPy array; the final answer is printed from this array later on.
df_numpy = df.to_numpy()
# First column of every row, converted to str, lower-cased and NFKD-normalised.
data = [unicodedata.normalize("NFKD", str(doc[0]).lower()) for doc in df_numpy]
# Preview the first five normalised documents.
for doc in data[:5]: print(doc)

ngb living is based in berlin and offers a constantly growing number of residential flats with fully furnished rooms to rent in prime super city locations in berlin.
ngb living official mail id is info@ngb-living.de
if the front door of the building is damaged or broken please take it pictures and raise the ticket immediately on yourplace app.
if elevator or lift is not working than contact the property management. this issue is addressed by the property management.
if doorbell is not working then please raise the ticket on yourplace app.


In [80]:
def remove_stopwords(corpus):
    """Drop stop words and single-character tokens from every document.

    Each document is tokenised with the module-level spaCy pipeline ``nlp``.
    A token is kept when its text is not in ``stop_words`` and it is longer
    than one character; kept tokens are re-joined with single spaces.

    Args:
        corpus: iterable of document strings.

    Returns:
        A new list with one filtered string per input document, in the same
        order. The input is not modified (the previous implementation
        overwrote the caller's list in place).
    """
    cleaned = []
    for doc in corpus:
        kept = [
            token.text
            for token in nlp(doc)
            if token.text not in stop_words and len(token.text) > 1
        ]
        # strip() removes leading/trailing whitespace that a kept
        # whitespace-only token (e.g. a run of newlines) would leave behind.
        cleaned.append(' '.join(kept).strip())

    return cleaned

In [198]:
def remove_punctuations(corpus):
    """Replace punctuation with spaces and normalise the brand name.

    For every document:
      1. every character that is neither a word character nor whitespace
         becomes a single space;
      2. each run of newlines (with any whitespace directly before it) is
         collapsed to one space;
      3. ``ngb`` followed by optional whitespace and ``living`` is merged into
         the single token ``ngbliving``;
      4. leading and trailing whitespace is stripped.

    Args:
        corpus: iterable of document strings.

    Returns:
        A new list of cleaned strings in the same order. The input is not
        modified (the previous implementation overwrote it in place).
    """
    cleaned = []
    for doc in corpus:
        sent = re.sub(r'[^\w\s]', ' ', doc)
        # Raw strings: '\s' in a plain literal is an invalid escape sequence.
        sent = re.sub(r'\s*\n+', ' ', sent)
        sent = re.sub(r'ngb\s*living', 'ngbliving', sent)
        cleaned.append(sent.strip())

    return cleaned

In [199]:
def lemmatize(corpus):
    """Replace every token by its lemma, dropping very short tokens.

    Each document is parsed with the module-level spaCy pipeline ``nlp``.
    Whitespace-only tokens and tokens shorter than three characters are
    skipped; the lemmas of the remaining tokens are joined with single spaces.

    Args:
        corpus: iterable of document strings.

    Returns:
        A new list of lemmatised strings in the same order. The input is not
        modified (the previous implementation overwrote it in place).
    """
    lemmatized = []
    for doc in corpus:
        lemmas = [
            token.lemma_
            for token in nlp(doc)
            if not token.text.isspace() and len(token.text) >= 3
        ]
        lemmatized.append(' '.join(lemmas).strip())

    return lemmatized

In [297]:
def preprocess(corpus):
    """Run the cleaning pipeline over a list of documents.

    The stages are applied in this order: stop-word removal, punctuation
    stripping, lemmatisation. Returns whatever the last stage returns.
    """
    for stage in (remove_stopwords, remove_punctuations, lemmatize):
        corpus = stage(corpus)

    return corpus

In [83]:
def get_vocab(data):
    """Count how often each token text occurs across all documents.

    Args:
        data: iterable of document strings; each is tokenised with ``nlp``.

    Returns:
        dict mapping token text to its total number of occurrences.
    """
    wc = {}
    for doc in data:
        for token in nlp(doc):
            word = token.text
            # dict.get replaces the former bare try/except, which would also
            # have hidden unrelated errors.
            wc[word] = wc.get(word, 0) + 1

    return wc

def get_oov(data, wc):
    """Collect tokens without a word vector, together with their counts.

    Args:
        data: iterable of document strings; each is tokenised with ``nlp``.
        wc: dict mapping token text to its count; looked up by key for every
            out-of-vocabulary token.

    Returns:
        dict mapping each vector-less token text to ``wc[token text]``.
    """
    oov = {}
    for doc in data:
        for token in nlp(doc):
            # Skip tokens that have a vector or were already recorded.
            if token.has_vector or token.text in oov:
                continue
            oov[token.text] = wc[token.text]

    return oov

In [298]:
# Preprocess a shallow copy so the raw `data` list keeps its original strings.
pdata = data[:]
pdata = preprocess(pdata)

# Show one document before and after preprocessing.
data[1], pdata[1]

('ngb living official mail id is info@ngb-living.de',
 'ngbliving official mail info ngbliving')

In [202]:
# Token counts over the preprocessed corpus, and the subset of tokens without a word vector.
wc = get_vocab(pdata)
oov = get_oov(pdata, wc)
# Sort ascending by count, then reverse to get most-frequent-first lists.
wc_top = sorted(wc.items(), key=lambda x: x[1])[::-1]
oov_top = sorted(oov.items(), key=lambda x: x[1])[::-1]

# Ten most frequent tokens.
wc_top[:10]

[('contact', 35),
 ('room', 32),
 ('problem', 23),
 ('key', 21),
 ('office', 19),
 ('contract', 19),
 ('rent', 18),
 ('ngbliving', 18),
 ('damage', 17),
 ('check', 15)]

## TF-IDF

In [203]:
# Tokenise every preprocessed document into a list of token strings, skipping whitespace-only tokens.
docs = [[token.text for token in nlp(doc) if not (token.text).isspace()] for doc in pdata]

### DF

In [204]:
# Document frequency: first gather, for every word, the set of document ids
# that contain it.
DF = {}
for i, doc in enumerate(docs):
    for word in doc:
        # setdefault replaces the former bare try/except used to create the set.
        DF.setdefault(word, set()).add(i)

# Collapse each id set to its size, i.e. the number of documents containing the word.
for i in DF: DF[i] = len(DF[i])

# The vocabulary is the list of distinct words, in first-seen order.
vocab = [w for w in DF]
print('total vocab:', len(vocab))

total vocab: 652


### TF and IDF

In [205]:
# tf-idf score for every (document id, term) pair.
#   tf  = occurrences of the term in the document / document length
#   idf = log(N / (document frequency + 1))
tf_idf = {}
N = len(docs)
for i, doc in enumerate(docs):
    counter = Counter(doc)
    for term in set(doc):
        tf = counter[term]/len(doc)
        # Named `doc_freq` rather than `df` so the DataFrame `df` loaded
        # earlier in the notebook is not overwritten by an integer.
        doc_freq = DF[term]
        idf = np.log(N/(doc_freq+1))
        tf_idf[i, term] = tf * idf

# Preview the first five entries.
for k in list(tf_idf)[:5]: print(k, tf_idf[k])

(0, 'grow') 0.1925408834888737
(0, 'constantly') 0.1925408834888737
(0, 'number') 0.1700150441495312
(0, 'furnish') 0.1925408834888737
(0, 'ngbliving') 0.07701635339554948


### Vectorization


In [206]:
# Dense (documents x vocabulary) matrix of tf-idf scores.
# Column positions are looked up in a dict built once, instead of calling
# vocab.index() (a linear scan of the vocabulary) for every score.
vocab_index = {tok: col for col, tok in enumerate(vocab)}
docs_vector = np.zeros((N, len(vocab)))
for (row, tok), weight in tf_idf.items():
    docs_vector[row][vocab_index[tok]] = weight

docs_vector.shape

(64, 652)

## GloVe (averaged document vector; tf-idf weighting currently disabled)

In [301]:
def getWeightedVec(sent, i=0, q=False, tfidf=0):
    """Return a document vector as the plain mean of its token vectors.

    The sentence is parsed with the module-level spaCy pipeline ``nlp``. Only
    tokens that have a vector and whose text is at least three characters long
    contribute to the mean.

    Args:
        sent: the (preprocessed) sentence to embed.
        i: document index. Accepted for backward compatibility; unused.
        q: whether ``sent`` is a query. Accepted for backward compatibility;
            unused.
        tfidf: query weight. Accepted for backward compatibility; unused.
            tf-idf weighting of the token vectors is not applied.

    Returns:
        The element-wise mean of the selected token vectors, or ``doc.vector``
        when no token qualifies.
    """
    doc = nlp(sent)
    vectors = [
        token.vector
        for token in doc
        if token.has_vector and len(token.text) >= 3
    ]

    # Explicit fallback: averaging an empty list does not reliably raise, so
    # the former bare `except` could let a NaN result through.
    if not vectors:
        return doc.vector

    return np.average(vectors, axis=0)

In [208]:
def getSentVectors(data):
    """Embed every sentence in ``data`` with ``getWeightedVec``.

    Returns a list with one vector per sentence, in input order; the position
    of each sentence is forwarded as the ``i`` argument.
    """
    return [getWeightedVec(sentence, position) for position, sentence in enumerate(data)]

In [302]:
# One vector per preprocessed document; get_top_responses() ranks against this list.
sent_vec = getSentVectors(pdata)
len(sent_vec)

64

## Cosine Similarity

In [93]:
def cosine_sim(a, b):
    """Cosine similarity between two 1-D vectors.

    Args:
        a, b: array-likes of equal length.

    Returns:
        ``dot(a, b) / (|a| * |b|)``. When either vector has zero norm the
        similarity is undefined; 0.0 is returned instead of NaN so that the
        result can still be sorted and compared.
    """
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        return 0.0
    return np.dot(a, b) / denom

In [94]:
# Spot check: print two raw documents and the cosine similarity of their vectors.
print(f'1. {data[1]} \n2. {data[61]}')
print('\ncosine similarity: ', cosine_sim(sent_vec[1], sent_vec[61]))

1. ngb-living official mail id is info@ngb-living.de 
2. the basic rules for living in ngb-living home are:
ventilation: keep your rooms and the common areas well ventilated.
cleaning: keep the rooms and all the common areas clean. preferably, there should be a cleaning schedule for the housemates for equal distribution of cleaning roles and that the apartment stays clean.
minor repairs like light bulbs, toilet seats, shower hose, defective door handles, sockets, kitchen cutlery, small electrical equipment etc. are to be attended by the tenants at their own costs.
remember to turn off the heaters and close the windows, especially when you are going out. exorbitant heater bills will be charged additionally.
in case of emergencies, especially during weekends and non-working hours, you should always get in touch with the contacts posted in the hallway passage after the entrance.
this also applies to power outages, water problems and heating problems.
in the unlikely event of danger to you

## Query Handler

In [176]:
def get_query_tfidf(pquery):
    """Return a scalar tf-idf score for the preprocessed query ``pquery[0]``.

    For each distinct query term, ``tf`` is its share of the query tokens and
    ``idf`` is ``log((N + 1) / (df + 1))``, where ``N`` is the number of
    corpus documents and ``df`` the term's document frequency (0 when the term
    is not in ``DF``).

    NOTE(review): the function returns one number only: the score of the last
    distinct term in order of first appearance. A per-term mapping is probably
    what was intended; the scalar is kept so existing callers are unaffected.

    Args:
        pquery: list whose first element is the preprocessed query string.

    Returns:
        The tf-idf score described above, or 0.0 when the query has no tokens
        (this case previously raised UnboundLocalError).
    """
    N = len(pdata)
    tokens = [token.text for token in nlp(pquery[0]) if not (token.text).isspace()]
    if not tokens:
        return 0.0

    # Counter keeps first-seen order, which makes the returned term
    # deterministic (iterating a set of strings is not).
    counter = Counter(tokens)
    score = 0.0
    for term, count in counter.items():
        tf = count/len(tokens)
        idf = np.log((N+1)/(DF.get(term, 0) + 1))
        score = tf * idf

    return score

### TF-IDF Query

In [177]:
def get_query_vector(pquery):
    """Build the tf-idf vector of a preprocessed query over ``vocab``.

    The previous body referenced ``term``, ``tf`` and ``idf`` without defining
    them, so it either used stale notebook globals or hit a NameError that was
    swallowed by ``except: pass``. The scores are now computed per query term
    inside the function.

    For each distinct query term found in ``vocab`` the entry is
    ``tf * log((N + 1) / (df + 1))``, with ``tf`` the term's share of the query
    tokens, ``N`` the number of corpus documents and ``df`` the term's document
    frequency. Terms outside the vocabulary are ignored.

    Args:
        pquery: list whose first element is the preprocessed query string.

    Returns:
        1-D NumPy array of length ``len(vocab)``; all zeros when the query is
        empty or shares no term with the vocabulary.
    """
    q_vector = np.zeros((len(vocab)))
    tokens = [token.text for token in nlp(pquery[0]) if not (token.text).isspace()]
    if not tokens:
        return q_vector

    n_docs = len(pdata)
    positions = {word: pos for pos, word in enumerate(vocab)}
    for word, count in Counter(tokens).items():
        pos = positions.get(word)
        if pos is None:
            continue
        term_freq = count/len(tokens)
        inv_doc_freq = np.log((n_docs+1)/(DF.get(word, 0) + 1))
        q_vector[pos] = term_freq * inv_doc_freq

    return q_vector

In [178]:
def get_top_responses(q_vector, k=5):
    """Rank the corpus documents by cosine similarity to ``q_vector``.

    Args:
        q_vector: query vector, compared against every entry of ``sent_vec``.
        k: number of results wanted; capped at ``len(pdata)``.

    Returns:
        List of the ``k`` best ``(score, document index)`` tuples, highest
        first.
    """
    k = min(k, len(pdata))
    ranked = sorted(
        ((cosine_sim(q_vector, d_vec), doc_id) for doc_id, d_vec in enumerate(sent_vec)),
        reverse=True,
    )

    return [ranked[pos] for pos in range(k)]

## Test Query

In [311]:
# Read a free-text question from the user.
print('Please input a query> ')
query = input()
# preprocess() works on a list of documents, so the query is wrapped in a one-element list.
# NOTE(review): ''.join(query) on the str returned by input() yields the same string.
pquery = preprocess([''.join(query)])
print('pquery:', pquery)
# q_tfidf is forwarded as `tfidf`; getWeightedVec does not currently apply it as a weight.
q_tfidf = get_query_tfidf(pquery)
q_vector = getWeightedVec(sent=pquery[0], i=0, tfidf=q_tfidf, q=True)
# Rank the corpus documents against the query vector; the first result is the best match.
responses = get_top_responses(q_vector)
top_doc_id = responses[0][1]

# Print the raw (un-preprocessed) text of the best match, then the top (score, index) pairs.
print(f'Answer> {df_numpy[top_doc_id][0]}\n')
print(responses)

Please input a query> 
what is the cancellation policy
pquery: ['cancellation policy']
Answer> According to the German clause , you are entitled to get 70% of the security deposit amount after 6 weeks and the remaining 30% after 6 months of departure.

[(0.68170726, 33), (0.63445646, 61), (0.6249736, 13), (0.6230175, 32), (0.61617666, 52)]


In [248]:
# res = nlp(pdata[20])
# q = nlp('room key lost')
# cosine_sim(q.vector, res.vector)

In [309]:
# Inspect the preprocessed text of the document at index 60.
pdata[60]

'basic rule live ngbliving home ventilation room common area ventilate cleaning room common area clean preferably clean schedule housemate equal distribution clean role apartment stay clean minor repair like light bulb toilet seat shower hose defective door handle socket kitchen cutlery small electrical equipment etc attend tenant cost remember turn heater close window especially go exorbitant heater bill charge additionally case emergency especially weekend non working hour touch contact post hallway passage entrance apply power outage water problem heat problem unlikely event danger person health police 110 fire brigade ambulance 112'