# Library Import

In [35]:
import numpy as np
import pandas as pd
from googletrans import Translator
import Levenshtein as L
from autocorrect import Speller
from spello.model import SpellCorrectionModel
import re

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report

import speech_recognition as sr
from nltk.tokenize import word_tokenize

# Task 1: Translator
----------
Translate each word or sentence from English to Spanish, French and German 

In [None]:
df = pd.read_csv('CSV/English.csv')
df.head()

Unnamed: 0,English words/sentences
0,Hi.
1,Run!
2,Run!
3,Who?
4,Wow!


In [None]:
df.describe()

Unnamed: 0,English words/sentences
count,175621
unique,123100
top,I can't tell you how happy I am that you've co...
freq,32


In [None]:
df.isnull().sum()

English words/sentences    0
dtype: int64

In [10]:
# Taking only 100 samples as current hardware deosn't have capability to compute for 175621 samples
df1 = df.sample(n=100) 
df1

Unnamed: 0,English words/sentences
23559,How did you get in?
13200,You smell awful.
154161,I think he can get along with his neighbors.
110732,"Now, what else can I do for you?"
83359,I shouldn't have interfered.
...,...
118953,Don't think about stuff like that.
74647,He advised me not to smoke.
39089,You have been warned.
153881,"I arrived late, so I didn't hear everything."


In [11]:
translator = Translator()

df1['Spanish'] = df1['English words/sentences'].apply(translator.translate, src='en', dest='es').apply(getattr, args=('text',))
df1['French'] = df1['English words/sentences'].apply(translator.translate, src='en', dest='fr').apply(getattr, args=('text',))
df1['German'] = df1['English words/sentences'].apply(translator.translate, src='en', dest='de').apply(getattr, args=('text',))

In [12]:
df1

Unnamed: 0,English words/sentences,Spanish,French,German
23559,How did you get in?,¿Cómo entraste?,Comment êtes-vous entré?,Wie bist du reingekommen?
13200,You smell awful.,Hueles horrible.,Vous sentez mauvais.,Du riechst schrecklich.
154161,I think he can get along with his neighbors.,Creo que puede llevarse bien con sus vecinos.,Je pense qu'il peut s'entendre avec ses voisins.,"Ich denke, er kommt mit seinen Nachbarn zurecht."
110732,"Now, what else can I do for you?","Ahora, ¿qué más puedo hacer por ti?","Maintenant, que puis-je faire d'autre pour vous ?",Was kann ich sonst noch für Sie tun?
83359,I shouldn't have interfered.,No debería haber interferido.,Je n'aurais pas dû intervenir.,Ich hätte mich nicht einmischen sollen.
...,...,...,...,...
118953,Don't think about stuff like that.,No pienses en cosas así.,Ne pense pas à des trucs comme ça.,Denk nicht an solche Sachen.
74647,He advised me not to smoke.,Me aconsejó que no fumara.,Il m'a conseillé de ne pas fumer.,"Er riet mir, nicht zu rauchen."
39089,You have been warned.,Usted ha sido advertido.,Tu étais prévenu.,Du wurdest gewarnt.
153881,"I arrived late, so I didn't hear everything.","Llegué tarde, así que no escuché todo.","Je suis arrivé en retard, donc je n'ai pas tou...","Ich kam spät an, also habe ich nicht alles geh..."


# Task 2: Spell Correction
--------
Program to correct the spelling from the word or a given sentence.

In [2]:
# Reading 50 Misspelled words
df2 = pd.read_csv('CSV/aspell.txt', delimiter=':', header=None)
df2 = df2.iloc[:50, :]
# Reading 50 Misspelled sentences
df3 = pd.read_csv('CSV/aspell1.csv', delimiter=',', header=None)
df3 = df3.iloc[:50, :]
df2 = df2.append(df3, ignore_index=True)
df2.rename(columns={0:'Original', 1:'Misspelled'}, inplace=True)
df2

  df2 = df2.append(df3, ignore_index=True)


Unnamed: 0,Original,Misspelled
0,Nevada,nevade
1,Presbyterian,presbyterian
2,RSX,rsx
3,Stephen,Steffen
4,Susan,susan
...,...,...
95,moree attack prompts police warning,ormee attack prompts police warning
96,receding floodwater leaves layer of sludge,receding fKokdwater lSaCes layer of sludge
97,home construction figures fall abs statistics,home construction fiugrse flal abs statistics
98,nyiaparli people claim fortescue forced them to,nyia)Wrli people clA(m foDtesFue forced them to


## Using AutoCorrect - Speller

In [None]:
spell = Speller(lang='en')
df2['Corrected_Speller'] = df2['Misspelled'].apply(spell)

### Levenshtein Distance

In [4]:
df2['L_Corrected_Speller'] = df2.apply(lambda x: L.ratio(x['Corrected_Speller'], x['Original']), axis=1)
df2

Unnamed: 0,Original,Misspelled,Corrected_Speller,L_Corrected_Speller
0,Nevada,nevade,evade,0.666667
1,Presbyterian,presbyterian,presbyterian,0.880000
2,RSX,rsx,rs,0.000000
3,Stephen,Steffen,Stephen,0.933333
4,Susan,susan,susan,0.727273
...,...,...,...,...
95,moree attack prompts police warning,ormee attack prompts police warning,free attack prompts police warning,0.956522
96,receding floodwater leaves layer of sludge,receding fKokdwater lSaCes layer of sludge,preceding fKokdwater leaves layer of sludge,0.941176
97,home construction figures fall abs statistics,home construction fiugrse flal abs statistics,home construction figure flag abs statistics,0.943820
98,nyiaparli people claim fortescue forced them to,nyia)Wrli people clA(m foDtesFue forced them to,nia)Wali people cl(m foDtesFue forced them to,0.869565


## Using Spello - SpellCorrectionModel

In [5]:
with open('CSV/big.txt', 'r') as f:
    big = f.readlines()
big = [i.strip() for i in big]

In [6]:
# Remove tab (\t)
big = [re.sub('\\t', ' ', text) for text in big]
# Remove \\
big = [re.sub("\\'", ' ', text) for text in big]
# Remove ''
big = [text for text in big if text != '']
# Remove special characters
big = [re.sub(r'[^a-zA-Z]+', ' ', text) for text in big]
# Remove leading and trailing spaces
big = [text.strip() for text in big]

### Train Model

In [7]:
sp = SpellCorrectionModel(language='en')
sp.train(big)

Spello training started..
Context model training started ...
Symspell training started ...
Phoneme training started ...
Spello training completed successfully ...


In [None]:
df2['Spello_Corrected'] = df2['Misspelled'].apply(sp.spell_correct)
x = [x['spell_corrected_text'] for x in df2.Spello_Corrected]
df2.Spello_Corrected = x

### Levenshtein Distance

In [18]:
df2['L_Spello_Corrected'] = df2.apply(lambda x: L.ratio(x['Spello_Corrected'], x['Original']), axis=1)
df2

Unnamed: 0,Original,Misspelled,Corrected_Speller,L_Corrected_Speller,Spello_Corrected,L_Spello_Corrected
0,Nevada,nevade,evade,0.666667,nevada,0.833333
1,Presbyterian,presbyterian,presbyterian,0.880000,presbyterians,0.880000
2,RSX,rsx,rs,0.000000,rise,0.000000
3,Stephen,Steffen,Stephen,0.933333,stiffen,0.428571
4,Susan,susan,susan,0.727273,susan,0.800000
...,...,...,...,...,...,...
95,moree attack prompts police warning,ormee attack prompts police warning,free attack prompts police warning,0.956522,armee attack prompt police warning,0.927536
96,receding floodwater leaves layer of sludge,receding fKokdwater lSaCes layer of sludge,preceding fKokdwater leaves layer of sludge,0.941176,receding fKokdwater faces layer of sledge,0.867470
97,home construction figures fall abs statistics,home construction fiugrse flal abs statistics,home construction figure flag abs statistics,0.943820,home construction figures fall as statistics,0.988764
98,nyiaparli people claim fortescue forced them to,nyia)Wrli people clA(m foDtesFue forced them to,nia)Wali people cl(m foDtesFue forced them to,0.869565,nyia)Wrli people claim foDtesFue forced them to,0.914894


In [19]:
df2.L_Corrected_Speller.sum(), df2.L_Spello_Corrected.sum() # BOTH SHOW SIMILAR PERFORMACE

(84.54672358191894, 84.72160985573319)

# Task 3: Resume Filter
---------
Create an application that should be used by the HR Team to filter the resume based on the Skills.

In [20]:
df = pd.read_csv('CSV/UpdatedResumeDataSet.csv')
df.head()

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 962 entries, 0 to 961
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  962 non-null    object
 1   Resume    962 non-null    object
dtypes: object(2)
memory usage: 15.2+ KB


In [22]:
df.isnull().sum()

Category    0
Resume      0
dtype: int64

In [23]:
df.Category.value_counts()

Java Developer               84
Testing                      70
DevOps Engineer              55
Python Developer             48
Web Designing                45
HR                           44
Hadoop                       42
Blockchain                   40
ETL Developer                40
Operations Manager           40
Data Science                 40
Sales                        40
Mechanical Engineer          40
Arts                         36
Database                     33
Electrical Engineering       30
Health and fitness           30
PMO                          30
Business Analyst             28
DotNet Developer             28
Automation Testing           26
Network Security Engineer    25
SAP Developer                24
Civil Engineer               24
Advocate                     20
Name: Category, dtype: int64

## Text Pre-Processing

In [25]:
def clean_function(resumeText):
    resumeText = re.sub('http\S+\s*', ' ', resumeText)  # remove URLs
    resumeText = re.sub('RT|cc', ' ', resumeText)  # remove RT and cc
    resumeText = re.sub('#\S+', '', resumeText)  # remove hashtags
    resumeText = re.sub('@\S+', '  ', resumeText)  # remove mentions
    resumeText = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', resumeText)  # remove punctuations
    resumeText = re.sub(r'[^\x00-\x7f]',r' ', resumeText) 
    resumeText = re.sub('\s+', ' ', resumeText)  # remove extra whitespace
    return resumeText

df['Cleaned_Resume'] = df.Resume.apply(lambda x: clean_function(x))
df.Category = LabelEncoder().fit_transform(df.Category)
df.head()

Unnamed: 0,Category,Resume,Cleaned_Resume
0,6,Skills * Programming Languages: Python (pandas...,Skills Programming Languages Python pandas num...
1,6,Education Details \r\nMay 2013 to May 2017 B.E...,Education Details May 2013 to May 2017 B E UIT...
2,6,"Areas of Interest Deep Learning, Control Syste...",Areas of Interest Deep Learning Control System...
3,6,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,Skills R Python SAP HANA Tableau SAP HANA SQL ...
4,6,"Education Details \r\n MCA YMCAUST, Faridab...",Education Details MCA YMCAUST Faridabad Haryan...


## Building Model

In [27]:
X = df.Cleaned_Resume.values
Y = df.Category.values

# using TF_IDF vectorizer
tf_idf = TfidfVectorizer(sublinear_tf=True, stop_words='english')
WordFeature = tf_idf.fit_transform(X)
WordFeature.shape

(962, 7351)

In [29]:
X_train, X_test, Y_train , Y_test = train_test_split(WordFeature, Y, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape)

(769, 7351) (193, 7351)


In [32]:
models = {
    'K-Nearest Neighbors' : KNeighborsClassifier(),
    'Logistic Regression' : LogisticRegression(),
    'Support Vector Machine' : SVC(),
    'Random Forest' : RandomForestClassifier()    
}

for k, v in models.items():
    v.fit(X_train, Y_train)
    print(k, 'trained')
print('all trained')

K-Nearest Neighbors trained
Logistic Regression trained
Support Vector Machine trained
Random Forest trained
all trained


In [36]:
for k, v in models.items():
    print('Accuracy of', k, 'for training data: ', v.score(X_train, Y_train))
    print('Accuracy of', k, 'for test data: ', v.score(X_test, Y_test))
    print('-'*100)

Accuracy of K-Nearest Neighbors for training data:  0.9804941482444733
Accuracy of K-Nearest Neighbors for test data:  0.9792746113989638
----------------------------------------------------------------------------------------------------
Accuracy of Logistic Regression for training data:  1.0
Accuracy of Logistic Regression for test data:  0.9948186528497409
----------------------------------------------------------------------------------------------------
Accuracy of Support Vector Machine for training data:  1.0
Accuracy of Support Vector Machine for test data:  0.9948186528497409
----------------------------------------------------------------------------------------------------
Accuracy of Random Forest for training data:  1.0
Accuracy of Random Forest for test data:  0.9844559585492227
----------------------------------------------------------------------------------------------------


In [38]:
for k, v in models.items():
    print('Classification Report of', k, 'for training data: \n', 
                                            classification_report(Y_test, v.predict(X_test)))
    print('-'*100)

Classification Report of K-Nearest Neighbors for training data: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       1.00      1.00      1.00         6
           2       1.00      1.00      1.00         5
           3       1.00      1.00      1.00         7
           4       1.00      1.00      1.00         4
           5       1.00      1.00      1.00         9
           6       0.75      0.60      0.67         5
           7       1.00      0.88      0.93         8
           8       1.00      0.93      0.96        14
           9       1.00      1.00      1.00         5
          10       1.00      1.00      1.00         7
          11       1.00      1.00      1.00         6
          12       1.00      1.00      1.00        12
          13       1.00      1.00      1.00         4
          14       1.00      1.00      1.00         7
          15       1.00      1.00      1.00        15
          16   

## Cross Validation for Models

In [46]:
sp = StratifiedKFold(n_splits=10)
result = {}
for k, v in models.items():
    score = cross_val_score(v, X_train, Y_train, cv=sp,  scoring='accuracy')
    result[k] = score

In [48]:
for k, v in result.items():
    print('F1 Score for ', k, 'is: ', v.mean())
    print('-'*100)

F1 Score for  K-Nearest Neighbors is:  0.9662166780587832
----------------------------------------------------------------------------------------------------
F1 Score for  Logistic Regression is:  0.990909090909091
----------------------------------------------------------------------------------------------------
F1 Score for  Support Vector Machine is:  0.9921907040328092
----------------------------------------------------------------------------------------------------
F1 Score for  Random Forest is:  0.9960868079289131
----------------------------------------------------------------------------------------------------


# Task 4: Chatbot
---------
Create a chatbot for Hotel Management to Book Rooms
1. Details collected from : Customer Name, Mobile Number, Address, ID proof, and Room Type and date of arrival and departure date. Keep some eligibility to Book the Room .
2. All through voice to text classifications

In [53]:
df = pd.DataFrame(columns = ['Room Type', 'Customer Name', 'Mobile Number', 'Address', 'ID proof', 
                             'Date of arrival', 'Departure date'])
def voice_recog():
    r = sr.Recognizer()
    audio = []
    text = ''
    flag = True
    while (flag != False):
        with sr.Microphone() as source:
            print('Speak...')
            audio = r.listen(source)
            
            try:
                text = r.recognize_google(audio)
                print('You Said: ', text)
                flag = False
                return text
            except:
                print('Sorry could not recognize! Try Again.')

In [54]:
questions = ["\n2. What's your Name ?", "\n3. What's your Mobile Number ?", "\n4. What's your Address ?",
             "\n5. What's your ID Proof ?", "\n6. What's your Date of Arrival ?",
             "\n7. What's your Date of Departure ?"]

count_DoubleBedRoom = 2
count_SingleBedRoom = 2

def availability(text):
    global count_DoubleBedRoom
    global count_SingleBedRoom
    tokens = word_tokenize(text)
    DoubleRoom = ['double', 'two']
    SingleRoom = ['single', 'one']
    avail = True
    for token in tokens:
        if token in DoubleRoom:
            if count_DoubleBedRoom != 0:
                count_DoubleBedRoom -= 1
                return avail
            else:
                avail = False
                return avail
            break
        elif token in SingleRoom:
            if count_SingleBedRoom != 0:
                count_SingleBedRoom -= 1
                return avail
            else:
                avail = False
                return avail
            break

In [55]:
print('**** Welcome to Chetty Hotel ****\n Answer these questions to Book Room')
i = 0
while (i != -1):
    Customer = []
    print("\n1. What's your Room Type ?")
    text = voice_recog()
    avail = availability(text)
    if avail:
        print('\nRoom Available, continue...')
        Customer.append(text)
        for i in questions:
            print(i)
            Customer.append(voice_recog())
        df.loc[len(df.index)] = Customer
    else:
        print('Room Type you need is not available. Try Again.')
    x = input('Would you like to book room ? (y/n)')
    if x == 'n':
        i = -1

**** Welcome to Chetty Hotel ****
 Answer these questions to Book Room

1. What's your Room Type ?
Speak...
You Said:  single room

Room Available, continue...

2. What's your Name ?
Speak...
You Said:  Jayaraman

3. What's your Mobile Number ?
Speak...
You Said:  934 272 9091

4. What's your Address ?
Speak...
You Said:  Subhash Chandra Bose Street

5. What's your ID Proof ?
Speak...
You Said:  driving licence

6. What's your Date of Arrival ?
Speak...
You Said:  10th April 2016

7. What's your Date of Departure ?
Speak...
You Said:  25th April 2017

1. What's your Room Type ?
Speak...
You Said:  double double

Room Available, continue...

2. What's your Name ?
Speak...
You Said:  Krishna

3. What's your Mobile Number ?
Speak...
You Said:  1 2 3 4 5 6 7 8 9

4. What's your Address ?
Speak...
You Said:  200 Plus

5. What's your ID Proof ?
Speak...
You Said:  voter ID

6. What's your Date of Arrival ?
Speak...
You Said:  25th April 2017

7. What's your Date of Departure ?
Speak...
You S

In [56]:
df

Unnamed: 0,Room Type,Customer Name,Mobile Number,Address,ID proof,Date of arrival,Departure date
0,single room,Jayaraman,934 272 9091,Subhash Chandra Bose Street,driving licence,10th April 2016,25th April 2017
1,double double,Krishna,1 2 3 4 5 6 7 8 9,200 Plus,voter ID,25th April 2017,10th may 2017
2,single room,Kripa Karen,54768 92,Kanyakumari,driving licence,25th may 2016,28 may 2016
