In [6]:
import torch
import torch.nn as nn
import spacy
!python -m spacy download en_core_web_md 

#Feet Forward Neural Network
class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size , num_classes):
        super(NeuralNet, self).__init__()
        self.l1 = nn.Linear(input_size, hidden_size) #layers of the NN
        self.l2 = nn.Linear(hidden_size, hidden_size)
        self.l3 = nn.Linear(hidden_size, num_classes)
        self.relu = nn.ReLU()#Activation Function 
    def forward(self, x):
        out = self.l1(x)
        out = self.relu(out)
        out = self.l2(out)
        out = self.relu(out)
        out = self.l3(out)
        #  no activation and no softmax

        return out

Collecting en-core-web-md==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.5.0/en_core_web_md-3.5.0-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [7]:
import nltk
import re
from nltk.tokenize import regexp_tokenize
def fixLengthening(word):
    fix = re.compile(r"(.)\1{2,}")
    return fix.sub(r"\1\1", word)
sentence = 'hi , i am fernando'
sentence = nltk.word_tokenize(sentence)
#sentence = regexp_tokenize(sentence, pattern="\w+")
print(sentence)

sentence = [fixLengthening(word) for word in sentence]
print(sentence)

['hi', ',', 'i', 'am', 'fernando']
['hi', ',', 'i', 'am', 'fernando']


In [8]:
"""Natural language tool kit is the pyhton libary used for the natural language processing"""
from nltk.tokenize import regexp_tokenize
nltk.download('punkt')
from nltk.stem.porter import PorterStemmer
import warnings
warnings.filterwarnings("ignore")
# importing the library  
import language_tool_python   
# creating the tool  
my_tool = language_tool_python.LanguageTool('en-US') 
import numpy as np
stemmer = PorterStemmer()
def tokenize(sentence):
    return regexp_tokenize(sentence, pattern="\w+")
def stem(word):
    word = word.lower()
    word = re.sub(r'\d+', '', word)
    return stemmer.stem(word)
def bag_of_words(tokenized_sentence,all_words):
    tokenized_sentence = [my_tool.correct(word) for word in tokenized_sentence]
    tokenized_sentence = [stem(w) for w in tokenized_sentence]
    
    bag = np.zeros(len(all_words),dtype=np.float32)

    for idx,w in enumerate(all_words):
        if w in tokenized_sentence:
            bag[idx] =1.0
    return bag

sentence = ["hello","how","are","you"]
words = ["hi","hello","I","you","bye","thank","cool"]
bag = bag_of_words(sentence,words)
# print(bag)



# words = ["organize", "organizes", "organizing"]
# words = [stem(w) for w in words]
# print(words)






[nltk_data] Downloading package punkt to /Users/mac/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [15]:
import json
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from nltk.corpus import stopwords
nltk.download('stopwords')

with open('/Users/mac/Desktop/SCIENTIFIC RESEARCH/QA.json','r') as f:
    intents = json.load(f)

all_words =  []

tags = []

xy = [] #will hold both the pattern and then tags 

for intent in intents['intents']:
    tag = intent['tag']
    tags.append(tag)
    for pattern in intent['patterns']:
        w = tokenize(pattern)
        all_words.extend(w)
        xy.append((w, tag))# here we are appending the word with its tag
#ignore_words = ['?','!','.',',']
ignore_words = stopwords.words('english')
all_words = [stem(w) for w in all_words if w not in ignore_words]

"""
for w in all_words:
    if w not in ignore_words:
        all_words.append(stem(w))
""" 
all_words = sorted(set(all_words))
tags = sorted(set(tags))
# print(tags)

X_train = []
y_train = []
for (pattern_sentence, tag) in xy:
    # X: bag of words for each pattern_sentence
    bag = bag_of_words(pattern_sentence, all_words)
    X_train.append(bag)
    # y: PyTorch CrossEntropyLoss needs only class labels, not one-hot
    label = tags.index(tag)
    y_train.append(label)

X_train = np.array(X_train)
y_train = np.array(y_train)




# print(x_train)
class ChatDataset(Dataset):
    def __init__(self):
        self.n_samples = len(X_train)
        self.x_data = X_train
        self.y_data = y_train
    #dataset[idx]
    def __getitem__(self,index):
        return self.x_data[index], self.y_data[index]
    
    def __len__(self):
        return self.n_samples
    
#Hyperparameters
batch_size = 64
hidden_size = 1024
output_size = len(tags)
input_size = len(X_train[0])
# print(input_size , len(all_words))
# print(output_size, tags)

learning_rate = 0.001
num_epochs = 1000



dataset = ChatDataset()
train_loader = DataLoader(dataset=dataset , batch_size=batch_size, shuffle=True, num_workers = 0)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = NeuralNet(input_size, hidden_size, output_size).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
for epoch in range(num_epochs):
    for (words, labels) in train_loader:
        words = words.to(device)
        labels = labels.to(dtype=torch.long).to(device)
        
        # Forward pass
        outputs = model(words)
        # if y would be one-hot, we must apply
        # labels = torch.max(labels, 1)[1]
        loss = criterion(outputs, labels)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    if (epoch+1) % 100 == 0:
        print (f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


print(f'final loss: {loss.item():.4f}')


data = {
"model_state": model.state_dict(),
"input_size": input_size,
"hidden_size": hidden_size,
"output_size": output_size,
"all_words": all_words,
"tags": tags
}

FILE = "data.pth"
torch.save(data, FILE)

print(f'training complete. file saved to {FILE}')


[nltk_data] Downloading package stopwords to /Users/mac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch [100/1000], Loss: 0.0006
Epoch [200/1000], Loss: 0.0002
Epoch [300/1000], Loss: 0.0001
Epoch [400/1000], Loss: 0.0001
Epoch [500/1000], Loss: 0.0000
Epoch [600/1000], Loss: 0.0000
Epoch [700/1000], Loss: 0.0000
Epoch [800/1000], Loss: 0.0000
Epoch [900/1000], Loss: 0.0000
Epoch [1000/1000], Loss: 0.0000
final loss: 0.0000
training complete. file saved to data.pth


In [16]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()

Document1 = "sex am fer"

Document2 = "you am not"

document3 = "i am sex and fer"

document4 = "do u like fer"

corpus = [Document1,Document2,document3,document4]

X_train_counts = count_vect.fit_transform(corpus)

vectorizer = TfidfVectorizer()

trsfm=vectorizer.fit_transform(corpus)
pd.DataFrame(trsfm.toarray(),columns=vectorizer.get_feature_names_out(),index=['Document 1','Document 2','Document 3','Document 4'])

percent = cosine_similarity(trsfm[0:1], trsfm)
print(corpus)
print(percent)
print(corpus[np.where(percent[0] == max(percent[0][1:]))[0][0]])


['sex am fer', 'you am not', 'i am sex and fer', 'do u like fer']
[[1.         0.21908734 0.76782851 0.21908734]]
i am sex and fer


In [17]:
# load the language model
nlp = spacy.load('en_core_web_md')

word1 = 'i am'
word2 = 'you are'
word3 = 'he are'

# convert the strings to spaCy Token objects
token1 = nlp(word1)
token2 = nlp(word2)
token3= nlp(word3)
tokens = [token2,token3]
token = lambda word: nlp(word)[0]  # shortcut to convert string to spacy.Token
score_words = lambda w1, w2: token(w1).similarity(token(w2))

In [18]:
import random
import json

#----------------

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

with open('/Users/mac/Desktop/SCIENTIFIC RESEARCH/QA.json', 'r') as json_data:
    intents = json.load(json_data)

FILE = "data.pth"
data = torch.load(FILE)

input_size = data["input_size"]
hidden_size = data["hidden_size"]
output_size = data["output_size"]
all_words = data['all_words']
tags = data['tags']
model_state = data["model_state"]

model = NeuralNet(input_size, hidden_size, output_size).to(device)
model.load_state_dict(model_state)
model.eval()

bot_name = "Fernando"
print("Hi! I am Fernando")
print("I am here to help you feel free to chat with me:-")
print("Let's chat! (type 'quit' to exit)")
print()

with open('/Users/mac/Desktop/SCIENTIFIC RESEARCH/main QA.json', 'r') as json_data:
    main_intents = json.load(json_data)
corpse = []
responses = []
for intent in main_intents['intents']:
    tag = intent['tag']
    response = intent['responses']
    print(tag+"\n")
    corpse.append(tag)# here we are appending the word with its tag
    responses.append(response)

Hi! I am Fernando
I am here to help you feel free to chat with me:-
Let's chat! (type 'quit' to exit)

requirement

scholarships

batchmates

language

research topics

internship



In [22]:
while True:
    
    # sentence = "do you use credit cards?"
    #sentence = input("You: ")
    # given text  
    #----
    sentence = input("You: ")
#student = tokenize(student)
#student = [stem(w) for w in student if w not in ignore_words]
#student = sorted(set(student))
    if(any(sentence.lower()==item.lower() for item in ["quit","finish","over","bye","goodbye","see you later"])):
        print(f"{bot_name}: Goodbye , have a nice day")
        break
    
    similarity = []
    for i in corpse:
        similarity.append(score_words(sentence,i))
    #print(similarity)
    if(max(similarity) > 0.5 and len(tokenize(sentence))==1 ):
        print(f"{bot_name}: "+responses[similarity.index(max(similarity))][0])
    else:
        
        sentence = my_tool.correct(sentence)  
        #------

        sentence = tokenize(sentence)
        X = bag_of_words(sentence, all_words)
        X = X.reshape(1, X.shape[0])
        X = torch.from_numpy(X).to(device)

        output = model(X)
        _, predicted = torch.max(output, dim=1)

        tag = tags[predicted.item()]

        probs = torch.softmax(output, dim=1)
        prob = probs[0][predicted.item()]

        if prob.item() > 0.1:
            for intent in intents['intents']:
                if tag == intent["tag"]:
                    print(f"{bot_name}: {random.choice(intent['responses'])}")
        else:
            print(f"{bot_name}: Sorry I am unable to Process Your Request")
            print(f"{bot_name}: You may find the way forward in https://en.itmo.ru/en/viewjep/2/5/Big_Data_and_Machine_Learning.htm")

You: when is deadline
Fernando: August 11, 2023
You: before when i need to submit my document
Fernando: Your portfolio has to contain the following documents:
- professional (and academic) CV;
- bachelor's diploma with transcript of records (in case of studying in the last course transcript of records is enough);
- proof of English language proficiency (certificate or any type of official document). 
We recommend adding to your portfolio documents confirming your relevant experience: letters of recommendation , course completion certificates, winner's diplomas,  etc.
All documents are submitted in the electronic form (scanned copies).
You: what is deadline to submit documents
Fernando: August 11, 2023
You: how do i submit my paper
Fernando: You need to fill out and submit your application at https://signup.itmo.ru/master
For further information please contact the International Office via email: international@itmo.ru.
You: is it okay to work when i study ?
Fernando: Students can be empl

KeyboardInterrupt: Interrupted by user