In [1]:
import pandas as pd

df=pd.read_csv('dialogs.txt',sep='\t',names=['User','Chatbot'])
print(df)


                                                   User  \
0                                hi, how are you doing?   
1                         i'm fine. how about yourself?   
2                   i'm pretty good. thanks for asking.   
3                     no problem. so how have you been?   
4                      i've been great. what about you?   
...                                                 ...   
3720    that's a good question. maybe it's not old age.   
3721                              are you right-handed?   
3722                                  yes. all my life.   
3723  you're wearing out your right hand. stop using...   
3724        but i do all my writing with my right hand.   

                                                Chatbot  
0                         i'm fine. how about yourself?  
1                   i'm pretty good. thanks for asking.  
2                     no problem. so how have you been?  
3                      i've been great. what about you?  
4

In [2]:
import string
import re
# importing regular expressions
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())
# Lower case conversion
remove_n = lambda x: re.sub("\n", " ", x)
# removing \n and replacing them with empty value
remove_non_ascii = lambda x: re.sub(r'[^\x00-\x7f]',r' ', x)
# removing non ascii characters
alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)
# removing alpha numeric values
df['User'] = df['User'].map(alphanumeric).map(punc_lower).map(remove_n).map(remove_non_ascii)
# using map function and applying the function on query column
df['Chatbot'] = df['Chatbot'].map(alphanumeric).map(punc_lower).map(remove_n).map(remove_non_ascii)
# using map function and applying the function on response column
print(df)
df.to_csv('modified_dataset.csv', index=False)


                                                   User  \
0                                hi  how are you doing    
1                         i m fine  how about yourself    
2                   i m pretty good  thanks for asking    
3                     no problem  so how have you been    
4                      i ve been great  what about you    
...                                                 ...   
3720    that s a good question  maybe it s not old age    
3721                              are you right handed    
3722                                  yes  all my life    
3723  you re wearing out your right hand  stop using...   
3724        but i do all my writing with my right hand    

                                                Chatbot  
0                         i m fine  how about yourself   
1                   i m pretty good  thanks for asking   
2                     no problem  so how have you been   
3                      i ve been great  what about you   
4

In [3]:
import torch
import spacy
import random
import torch.nn as nn
import torch.optim as optim

# Data Preprocessing
nlp = spacy.load("en_core_web_sm")
with open("modified_dataset.csv", "r") as file:
    lines = file.readlines()
    data = [line.split("\t") for line in lines]


In [4]:
with open("modified_dataset.csv", "r") as file:
    words = file.read().split()
    print(words)
    unique_words = set(words)
    print(len(unique_words))
    print(unique_words)

['User,Chatbot', 'hi', 'how', 'are', 'you', 'doing', ',i', 'm', 'fine', 'how', 'about', 'yourself', 'i', 'm', 'fine', 'how', 'about', 'yourself', ',i', 'm', 'pretty', 'good', 'thanks', 'for', 'asking', 'i', 'm', 'pretty', 'good', 'thanks', 'for', 'asking', ',no', 'problem', 'so', 'how', 'have', 'you', 'been', 'no', 'problem', 'so', 'how', 'have', 'you', 'been', ',i', 've', 'been', 'great', 'what', 'about', 'you', 'i', 've', 'been', 'great', 'what', 'about', 'you', ',i', 've', 'been', 'good', 'i', 'm', 'in', 'school', 'right', 'now', 'i', 've', 'been', 'good', 'i', 'm', 'in', 'school', 'right', 'now', ',what', 'school', 'do', 'you', 'go', 'to', 'what', 'school', 'do', 'you', 'go', 'to', ',i', 'go', 'to', 'pcc', 'i', 'go', 'to', 'pcc', ',do', 'you', 'like', 'it', 'there', 'do', 'you', 'like', 'it', 'there', ',it', 's', 'okay', 'it', 's', 'a', 'really', 'big', 'campus', 'it', 's', 'okay', 'it', 's', 'a', 'really', 'big', 'campus', ',good', 'luck', 'with', 'school', 'good', 'luck', 'with',

In [5]:
print(lines)

['User,Chatbot\n', 'hi  how are you doing ,i m fine  how about yourself \n', 'i m fine  how about yourself ,i m pretty good  thanks for asking \n', 'i m pretty good  thanks for asking ,no problem  so how have you been \n', 'no problem  so how have you been ,i ve been great  what about you \n', 'i ve been great  what about you ,i ve been good  i m in school right now \n', 'i ve been good  i m in school right now ,what school do you go to \n', 'what school do you go to ,i go to pcc \n', 'i go to pcc ,do you like it there \n', 'do you like it there ,it s okay  it s a really big campus \n', 'it s okay  it s a really big campus ,good luck with school \n', 'good luck with school ,thank you very much \n', 'how s it going ,i m doing well  how about you \n', 'i m doing well  how about you ,never better  thanks \n', 'never better  thanks ,so how have you been lately \n', 'so how have you been lately ,i ve actually been pretty good  you \n', 'i ve actually been pretty good  you ,i m actually in s

In [6]:
print(data)

[['User,Chatbot\n'], ['hi  how are you doing ,i m fine  how about yourself \n'], ['i m fine  how about yourself ,i m pretty good  thanks for asking \n'], ['i m pretty good  thanks for asking ,no problem  so how have you been \n'], ['no problem  so how have you been ,i ve been great  what about you \n'], ['i ve been great  what about you ,i ve been good  i m in school right now \n'], ['i ve been good  i m in school right now ,what school do you go to \n'], ['what school do you go to ,i go to pcc \n'], ['i go to pcc ,do you like it there \n'], ['do you like it there ,it s okay  it s a really big campus \n'], ['it s okay  it s a really big campus ,good luck with school \n'], ['good luck with school ,thank you very much \n'], ['how s it going ,i m doing well  how about you \n'], ['i m doing well  how about you ,never better  thanks \n'], ['never better  thanks ,so how have you been lately \n'], ['so how have you been lately ,i ve actually been pretty good  you \n'], ['i ve actually been pr

In [8]:
vocabulary = {"<PAD>": 0, "<UNK>": 1}  # Initialize with special tokens
with open("modified_dataset.csv", "r") as file:
    for line in file:
        words = line.strip().split()  # Split by whitespace for adapting the data format
        for word in words:
            if word not in vocabulary:
                vocabulary[word] = len(vocabulary)


In [9]:
print(vocabulary)

{'<PAD>': 0, '<UNK>': 1, 'User,Chatbot': 2, 'hi': 3, 'how': 4, 'are': 5, 'you': 6, 'doing': 7, ',i': 8, 'm': 9, 'fine': 10, 'about': 11, 'yourself': 12, 'i': 13, 'pretty': 14, 'good': 15, 'thanks': 16, 'for': 17, 'asking': 18, ',no': 19, 'problem': 20, 'so': 21, 'have': 22, 'been': 23, 'no': 24, 've': 25, 'great': 26, 'what': 27, 'in': 28, 'school': 29, 'right': 30, 'now': 31, ',what': 32, 'do': 33, 'go': 34, 'to': 35, 'pcc': 36, ',do': 37, 'like': 38, 'it': 39, 'there': 40, ',it': 41, 's': 42, 'okay': 43, 'a': 44, 'really': 45, 'big': 46, 'campus': 47, ',good': 48, 'luck': 49, 'with': 50, ',thank': 51, 'very': 52, 'much': 53, 'going': 54, 'well': 55, ',never': 56, 'better': 57, 'never': 58, ',so': 59, 'lately': 60, 'actually': 61, ',which': 62, 'attend': 63, 'which': 64, 'attending': 65, ',are': 66, 'enjoying': 67, 'not': 68, 'bad': 69, 'lot': 70, 'of': 71, 'people': 72, 'that': 73, ',thanks': 74, 'today': 75, 'absolutely': 76, 'lovely': 77, 'thank': 78, ',everything': 79, 'everything

In [10]:
def tokenize_and_convert_to_tensor(text, vocabulary, output_size):
    # Tokenize the text using spaCy
    tokens = [token.text for token in nlp(text)]

    # Convert tokens to indices using the provided vocabulary
    indices = [vocabulary.get(token, vocabulary["<UNK>"]) for token in tokens]
    indices = [min(idx, output_size - 1) for idx in indices]

    # Convert the list of indices to a PyTorch tensor
    tensor = torch.LongTensor(indices)

    return tensor


In [11]:
class SimpleChatbot(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleChatbot, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input):
        embedded = self.embedding(input)
        output, hidden = self.gru(embedded)
        output = self.out(output)
        return output, hidden

In [12]:
pip install transformers

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m
Col

In [13]:
pip install flask

