In [18]:
import pandas as pd

df = pd.read_csv('training.csv', quotechar='"', escapechar='\\', on_bad_lines='skip')
df.head()

Unnamed: 0,input,response
0,Hi,Hello!
1,How are you?,I'm good thank you!
2,What is your name?,I am a chatbot.
3,Goodbye,Goodbye!
4,Tell me a joke,"Sure, why don't scientists trust atoms? Becaus..."


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
import nltk

# Download NLTK data
nltk.download('punkt')


# Prepare data
X = df['input']
y = df['response']

# Check the number of unique responses
unique_responses = y.nunique()
if unique_responses < 2:
    raise ValueError(f"The dataset contains only {unique_responses} unique class(es). Please provide more diverse data.")

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline that vectorizes the text and then applies a logistic regression model
model = make_pipeline(TfidfVectorizer(), LogisticRegression())

# Train the model
model.fit(X_train, y_train)



[nltk_data] Downloading package punkt to
[nltk_data]     /Users/markvalentino/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('logisticregression', LogisticRegression())])

In [15]:
# Function to get a response from the chatbot
def get_response(user_input):
    return model.predict([user_input])[0]

# Chat loop
print("Start chatting with the bot (type 'quit' to stop)!")
while True:
    user_input = input("You: ")
    if user_input.lower() == 'quit':
        break
    response = get_response(user_input)
    print(f"Bot: {response}")

Start chatting with the bot (type 'quit' to stop)!
You: why
Bot: Hello!
You: hi
Bot: Hello!
You: time
Bot: I don't have a clock, but it's always a good time to chat with you!


KeyboardInterrupt: Interrupted by user

In [16]:
import json
import numpy as np

# Serialize the TF-IDF vectorizer
vectorizer_dict = {
    'vocabulary': model.named_steps['tfidfvectorizer'].vocabulary_,
    'idf': model.named_steps['tfidfvectorizer'].idf_.tolist()
}

with open('vectorizer.json', 'w') as f:
    json.dump(vectorizer_dict, f)

# Serialize the logistic regression model
model_coeffs = model.named_steps['logisticregression'].coef_.tolist()
model_intercept = model.named_steps['logisticregression'].intercept_.tolist()
classes = model.named_steps['logisticregression'].classes_.tolist()

model_data = {
    'coefficients': model_coeffs,
    'intercept': model_intercept,
    'classes': classes
}

with open('model.json', 'w') as f:
    json.dump(model_data, f)