In [1]:
import pandas as pd

In [2]:
# Read the intent dataset CSV into a DataFrame and display it
# (the rendered output below shows the columns Message and Intent).
df = pd.read_csv(filepath_or_buffer='/content/search_no_duplicates.csv')
df

Unnamed: 0,Message,Intent
0,Retrieve information on DIY home renovation id...,Search
1,Explore role of artificial intelligence in mod...,Information
2,Give me an overview of famous painters from th...,Search
3,Find information about revolutionary advanceme...,Search
4,Provide some facts about technological innovat...,Search
...,...,...
483,What can you tell me about process of photosyn...,Information
484,Look up details about overview of World War II.,Information
485,What's the story behind popular travel destina...,Search
486,What can you tell me about solar system?,Search


In [3]:
# Count how many rows carry each Intent label to check the class balance
df.loc[:, 'Intent'].value_counts()

Search         269
Information    219
Name: Intent, dtype: int64

In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import nltk
# Fetch the tokenizer data used by word_tokenize. Older NLTK releases read the
# 'punkt' package, while newer releases (3.9 onward) look up 'punkt_tab'
# instead; request both so this cell works on a fresh kernel with either.
nltk.download('punkt')
nltk.download('punkt_tab')

# Split data into features (X) and labels (y)
X = df['Message']
y = df['Intent']

# Map 'Search' to 0 and 'Information' to 1
label_mapping = {'Search': 0, 'Information': 1}
y = y.map(label_mapping)
# Series.map turns any label that is missing from label_mapping into NaN;
# fail here with a clear message rather than later inside the classifier's fit().
assert y.notna().all(), "df['Intent'] contains labels missing from label_mapping"

# Initialize the stemmer
stemmer = PorterStemmer()

# Tokenization and stemming function
def preprocess_text(text):
    """Return ``text`` tokenized, stemmed and re-joined into one string.

    The text is split with ``word_tokenize``, every token is passed through
    the module-level ``stemmer``, and the stems are joined with single spaces.
    """
    return ' '.join(stemmer.stem(word) for word in word_tokenize(text))

# Stem every message before vectorizing
X_preprocessed = X.apply(preprocess_text)

# Learn the TF-IDF vocabulary from the stemmed messages and vectorize them
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(X_preprocessed)

# Fit a Multinomial Naive Bayes classifier (alpha is the smoothing parameter
# and can be tuned); fit() returns the fitted estimator itself
multinomial_nb_classifier = MultinomialNB(alpha=1.0).fit(X_tfidf, y)

# Run one unseen message through the same preprocess -> vectorize -> predict path
new_text = ["Share some insights about space exploration missions."]
new_text_preprocessed = pd.Series(new_text).apply(preprocess_text)
new_text_tfidf = tfidf_vectorizer.transform(new_text_preprocessed)
predicted_class = multinomial_nb_classifier.predict(new_text_tfidf)

# Invert label_mapping (code -> name) to turn the predicted code into its label
predicted_label = {code: name for name, code in label_mapping.items()}[predicted_class[0]]

print(f"Predicted Intent: {predicted_label}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Predicted Intent: Search


In [6]:
import pickle

# Persist both fitted artifacts. The classifier was trained on the columns
# produced by `tfidf_vectorizer`, so a reloaded classifier can only score new
# text that has been transformed by that same fitted vectorizer. The classifier
# stays alone in 'search_model.pkl' so anything already loading that file keeps
# working; the vectorizer goes to a sibling file.
# NOTE: at inference time the text must also go through preprocess_text before
# it is vectorized, exactly as in the training cell.
# NOTE: only unpickle these files from a trusted source (pickle can run code).
model_filename = 'search_model.pkl'
vectorizer_filename = 'search_vectorizer.pkl'
for artifact, filename in (
    (multinomial_nb_classifier, model_filename),
    (tfidf_vectorizer, vectorizer_filename),
):
    with open(filename, 'wb') as file:
        pickle.dump(artifact, file)