In [314]:
# Importing libraries

import pickle
import re

import pandas as pd
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
# Load the raw dataset from the CSV file into a DataFrame

path = "sample_data.csv"
df = pd.read_csv(filepath_or_buffer=path)

In [315]:
# Data Preprocessing Steps

# Drop every row in which at least one column is empty or whitespace-only.
# The check runs column-wise on a string view of the frame, so pandas can
# vectorise it instead of invoking a Python lambda once per row.
has_blank_cell = (
    df.astype(str)
    .apply(lambda column: column.str.contains(r'^\s*$'))
    .any(axis=1)
)
df = df[~has_blank_cell]

# Drop the rows that still contain missing values.
df = df.dropna()

query_list = df['text'].tolist()

labels = df['label'].tolist()

# Remove every character that is neither a word character nor whitespace,
# then lowercase. str() guards against non-string cells (e.g. a numeric-only
# 'text' value), which would otherwise make re.sub raise a TypeError.
query_with_no_punctuation = [re.sub(r'[^\w\s]', '', str(text)).lower() for text in query_list]

# Intermediate results are intentionally not printed between cells, to keep the notebook readable.

In [311]:
# Vectorization and model training

# Fixed seed so the train/test split and the fitted tree are identical on
# every "Restart Kernel -> Run All".
RANDOM_STATE = 42

vectorizer = TfidfVectorizer()

# Hold out 20% of the cleaned queries (and their labels) for evaluation.
x_train, x_test, y_train, y_test = train_test_split(
    query_with_no_punctuation,
    labels,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

# Fit the TF-IDF vocabulary on the training split only, then reuse that
# fitted vocabulary to transform the test split.
train_data = vectorizer.fit_transform(x_train)

test_data = vectorizer.transform(x_test)

# Saving TF-IDF training and test features to pickle files
with open('train_data.pkl', 'wb') as f:
    pickle.dump(train_data, f)

with open('test_data.pkl', 'wb') as f:
    pickle.dump(test_data, f)

# Persist the fitted vectorizer as well: the saved model can only score new
# text that has been transformed with this exact vocabulary.
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

# Model
model = DecisionTreeClassifier(random_state=RANDOM_STATE)

# Training model
model.fit(train_data, y_train)

# Saving the trained model for future use
with open('final_model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [316]:
# Prediction and accuracy

# Reload the persisted model and test features so the evaluation runs on
# exactly what was written to disk by the training cell.
with open('final_model.pkl', 'rb') as f:
    decision_tree = pickle.load(f)

with open('test_data.pkl', 'rb') as f:
    test_data = pickle.load(f)

predictions = decision_tree.predict(test_data)

# Evaluating: y_test is the in-memory label split produced by train_test_split,
# so it must come from the same run that wrote test_data.pkl.
# accuracy_score is already imported in the imports cell at the top.
accuracy = accuracy_score(y_test, predictions)

print("Accuracy:", accuracy)

Accuracy: 0.8357501016397886


In [317]:
# After training, use this cell to try the saved model on an arbitrary query.

with open('final_model.pkl', 'rb') as f:
    model = pickle.load(f)

query = "api"

# Apply the same cleaning that was applied to the training text (strip
# punctuation, lowercase) so the query is tokenised the same way.
clean_query = re.sub(r'[^\w\s]', '', query).lower()

# `vectorizer` must be the fitted TfidfVectorizer from the training cell.
query_vector = vectorizer.transform([clean_query])

# Making a prediction (predict returns an array with one label per input row)
pred = model.predict(query_vector)

print(f"The category of the query is: {pred}")

The category of the query is: ['ft']
