In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import numpy as np
import joblib
import re
import nltk
from nltk.corpus import stopwords

# Download the stopwords resource
nltk.download('stopwords')

# Load the data from the CSV file
data = pd.read_csv('cmd_commands.csv')

train_text_columns = ['description1', 'description1', 'description3', 'description4', 'description5', 'description6']
test_text_column = 'description2'
label_column = 'name'

# Combine all training text columns into a single text column for training
data['train_text'] = data[train_text_columns].astype(str).apply(lambda x: ' '.join(x), axis=1)

# Combine the test text column into a single text column for testing
data['test_text'] = data[test_text_column].astype(str)

# Remove special characters and convert to lowercase
data['train_text'] = data['train_text'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x.lower()))

# Remove stopwords
stop_words = set(stopwords.words('english'))
additional_stop_words = ['a', 'an', 'and', 'for', 'in', 'is', 'it', 'of', 'on', 'or', 'the', 'to', 'with',
                          'seamless', 'comprehensive', 'facilitating', 'enhance', 'empower', 'efficient', 'streamlined']
stop_words.update(additional_stop_words)

data['train_text'] = data['train_text'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))
data['test_text'] = data['test_text'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))

# Drop rows with missing values in the training text column
data = data.dropna(subset=['train_text'])

# Split the data into training and testing sets
train_data, test_data, train_labels, test_labels = train_test_split(data['train_text'], data[label_column], test_size=0.2, random_state=42)

# Vectorize the text data using TfidfVectorizer
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_data)
X_test = vectorizer.transform(test_data)

# Train a Multinomial Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(X_train, train_labels)

# Vectorize the test data
X_test_final = vectorizer.transform(data['test_text'])

# Make predictions on the test set
predictions = classifier.predict(X_test_final)

# Calculate accuracy on the entire dataset
accuracy = accuracy_score(data[label_column], predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Save the trained model for future use
joblib.dump((vectorizer, classifier), 'text_classifier_model.joblib')

# Take user input and classify
user_input = input("Enter a description: ")
user_input = re.sub(r'[^a-zA-Z\s]', '', user_input.lower())
user_input = ' '.join(word for word in user_input.split() if word not in stop_words)
user_input_vectorized = vectorizer.transform([user_input])

# Predict the first 5 labels with probabilities for the given description
top_predictions = classifier.predict_proba(user_input_vectorized)
top_indices = np.argsort(top_predictions[0])[::-1][:5]

print("\nTop 5 Predictions:")
for i in top_indices:
    predicted_label = classifier.classes_[i]
    probability = top_predictions[0][i]

    # Find all occurrences of the predicted label in the dataset
    label_indices = data[data[label_column] == predicted_label].index

    # Print descriptions for all occurrences
    for description_index in label_indices:
        print(f"Command Name: {predicted_label}, Probability: {probability:.5f}, Description: {data.iloc[description_index]['description1']}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy: 69.70%
Enter a description: change the current path

Top 5 Predictions:
Command Name: PATH, Probability: 0.00615, Description: Display or set a search path for executable files •
Command Name: PUSHD, Probability: 0.00604, Description: Save and then change the current directory •
Command Name: CHDIR, Probability: 0.00600, Description: Change Directory - move to a specific Folder •
Command Name: CD, Probability: 0.00593, Description: Change Directory  move to a specific Folder
Command Name: TLIST, Probability: 0.00582, Description: Task list with full path
