In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import numpy as np

# Load the data from the CSV file
data = pd.read_csv('cmd_commands.csv')  # Replace 'cmd_commandssss.csv' with the actual file name

# Replace with the actual column names from your dataset
train_text_columns = ['description1', 'description1', 'description3', 'description4', 'description5', 'description6']
test_text_column = 'description2'
label_column = 'name'  # Replace with the actual label column name

# Combine all training text columns into a single text column for training
data['train_text'] = data[train_text_columns].astype(str).apply(lambda x: ' '.join(x), axis=1)

# Combine the test text column into a single text column for testing
data['test_text'] = data[test_text_column].astype(str)

# Remove ignored words for training and testing
stop_words = ['a', 'an', 'and', 'for', 'in', 'is', 'it', 'of', 'on', 'or', 'the', 'to', 'with',
              'seamless', 'comprehensive', 'facilitating']

data['train_text'] = data['train_text'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))
data['test_text'] = data['test_text'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))

# Drop rows with missing values in the training text column
data = data.dropna(subset=['train_text'])

# Split the data into training and testing sets
train_data, test_data, train_labels, test_labels = train_test_split(data['train_text'], data[label_column], test_size=0.2, random_state=42)

# Vectorize the text data using TfidfVectorizer
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_data)
X_test = vectorizer.transform(test_data)

# Train a Multinomial Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(X_train, train_labels)

# Vectorize the test data
X_test_final = vectorizer.transform(data['test_text'])

# Make predictions on the test set
predictions = classifier.predict(X_test_final)

# Calculate accuracy on the entire dataset
accuracy = accuracy_score(data[label_column], predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Further train the model on the test set
classifier.fit(X_test, test_labels)

# Take user input and classify
user_input = input("Enter a description: ")
user_input = ' '.join([word for word in user_input.split() if word.lower() not in stop_words])
user_input_vectorized = vectorizer.transform([user_input])

# Predict the first 5 labels with probabilities for the given description
top_predictions = classifier.predict_proba(user_input_vectorized)
top_indices = np.argsort(top_predictions[0])[::-1][:5]

print("\nTop 5 Predictions:")
for i in top_indices:
    predicted_label = classifier.classes_[i]
    probability = top_predictions[0][i]

    # Find all occurrences of the predicted label in the dataset
    label_indices = data[data[label_column] == predicted_label].index

    # Print descriptions for all occurrences
    for description_index in label_indices:
        print(f"Command Name: {predicted_label}, Probability: {probability:.5f}, Description: {data.iloc[description_index]['description1']}")

Accuracy: 70.08%
Enter a description: chane the current path

Top 5 Predictions:
Command Name: BREAK, Probability: 0.02149, Description: Do nothing, successfully •
Command Name: PERMS, Probability: 0.01885, Description: Show permissions for a user
Command Name: ScriptRunner, Probability: 0.01884, Description: Run one or more scripts in sequence
Command Name: CHANGEPK, Probability: 0.01884, Description: Upgrade device Edition/Product Key
Command Name: QAppSrv, Probability: 0.01883, Description: Query TermServer List all servers (TS/Remote Desktop)
