In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from google.colab import drive
drive.mount('/content/drive')

# Load the dataset
path="/content/drive/MyDrive/hex_data - dataclean_refined_data1(1).csv"
data = pd.read_csv(path)

# Preprocess the data.
# TfidfVectorizer rejects missing (NaN) documents and a missing label is not a
# usable training target, so rows lacking either field are excluded up front.
labeled = data.dropna(subset=['Description', 'Category'])
X = labeled['Description']
y = labeled['Category']

# Check class distribution
print(y.value_counts())

# Split the data into training and testing sets.
# The class distribution is heavily imbalanced, so the split is stratified to
# keep each category's share the same in both partitions. Note that
# stratification requires at least two rows per category.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Vectorize the text data using TF-IDF (unigrams + bigrams). The vocabulary is
# learned from the training split only and then reused for the test split.
vectorizer = TfidfVectorizer(max_features=5000, min_df=5, ngram_range=(1,2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train a Logistic Regression model with class weighting.
# The default iteration budget (max_iter=100) stopped before convergence in
# the recorded run (ConvergenceWarning), so a larger budget is given here;
# raise it further if the warning still appears.
model = LogisticRegression(class_weight='balanced', max_iter=1000)
model.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Model Accuracy: {accuracy * 100:.2f}%')

# Print classification report and confusion matrix
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Function to predict category from user input
def predict_category(description):
    """Predict the category for a single description.

    The description is wrapped in a one-element list, transformed with the
    module-level ``vectorizer``, and passed to the module-level ``model``.

    Args:
        description: The text to classify.

    Returns:
        The first (and only) element of the model's prediction for the input.
    """
    features = vectorizer.transform([description])
    return model.predict(features)[0]

# Example usage: read one description interactively and predict its category.
# Surrounding whitespace is stripped; a blank entry contains no text to
# classify, so it is reported instead of being sent to the model.
description_input = input("Enter a description: ").strip()
if description_input:
    predicted_category = predict_category(description_input)
    print(f'Predicted Category: {predicted_category}')
else:
    # Keep the name defined so later code can check for "no prediction".
    predicted_category = None
    print('No description entered; skipping prediction.')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Category
IT                   24950
Marketing & Sales     7380
Operations            3283
Other                 2767
Finance               2275
HR                    1698
Training              1203
Planning               197
Miscellaneous          134
Logistics               27
Name: count, dtype: int64


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model Accuracy: 74.94%
Classification Report:
                    precision    recall  f1-score   support

          Finance       0.60      0.81      0.69       460
               HR       0.73      0.73      0.73       365
               IT       0.98      0.76      0.86      5016
        Logistics       0.10      0.33      0.15         6
Marketing & Sales       0.95      0.84      0.89      1473
    Miscellaneous       0.04      0.17      0.07        30
       Operations       0.46      0.67      0.54       650
            Other       0.36      0.54      0.43       516
         Planning       0.05      0.33      0.08        36
         Training       0.34      0.68      0.45       231

         accuracy                           0.75      8783
        macro avg       0.46      0.59      0.49      8783
     weighted avg       0.84      0.75      0.78      8783

Confusion Matrix:
 [[ 372    9    8    4    2    9   17   25    6    8]
 [  14  265    1    0   11    4   16   33   12    9]