In [1]:
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer
import nltk
from nltk.stem import WordNetLemmatizer

# Fetch the WordNet corpora ('wordnet' and 'omw-1.4') used by the
# WordNetLemmatizer imported above. Each call returns a status flag; the
# value of the last bare expression is what the notebook shows as cell output.
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [2]:
# Load the combined dataset from the CSV file.
# NOTE(review): this is an absolute Colab path — adjust when running elsewhere.
raw_data = pd.read_csv("/content/merged_dataset.csv")

# Drop rows missing either field. A NaN description breaks the later
# `.lower()` call and a NaN category cannot be binarized, so such rows would
# otherwise abort the run with an unhelpful error.
data = raw_data.dropna(subset=['Description', 'Category'])

# Select relevant columns for model training
X = data['Description']  # Features: free-text descriptions
y = data['Category']  # Target: comma-separated category labels

# Convert the target variable 'Category' to a multi-hot indicator matrix.
# Labels are stripped after splitting so "Asset, Equity" and "Asset,Equity"
# map to the same classes; empty fragments (e.g. a trailing comma) are ignored.
mlb = MultiLabelBinarizer()
label_lists = y.str.split(',').apply(
    lambda labels: [label.strip() for label in labels if label.strip()]
)
y_binary = mlb.fit_transform(label_lists)

# Split the dataset into training (80%) and testing (20%) sets.
# NOTE(review): rows are not de-duplicated before splitting; if the merged CSV
# contains repeated descriptions they can land in both sets and inflate the
# reported scores — confirm against the data.
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.2, random_state=42)

# Define TF-IDF vectorizer (vocabulary capped at 5000 terms, English stop words removed)
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

# Shared WordNet lemmatizer instance used by lemmatize_text
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Return `text` with each whitespace-separated token lemmatized.

    The string is tokenized with `str.split()`, every token is passed to
    `lemmatizer.lemmatize`, and the results are re-joined with single spaces.
    """
    lemmas = map(lemmatizer.lemmatize, text.split())
    return ' '.join(lemmas)

# Normalize the raw descriptions: lowercase first, then lemmatize token by token
X_train_lemmatized = X_train.str.lower().apply(lemmatize_text)
X_test_lemmatized = X_test.str.lower().apply(lemmatize_text)

# Learn the TF-IDF vocabulary on the training text only, then reuse it for the test text
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_lemmatized)
X_test_tfidf = tfidf_vectorizer.transform(X_test_lemmatized)

# Build and fit the Random Forest in one step (`fit` returns the estimator itself)
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_tfidf, y_train)

# Predict label indicators for the held-out descriptions
y_pred = rf_classifier.predict(X_test_tfidf)

# Per-class precision / recall / F1 on the test split
report = classification_report(y_test, y_pred, target_names=mlb.classes_)
print(report)

              precision    recall  f1-score   support

       Asset       1.00      1.00      1.00       436
      Equity       1.00      1.00      1.00       507
     Expense       1.00      1.00      1.00       514
   Liability       1.00      1.00      1.00       477
Other Income       1.00      1.00      1.00        11
     Revenue       1.00      1.00      1.00       555

   micro avg       1.00      1.00      1.00      2500
   macro avg       1.00      1.00      1.00      2500
weighted avg       1.00      1.00      1.00      2500
 samples avg       1.00      1.00      1.00      2500



In [4]:
# Preprocess input description
def preprocess_input(description):
    """Lowercase `description` and run it through `lemmatize_text`.

    Returns the resulting string, i.e. the same normalization the training
    descriptions received before vectorization.
    """
    return lemmatize_text(description.lower())

# Predict categories for input description
def predict_categories(description):
    """Predict the category labels for a single description string.

    The text is normalized with `preprocess_input`, vectorized with the
    fitted `tfidf_vectorizer`, classified by `rf_classifier`, and the
    indicator row is decoded back to label names through `mlb`.

    Returns whatever `mlb.inverse_transform` yields for the one-row
    prediction: a list holding one tuple of label names.
    """
    # The vectorizer expects an iterable of documents, hence the one-item list
    features = tfidf_vectorizer.transform([preprocess_input(description)])
    label_matrix = rf_classifier.predict(features)
    return mlb.inverse_transform(label_matrix)

# Main function
def main():
    """Interactively classify descriptions typed by the user.

    Prompts in a loop and prints the result of `predict_categories` for each
    description. The loop ends when the user types 'exit' (case-insensitive,
    surrounding whitespace ignored) or when input is closed or interrupted.
    Blank input is skipped instead of being sent to the classifier.
    """
    while True:
        try:
            # Strip so that "exit " or " Exit" still terminates the loop
            description = input("Enter description (or 'exit' to quit): ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted prompt: leave cleanly, no traceback
            print("Exiting program.")
            break

        # Check for exit condition
        if description.lower() == 'exit':
            print("Exiting program.")
            break

        # Nothing to classify; prompt again
        if not description:
            continue

        # Predict categories
        predicted_categories = predict_categories(description)

        # Display predicted categories
        print("Predicted categories:", predicted_categories)

# Run main function. The guard is true when this code executes as the
# top-level module (as it does in a notebook kernel), so the interactive
# loop starts right away; it stays dormant if the code is imported.
if __name__ == "__main__":
    main()


Enter description (or 'exit' to quit): Capital invested by a new partner
Predicted categories: [('Equity',)]
Enter description (or 'exit' to quit): exit
Exiting program.
