In [1]:
import numpy
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Data collection and preparation: a small hand-built dataset stored as two parallel lists.
# data['feature'][i] is a keyword and data['source'][i] is the tool that keyword belongs to
# (7 jira, 6 github and 4 confluence keywords, in that order).
# A real dataset should cover a wide range of keywords per source and be cleaned of any
# irrelevant or inconsistent entries.

data = {
    'source': ['jira'] * 7 + ['github'] * 6 + ['confluence'] * 4,
    'feature': [
        # jira
        'story', 'epic', 'ticket', 'bug', 'request', 'customer', 'roadmap',
        # github
        'repository', 'codebase', 'readme', 'PR', 'change', 'review',
        # confluence
        'wiki', 'document', 'diagram', 'figure',
    ],
}

In [3]:
# Label encoding of the target: give every distinct "source" string an integer code so it
# can be used as the training target. The fitted encoder is kept so predictions can be
# mapped back to strings later. Codes follow sorted order: confluence=0, github=1, jira=2.

source_encoder = preprocessing.LabelEncoder()
sources = data['source']
source_enc = source_encoder.fit_transform(sources)
source_enc

array([2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0])

In [4]:
# Encode the 1-D list of "feature" keywords the same way as the target: one integer code
# per distinct keyword, with the fitted encoder kept for decoding later.
# NOTE(review): the codes are just the keywords' positions in sorted order ('PR' -> 0,
# 'bug' -> 1, ...), so they carry no real ordering between keywords. scikit-learn documents
# LabelEncoder as a target (y) encoder; consider a one-hot or text vectoriser for inputs.

feature_encoder = preprocessing.LabelEncoder()
features = data['feature']
feature_enc = feature_encoder.fit_transform(features)
feature_enc

array([14,  7, 15,  1, 11,  4, 13, 10,  3,  9,  0,  2, 12, 16,  6,  5,  8])

In [5]:
# Model training, step 1: split the encoded "features" (inputs) and encoded "sources"
# (targets) into a training set and a held-out test set used to evaluate the model.
# No test_size is given, so the default 25% is held out (5 of the 17 samples here);
# random_state=0 makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    feature_enc,
    source_enc,
    random_state=0,
)

In [6]:
# Inspect the split: training inputs and labels first, then the held-out inputs and labels.
(X_train, y_train, X_test, y_test)

(array([11, 15,  6,  0, 10,  8,  2,  1, 14,  4,  5, 12]),
 array([2, 2, 0, 1, 1, 0, 1, 2, 2, 2, 0, 1]),
 array([ 7, 13,  3,  9, 16]),
 array([2, 2, 1, 1, 0]))

In [7]:
# DecisionTreeClassifier expects a 2-D (n_samples, n_features) input, so turn the 1-D
# array of feature codes into a single-column matrix before fitting.
X_train = numpy.reshape(X_train, (-1, 1))

# random_state=0 keeps the fitted tree reproducible between runs.
dt = DecisionTreeClassifier(random_state=0)
dt.fit(X_train, y_train)

In [8]:
# Prediction: run the trained model on the held-out inputs. New inputs must be
# preprocessed exactly as the training data was (same encoder, same 2-D shape);
# the model then returns one encoded "source" label per input row.

# As for training, the classifier needs a single-column 2-D array.
# Note that X_test is rebound to the 2-D version from here on.
X_test = numpy.reshape(X_test, (-1, 1))
y_pred = dt.predict(X_test)

In [9]:
# Post-processing: the model works on integer codes, so reverse the label encoding to
# show the actual keyword / "source" strings.
# X_test was reshaped to a 2-D column for the classifier, but LabelEncoder.inverse_transform
# expects a 1-D array; passing the column vector triggers a DataConversionWarning
# (column_or_1d), so flatten it first.
# Every keyword occurs only once in the dataset, so none of the test keywords were seen
# during training; the predictions depend only on which integer codes happen to be nearby.
print(f'Test input: {feature_encoder.inverse_transform(X_test.ravel())}')
print(f'Test output: {source_encoder.inverse_transform(y_pred)}')

print(f'Expected output: {source_encoder.inverse_transform(y_test)}')

Test input: ['epic' 'roadmap' 'codebase' 'readme' 'wiki']
Test output: ['confluence' 'github' 'github' 'confluence' 'jira']
Expected output: ['jira' 'jira' 'github' 'github' 'confluence']


  y = column_or_1d(y, warn=True)


In [10]:
# Iterative Refinement: Continuously monitor and analyze the model's performance. Collect additional data if needed 
# and retrain the model to improve its accuracy and generalization capabilities.

In [11]:
# Feature Extraction: Convert a natural language prompt into numerical features that can be used as input for
# the machine learning model, i.e. transform the text into a format the model can understand.
# Techniques like tokenization, stemming, or lemmatization can be used to preprocess the text and extract the
# relevant information.
# NOTE(review): feature_encoder was fitted on single keywords only, and LabelEncoder.transform raises a
# ValueError for labels it has not seen, so the prompt has to be reduced to known keywords (or the encoding
# changed) before it can be passed to the model.

test_prompt = "Where can I find the code for image-manager?"

# Step 1. Lemmatize the prompt
# TODO: not implemented yet. Tokenize and lemmatize test_prompt, then map the resulting tokens to features.