In [6]:
import json
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import os

In [8]:
def preprocess_text(text):
    """Turn raw text into a list of lowercase, lemmatized content words.

    The text is split with NLTK's ``word_tokenize``. A token is kept only if
    it is purely alphabetic and its lowercase form is not an English
    stopword; each kept token is lowercased and passed through the WordNet
    lemmatizer.

    Args:
        text: The string to process.

    Returns:
        A list of the surviving tokens, in the order they appeared.
    """
    english_stopwords = set(stopwords.words('english'))
    wordnet = WordNetLemmatizer()

    lemmas = []
    for raw_token in word_tokenize(text):
        # Drop tokens containing any non-letter character (punctuation, digits, ...).
        if not raw_token.isalpha():
            continue
        lowered = raw_token.lower()
        if lowered in english_stopwords:
            continue
        lemmas.append(wordnet.lemmatize(lowered))

    return lemmas

In [34]:
def jaccard_similarity(list1, list2):
    """Compute the Jaccard similarity of two collections of hashable items.

    Both inputs are converted to sets, so duplicates and ordering are
    ignored. The result is ``|A & B| / |A | B|``.

    Args:
        list1: First iterable of hashable items.
        list2: Second iterable of hashable items.

    Returns:
        float: A value in ``[0.0, 1.0]``. When both inputs are empty the
        union is empty and the ratio is undefined; ``0.0`` is returned in
        that case so that callers ranking by similarity treat "nothing to
        compare" as "no match" instead of crashing.
    """
    set1 = set(list1)
    set2 = set(list2)
    union = set1 | set2
    if not union:
        # Guard against ZeroDivisionError when both inputs are empty.
        return 0.0
    return len(set1 & set2) / len(union)

In [32]:
def suggest_configuration(new_project_name, username, hist_connections, top_n=5):
    """Suggest a historical connection configuration for a new project.

    Every historical connection is scored by the Jaccard similarity between
    the preprocessed tokens of ``new_project_name`` and the connection's
    ``"representatives"`` list. Among the ``top_n`` best-scoring connections,
    the first one whose ``"user_name"`` equals ``username`` is returned; if
    none matches, the single best-scoring connection is returned.

    Args:
        new_project_name: Name/description of the new project.
        username: User to prefer when one of their connections is shortlisted.
        hist_connections: Iterable of dicts; each must provide the keys
            ``"representatives"`` and ``"user_name"``.
        top_n: Size of the shortlist searched for the user's own
            configuration. Defaults to 5.

    Returns:
        The selected connection dict, or ``None`` when there is nothing to
        choose from (empty history or a non-positive ``top_n``).
    """
    new_project_tokens = preprocess_text(new_project_name)

    # Keep each connection next to its score rather than looking it up by
    # process_instance_id afterwards: records sharing an id would otherwise
    # overwrite each other and the wrong record could be returned.
    scored = [
        (jaccard_similarity(new_project_tokens, connection["representatives"]), connection)
        for connection in hist_connections
    ]

    # The sort is stable, so connections with equal scores keep their
    # original relative order.
    scored.sort(key=lambda pair: pair[0], reverse=True)
    shortlist = [connection for _, connection in scored[:top_n]]

    if not shortlist:
        return None

    # Prefer the user's own earlier configuration if it made the shortlist;
    # otherwise fall back to the most similar configuration overall.
    for candidate in shortlist:
        if candidate["user_name"] == username:
            return candidate

    return shortlist[0]
    

In [12]:
# Load the historical connections. The context manager closes the file as
# soon as the JSON has been parsed, instead of leaving the handle open for
# the lifetime of the kernel.
with open('./simulated_projects/embedded_connections.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [17]:
# Example inputs for the suggestion below: the new project's name and the
# user requesting a configuration.
new_project_name = "Implement an E-commerce website using ReactJS"
username = "Clorinde"

In [35]:
# Pick a configuration for the new project from the loaded history in `data`.
suggested_config = suggest_configuration(new_project_name, username, data)

In [36]:
# Show the selected configuration as the cell output.
suggested_config

{'timestamp': '2024-07-28T15:49:30.201314',
 'process_instance_id': 2,
 'project_name': 'Implement E-commerce Web Application',
 'project_domain': 'Software Engineering',
 'app_name': 'GitHub',
 'app_location': 'https://github.com/[Username]/[Project]',
 'pms_name': 'BAPE',
 'pms_location': 'http://localhost:8081/api/process-instance',
 'user_name': 'Lisa',
 'tasks': ['Deploy Web Application'],
 'representatives': ['deploys',
  'implemented',
  'online',
  'implement',
  'applied',
  'applications',
  'enforce',
  'deployed',
  'apply',
  'web',
  'website',
  'measures',
  'implementation',
  'deploy',
  'application',
  'user',
  'deployment',
  'troops',
  'internet',
  'applying',
  'users',
  'implementing',
  'websites',
  'deploying']}