In [1]:
import json
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import os

In [2]:
# Download the NLTK resources used further down: 'punkt' (tokenizer models
# for word_tokenize), 'stopwords' (the English stop-word list) and 'wordnet'
# (the lexical database behind WordNetLemmatizer).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/happygaming/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/happygaming/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/happygaming/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Load the simulated connection records. The path is relative to the current
# working directory at the time this cell runs. The context manager closes the
# file handle as soon as the JSON has been parsed instead of leaving it open
# for the lifetime of the kernel; JSON is read explicitly as UTF-8 so the
# result does not depend on the platform's default encoding.
with open('./simulated_projects/simulated_connections.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [4]:
# Move the working directory up one level and print it, so it is visible
# where later relative paths will resolve from.
# NOTE(review): the move is relative to the current directory, so re-running
# this cell climbs one more level each time -- run it once per kernel.
# NOTE(review): presumably needed so that `import related_words` and the
# output path resolve -- confirm against the project layout.
os.chdir("../")
print(os.getcwd())

/home/happygaming/workspace/pMage/src/main/python


In [5]:
import related_words

In [6]:
def collect_representatives(data):
    """Attach a ``"representatives"`` list to every connection, in place.

    Each element of ``data`` is a dict providing a ``"project_name"`` value
    and a ``"tasks"`` list. Its new ``"representatives"`` entry is a fresh
    list holding the project name followed by the tasks. Nothing is returned.
    """
    for record in data:
        name_as_list = [record["project_name"]]
        record["representatives"] = name_as_list + record["tasks"]

In [7]:
def preprocess_text(text, tfidf=False):
    """Tokenize ``text`` and reduce it to lower-cased, lemmatized content words.

    Tokens that are not purely alphabetic, or whose lower-cased form is in
    NLTK's English stop-word list, are dropped. Every remaining token is
    lower-cased and passed through ``WordNetLemmatizer.lemmatize``.

    Args:
        text: The raw string to process.
        tfidf: When true, return the surviving tokens joined by single
            spaces; otherwise return them as a list.

    Returns:
        A space-joined string if ``tfidf`` is true, else a list of tokens.
    """
    english_stops = set(stopwords.words('english'))
    wordnet = WordNetLemmatizer()

    cleaned = []
    for raw_token in word_tokenize(text):
        # Skip anything that is not purely alphabetic (punctuation, numbers).
        if not raw_token.isalpha():
            continue
        lowered = raw_token.lower()
        if lowered in english_stops:
            continue
        cleaned.append(wordnet.lemmatize(lowered))

    return ' '.join(cleaned) if tfidf else cleaned

In [8]:
def extract_common_terms_tfidf(connection_rep, max_features = 10):
    """Rank the vocabulary of a group of texts by summed TF-IDF weight.

    Every text in ``connection_rep`` is cleaned with
    ``preprocess_text(text, True)`` and the cleaned texts are fitted with a
    ``TfidfVectorizer`` limited to ``max_features`` terms. Each term's
    weights are then summed over all texts.

    Args:
        connection_rep: Iterable of raw text strings.
        max_features: Vocabulary size limit handed to ``TfidfVectorizer``.

    Returns:
        A list of ``(term, summed_weight)`` tuples, highest weight first.
    """
    cleaned_docs = [preprocess_text(doc, True) for doc in connection_rep]

    tfidf_model = TfidfVectorizer(stop_words='english', max_features=max_features)
    weights = tfidf_model.fit_transform(cleaned_docs).toarray()
    vocabulary = tfidf_model.get_feature_names_out()

    # Column sums give one aggregate score per vocabulary term.
    totals = weights.sum(axis=0)

    # Pair each term with its score, then order by score, largest first.
    ranked = list(zip(vocabulary, totals))
    ranked.sort(key=lambda pair: pair[1], reverse=True)

    return ranked

In [9]:
def collect_related_words(tokens):
    """Expand ``tokens`` with the related words reported for each of them.

    ``related_words.extract_related_words(tokens)`` is expected to return a
    mapping of token -> iterable of related words (assumption -- confirm
    against the ``related_words`` module). The result is the de-duplicated
    union of every mapping key and every related word.

    Args:
        tokens: Terms to expand.

    Returns:
        A sorted list of the unique words; an empty list when the mapping
        is empty.
    """
    related_tokens = related_words.extract_related_words(tokens)

    # Start from the keys, then fold in each related-word collection. Plain
    # set operations are used instead of np.concatenate, which raises
    # ValueError when the mapping is empty.
    expanded_tokens = set(related_tokens)
    for related in related_tokens.values():
        expanded_tokens.update(related)

    # Sorting makes the result independent of set iteration order, so
    # repeated runs produce the same list (assumes the words are mutually
    # comparable, i.e. all strings).
    return sorted(expanded_tokens)

In [10]:
# Restrict processing to the first 500 connections. The slice is a new list,
# but its elements are the same dict objects that `data` holds.
new_data = data[:500]

In [11]:
# Spot-check a single record (index 400) to see the fields of a connection.
new_data[400]

{'timestamp': '2024-08-03T09:29:58.635053',
 'process_instance_id': 275,
 'project_name': 'Implement E-commerce Web Application',
 'project_domain': 'Software Engineering',
 'app_name': 'GitHub',
 'app_location': 'https://github.com/[Username]/[Project]',
 'pms_name': 'jBPM',
 'pms_location': 'http://localhost:8080/kie-server/services/rest',
 'user_name': 'Alhaitham',
 'tasks': ['Create Controllers Module',
  'Connect Database',
  'Merge Branches for Final Product',
  'Deploy Web Application']}

In [12]:
# Replace each connection's representative texts (project name + tasks) with
# an expanded keyword list: its ranked TF-IDF terms plus their related words.
collect_representatives(new_data)
for connection in new_data:
    common_terms_tfidf = extract_common_terms_tfidf(connection["representatives"])
    # Keep only the term from each (term, score) pair.
    common_terms = [term for term, _score in common_terms_tfidf]
    connection["representatives"] = collect_related_words(common_terms)

In [13]:
# Write the connections, now carrying their representative words, to JSON
# and report how many records were written.
output_path = './pmage_data_warehouse/simulated_projects/embedded_connections2.json'
with open(output_path, 'w') as outfile:
    json.dump(new_data, outfile, indent=4)

print(f"Simulated data created with {len(new_data)} interactions.")

Simulated data created with 500 interactions.
