In [5]:
import pymongo
import random
import pandas as pd
import related_words

In [3]:
def read_lines(path, encoding="utf-8"):
    """Return the non-blank lines of a text file as a list of strings.

    Each line is stripped of surrounding whitespace (including any '\r'
    left over from CRLF files) and blank lines are dropped, so a trailing
    newline at the end of the file does not produce an empty entry that
    could later be picked by ``random.choice``.

    Parameters
    ----------
    path : str
        Location of the text file, one entry per line.
    encoding : str, default "utf-8"
        Text encoding used to decode the file.

    Returns
    -------
    list[str]
        The entries in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    with open(path, 'r', encoding=encoding) as file:
        stripped = (line.strip() for line in file)
        return [line for line in stripped if line]


# One entry per line in each file: project descriptions, app names, user names.
project_name_path = './projectName.txt'
project_name_list = read_lines(project_name_path)

app_path = './app.txt'
app_list = read_lines(app_path)

user_path = './user.txt'
user_list = read_lines(user_path)

print(project_name_list, app_list, user_list)

['Revitalize and modernize a legacy software system', 'Develop a new product line that explores innovative technologies', 'Build a comprehensive data analytics and visualization platform', 'Integrate various software tools into a unified ecosystem', 'Create a secure, scalable cloud infrastructure', 'Develop a next-generation user interface for an existing application', 'Implement advanced cybersecurity measures across all products', 'Launch a comprehensive training and development platform for employees', 'Establish a centralized communication and collaboration hub', 'Create a real-time monitoring and feedback system for product performance'] ['gitlab', 'github', 'tableau', 'openclassroom', 'power bi', 'figma', 'draw.io', 'office', 'cisco'] ['clorinde', 'freminet', 'jean', 'emilie', 'kazuha', 'miko']


In [6]:
# Keyword -> candidate apps. When a keyword is found in a project
# description, the app is drawn from the list attached to that keyword.
# Insertion order matters: keywords are tried in the order written here.
rules = {
    "visual": ["tableau", "power bi"],
    "monitor": ["tableau", "power bi"],
    "software": ["gitlab", "github"],
    "cloud": ["gitlab", "github", "cisco"],
    "implement": ["gitlab", "github"],
    "train": ["openclassroom"],
    "hub": ["openclassroom", "cisco"],
}

In [9]:
# Ask the related_words helper for words associated with each rule keyword.
# The printed result is a dict keyed by the original keyword.
tokens = [*rules]
rules_extend = related_words.extract_related_words(tokens)
print(rules_extend)

{'visual': ['auditory', 'aural', 'imagery', 'cinematic', 'conceptual'], 'monitor': ['monitors', 'monitoring', 'monitored', 'supervise', 'assess'], 'software': ['computer', 'microsoft', 'hardware', 'computers', 'internet'], 'cloud': ['clouds', 'ash', 'shadow', 'smoke', 'dust'], 'implement': ['implementing', 'implemented', 'implementation', 'measures', 'enforce'], 'train': ['trains', 'bus', 'rail', 'commuter', 'freight'], 'hub': ['hubs', 'bustling', 'gateway', 'destinations', 'connecting']}


In [10]:
N_RECORDS = 10000  # number of synthetic history rows to generate
SEED = 42          # fixed so the generated dataset is the same on every run


def pick_app(process_name, rng):
    """Choose an app for a project description using the keyword rules.

    The keywords in ``rules`` are tried in insertion order. A keyword fires
    when it, or any of its related words from ``rules_extend``, occurs as a
    substring of the lower-cased description (so "train" also fires on
    "training"). The app is then drawn from that keyword's candidate list.
    If no keyword fires, the app is drawn from the full ``app_list``.

    Parameters
    ----------
    process_name : str
        Project description to match against the rules.
    rng : random.Random
        Source of randomness used for the draw.

    Returns
    -------
    str
        The selected app name.
    """
    name = process_name.lower()  # lower-case once instead of per comparison
    for keyword, candidates in rules.items():
        # .get: a keyword missing from rules_extend simply has no related words
        triggers = [keyword, *rules_extend.get(keyword, [])]
        if any(trigger in name for trigger in triggers):
            return rng.choice(candidates)
    return rng.choice(app_list)


# A dedicated generator keeps the run reproducible without touching the
# global `random` state.
rng = random.Random(SEED)

hist = []
for _ in range(N_RECORDS):
    process_name = rng.choice(project_name_list)
    hist.append({
        "processName": process_name,
        "app": pick_app(process_name, rng),
        "userName": rng.choice(user_list),
    })

In [11]:
# One row per generated record; the dict keys become the columns.
df = pd.DataFrame(hist)
# Persist the synthetic history without the positional index column.
df.to_csv("hist.csv", index=False)

In [20]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import joblib

# Download the NLTK resources this notebook relies on:
#   punkt / punkt_tab -> word_tokenize (newer NLTK releases look for
#                        punkt_tab; older ones use punkt)
#   stopwords         -> stopwords.words('english')
#   wordnet           -> WordNetLemmatizer (a LookupError is raised at
#                        lemmatize time if this corpus is missing)
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\iohkg\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\iohkg\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
# Training pairs taken from the generated history: the project description
# ("processName") is the input text and the recorded app is its label.
reviews, labels = (df[column].tolist() for column in ("processName", "app"))

In [14]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


def preprocess(review):
    """Normalise a piece of text into a space-separated string of lemmas.

    The text is lower-cased and tokenised. Tokens that are not purely
    alphanumeric (punctuation, and any token containing a hyphen or other
    symbol) or that appear in ``stop_words`` are discarded; the remaining
    tokens are lemmatised and joined with single spaces.

    Parameters
    ----------
    review : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text; empty if every token was discarded.
    """
    kept = []
    for word in word_tokenize(review.lower()):
        if not word.isalnum() or word in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(word))
    return ' '.join(kept)


processed_reviews = list(map(preprocess, reviews))

In [15]:
# Turn the cleaned texts into a TF-IDF feature matrix.
# NOTE(review): the vocabulary and IDF weights are learned from every
# document here, i.e. before the data is split into train and test sets.
vectorizer = TfidfVectorizer()
vectorizer.fit(processed_reviews)
X = vectorizer.transform(processed_reviews)

In [17]:
# Hold out 20% of the samples for evaluation; the fixed random_state makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels,
    test_size=0.2,
    random_state=42,
)

# Fit a multinomial Naive Bayes classifier on the training portion.
model = MultinomialNB().fit(X_train, y_train)

# Report the share of held-out samples whose predicted app matches the label.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

Model Accuracy: 44.45%


In [21]:
# Persist the fitted classifier and the fitted vectorizer side by side so
# both can be restored together for inference.
joblib.dump(value=model, filename='model.joblib')
joblib.dump(value=vectorizer, filename='vectorizer.joblib')

['vectorizer.joblib']

In [None]:
# Restore the classifier and vectorizer written by the save step.
# NOTE: joblib files are pickle-based; only load files you produced or trust.
loaded_model = joblib.load(filename='model.joblib')
loaded_vectorizer = joblib.load(filename='vectorizer.joblib')

In [19]:
def suggest_product(review):
    """Predict an app label for a free-text description.

    The text goes through ``preprocess``, is vectorised with
    ``loaded_vectorizer`` and classified by ``loaded_model``.

    Parameters
    ----------
    review : str
        Free-text description to classify.

    Returns
    -------
    The first (and only) element of the model's prediction for this text.
    """
    features = loaded_vectorizer.transform([preprocess(review)])
    return loaded_model.predict(features)[0]


# Try the classifier on a description that is not in the training data.
new_review = "Generating a cybersecurity simulation"
suggestion = suggest_product(new_review)
print(f"Product suggestion: {suggestion}")

Product suggestion: github
