In [67]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

In [68]:
df = pd.read_csv("stories.csv")
df.head(2)

Unnamed: 0,storyType,storyPriority,storyTitle,storyPoints,spentHours,userPreviousSuccessRate,userAverageSpentHours,completed,userSkills,skills,tags
0,2,1,Fix styling issues on dashboard,12,20,1.0,14.601895,False,[],"['Frontent', 'JavaScript']","['UI', 'frontend']"
1,3,1,Resolve logout session bug,3,6,1.0,14.335423,True,['.NET'],"['Backend', 'Python']","['sessions', 'critical', 'bug']"


In [69]:
from sklearn.decomposition import PCA
from gensim.models import Word2Vec
import numpy as np

In [None]:
tags_list = df["tags"].tolist()
word2vec_model = Word2Vec(sentences=tags_list, vector_size=50, window=2, min_count=1)

In [71]:
def aggregate_embeddings(tags, model):
    embeddings = [model.wv[tag] for tag in tags if tag in model.wv]
    return np.mean(embeddings, axis=0) if embeddings else np.zeros(model.vector_size)

In [72]:
df["tag_embeddings"] = df["tags"].apply(lambda tags: aggregate_embeddings(tags, word2vec_model))
tag_embeddings = pd.DataFrame(df["tag_embeddings"].tolist(), 
                            columns=[f"tag_dim_{i}" for i in range(len(df["tag_embeddings"][0]))])

In [73]:
import hashlib

In [74]:
def skill_col_name(col_name, is_user=False):
    prefix = "user_" if is_user else ""
    return f"{prefix}skill_{hashlib.md5(col_name.encode()).hexdigest()[:8]}"

In [75]:
mlb = MultiLabelBinarizer()
skills_encoded = pd.DataFrame(mlb.fit_transform(df['skills']), 
                            columns=[skill_col_name(cls) for cls in mlb.classes_], 
                            index=df.index)

user_skills_encoded = pd.DataFrame(mlb.fit_transform(df['userSkills']), 
                            columns=[skill_col_name(cls, True) for cls in mlb.classes_], 
                            index=df.index)

In [76]:
from transformers import BertTokenizer, BertModel
from torch.utils.data import DataLoader
import torch

In [77]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

def get_title_embeddings(batch_size=16):
    dataloader = DataLoader(df["storyTitle"].tolist(), batch_size=batch_size)
    embeddings = []
    
    for batch in dataloader:
        inputs = tokenizer(list(batch), return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
            batch_embeddings = outputs.last_hidden_state.mean(dim=1).numpy()
            embeddings.extend(batch_embeddings)
            
    pca = PCA(n_components=50)
    reduced_embeddings = pca.fit_transform(embeddings)
    reduced_df = pd.DataFrame(reduced_embeddings, columns=[f"title_emb{i}" for i in range(50)])
    
    return reduced_df

In [78]:
title_embeddings = get_title_embeddings()

In [81]:
df = pd.concat([df, title_embeddings, skills_encoded, user_skills_encoded, tag_embeddings], axis=1)
df = df.drop(['storyTitle', 'tag_embeddings', 'skills', 'tags', 'userSkills'], axis=1)

In [83]:
from sklearn.model_selection import train_test_split
import lightgbm as lgb

In [84]:
target_var = 'completed'

X = df.drop(columns=[target_var])
y = df[target_var]

In [85]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [86]:
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test)

In [87]:
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.8
}

In [88]:
model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, test_data],
    num_boost_round=100,
    callbacks=[lgb.early_stopping(20)]
)

[LightGBM] [Info] Number of positive: 1086, number of negative: 2959
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002696 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6788
[LightGBM] [Info] Number of data points in the train set: 4045, number of used features: 174
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.268480 -> initscore=-1.002350
[LightGBM] [Info] Start training from score -1.002350
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[2]	training's auc: 0.67975	valid_1's auc: 0.526101


In [89]:
model.save_model('recommender_model.txt')

<lightgbm.basic.Booster at 0x1fca32b00d0>