# Environment

In [12]:
import sys
from pathlib import Path
import warnings
warnings.filterwarnings("ignore", module="IPython")

def is_google_colab() -> bool:
    if "google.colab" in str(get_ipython()):
        return True
    return False

def install_dependencies() -> None:
    !pip install --upgrade uv
    !uv pip install --all-extras --system --requirement pyproject.toml

if is_google_colab():
    clone_repository()
    install_dependencies()
    root_dir = str(Path().absolute())
    print("Google Colab environment")
else:
    root_dir = Path().absolute()
    # Strip ~/notebooks/ccfraud from PYTHON_PATH if notebook started in one of these subdirectories
    if root_dir.parts[-1:] == ('airquality',):
        root_dir = Path(*root_dir.parts[:-1])
    if root_dir.parts[-1:] == ('notebooks',):
        root_dir = Path(*root_dir.parts[:-1])
    root_dir = str(root_dir) 
    print("Local environment")

print(f"Root dir: {root_dir}")

# Add the root directory to the `PYTHONPATH` 
if root_dir not in sys.path:
    sys.path.append(root_dir)
    print(f"Added the following directory to the PYTHONPATH: {root_dir}")

Local environment
Root dir: /Users/cuiyuting/Desktop/release_popularity_prediction


In [13]:
import datetime
import requests
import pandas as pd
import hopsworks
import datetime
from pathlib import Path
import json
import re
import os
import warnings
warnings.filterwarnings("ignore")

In [14]:
project = hopsworks.login()

2026-01-11 16:46:59,226 INFO: Closing external client and cleaning up certificates.
Connection closed.
2026-01-11 16:46:59,375 INFO: Initializing external client
2026-01-11 16:46:59,377 INFO: Base URL: https://c.app.hopsworks.ai:443






2026-01-11 16:47:01,344 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1342613


# Trending Info
## Read CSV into Dataframe

In [15]:
today = datetime.date.today()

trending_csv_file=f"{root_dir}/data/trending_history_with_topics.csv"

trending_df_original = pd.read_csv(trending_csv_file, skipinitialspace=True)
trending_df_original

Unnamed: 0,date,repo_name,new_stars,rank,topics
0,2026-01-10,anomalyco/opencode,1052,1,[]
1,2026-01-10,kepano/obsidian-skills,490,2,[]
2,2026-01-10,obra/superpowers,378,3,[]
3,2026-01-10,code-yeongyu/oh-my-opencode,316,4,"['ai', 'ai-agents', 'amp', 'anthropic', 'chatg..."
4,2026-01-10,ChrisWiles/claude-code-showcase,304,5,[]
...,...,...,...,...,...
3745,2025-01-01,bytedance/monolith,576,6,[]
3746,2025-01-01,huggingface/smolagents,501,7,[]
3747,2025-01-01,EbookFoundation/free-programming-books,428,8,"['books', 'education', 'hacktoberfest', 'list'..."
3748,2025-01-01,DigitalPlatDev/FreeDomain,426,9,"['digitalplat', 'domain', 'domain-platform', '..."


In [16]:
trending_df_original.dtypes

date         object
repo_name    object
new_stars     int64
rank          int64
topics       object
dtype: object

## Feature Engineering

In [25]:
# discard useless columns
trending_df = trending_df_original[['date', 'topics']]
trending_df

Unnamed: 0,date,topics
0,2026-01-10,[]
1,2026-01-10,[]
2,2026-01-10,[]
3,2026-01-10,"['ai', 'ai-agents', 'amp', 'anthropic', 'chatg..."
4,2026-01-10,[]
...,...,...
3745,2025-01-01,[]
3746,2025-01-01,[]
3747,2025-01-01,"['books', 'education', 'hacktoberfest', 'list'..."
3748,2025-01-01,"['digitalplat', 'domain', 'domain-platform', '..."


In [26]:
# transforming data type
def parse_topics(topic_str):
    if ',' in topic_str:
        return [t.strip() for t in topic_str.split(',')]
    else:
        return [topic_str.strip()]

trending_df['topics'] = trending_df['topics'].apply(parse_topics)
trending_df['date'] = pd.to_datetime(trending_df['date'])
trending_df

Unnamed: 0,date,topics
0,2026-01-10,[[]]
1,2026-01-10,[[]]
2,2026-01-10,[[]]
3,2026-01-10,"[['ai', 'ai-agents', 'amp', 'anthropic', 'chat..."
4,2026-01-10,[[]]
...,...,...
3745,2025-01-01,[[]]
3746,2025-01-01,[[]]
3747,2025-01-01,"[['books', 'education', 'hacktoberfest', 'list..."
3748,2025-01-01,"[['digitalplat', 'domain', 'domain-platform', ..."


In [27]:
# Aggregate topics by date
def aggregate_topics(group):
    all_topics = []
    for topics in group:
        all_topics.extend(topics)
    return list(set(all_topics))

daily_topics_df = trending_df.groupby('date')['topics'].apply(aggregate_topics).reset_index()
daily_topics_df.columns = ['date', 'all_topics']

daily_topics_df

Unnamed: 0,date,all_topics
0,2025-01-01,"['docker', 'education', 'tts', 'hacktoberfest'..."
1,2025-01-02,"['chatgpt', 'minecraft', 'tutorial-code', 'fre..."
2,2025-01-03,"['nlp', 'minecraft', 'tutorial-code', 'free', ..."
3,2025-01-04,"['swarm', 'nlp', 'tutorial-code', 'minecraft',..."
4,2025-01-05,"['obsidian', 'image-generation', 'nlp', 'chatg..."
...,...,...
370,2026-01-06,"['landing-page', 'artificial-intelligence', 'e..."
371,2026-01-07,"['landing-page', 'artificial-intelligence', 'e..."
372,2026-01-08,"['landing-page', 'chatgpt', 'orchestration', '..."
373,2026-01-09,"['landing-page', 'daisydisk', 'chatgpt', 'orch..."


## Create Feature Groups and insert DataFrames

In [28]:
fs = project.get_feature_store()

In [29]:
trending_fg = fs.get_or_create_feature_group(
    name='trending_info',
    description='Github trending repository topics aggrated daily',
    version=1,
    primary_key=['date'],
    event_time="date"
)

In [30]:
trending_fg.insert(daily_topics_df)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1342613/fs/1331268/fg/1938762


Uploading Dataframe: 100.00% |â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| Rows 375/375 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: trending_info_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1342613/jobs/named/trending_info_1_offline_fg_materialization/executions


(Job('trending_info_1_offline_fg_materialization', 'SPARK'), None)

# Release Info
## Read CSV into Dataframe

In [31]:
today = datetime.date.today()

release_csv_file=f"{root_dir}/data/release_raw_data.csv"

release_df_original = pd.read_csv(release_csv_file, skipinitialspace=True)
release_df_original

Unnamed: 0,full_name,repo_stars,repo_forks,repo_watchers,language,repo_created_at,repo_updated_at,topics,release_name,release_body,author_followers,author_public_repos,author_type,published_at,prerelease,draft,first_week_star
0,HANCORE-linux/waybar-themes/v2.1.4,113,1,113,CSS,2025-09-06T22:08:06Z,2026-01-09T15:00:36Z,[],v2.1.4,"# ðŸŽ‰ New Waybar Release ""V2.1a & V2.1b â€“ Oxoca...",52,21,User,2026-01-08T20:07:56Z,False,False,3
1,HANCORE-linux/waybar-themes/v2.1.3,113,1,113,CSS,2025-09-06T22:08:06Z,2026-01-09T15:00:36Z,[],v2.1.3,## ðŸ”§ Changes\r\n### V3-min3\r\n- Adjusted back...,52,21,User,2026-01-05T23:56:48Z,False,False,11
2,HANCORE-linux/waybar-themes/v2.1.2,113,1,113,CSS,2025-09-06T22:08:06Z,2026-01-09T15:00:36Z,[],v2.1.2,# Changes\r\n#### V3-min3\r\n- reworked entire...,52,21,User,2026-01-03T22:20:14Z,False,False,12
3,HANCORE-linux/waybar-themes/v2.1.1,113,1,113,CSS,2025-09-06T22:08:06Z,2026-01-09T15:00:36Z,[],v2.1.1,"# ðŸŽ‰ New Waybar Release ""V6.fa""\r\n- Changed f...",52,21,User,2026-01-02T20:03:09Z,False,False,14
4,HANCORE-linux/waybar-themes/v2.1.0,113,1,113,CSS,2025-09-06T22:08:06Z,2026-01-09T15:00:36Z,[],v2.1.0,"# ðŸŽ‰ New Waybar Release ""V6.f""\r\n- Cycles thr...",52,21,User,2026-01-02T14:25:38Z,False,False,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4044,whyisdifficult/jiratui/v1.2.0,1367,40,1367,Python,2025-08-04T15:10:45Z,2026-01-09T22:45:43Z,"['atlassian', 'atlassian-jira', 'cli', 'develo...",v1.2.0,## What's Changed\r\n\r\n* Add completions Sub...,30,1,User,2025-10-04T11:01:46Z,False,False,61
4045,whyisdifficult/jiratui/v1.1.0,1367,40,1367,Python,2025-08-04T15:10:45Z,2026-01-09T22:45:43Z,"['atlassian', 'atlassian-jira', 'cli', 'develo...",v1.1.0,## What's Changed\r\n\r\n## Added\r\n\r\n* Add...,30,1,User,2025-09-20T13:40:38Z,False,False,32
4046,whyisdifficult/jiratui/v1.0.0,1367,40,1367,Python,2025-08-04T15:10:45Z,2026-01-09T22:45:43Z,"['atlassian', 'atlassian-jira', 'cli', 'develo...",v1.0.0,## What's Changed\r\n\r\n### Breaking Changes\...,30,1,User,2025-09-16T17:21:47Z,False,False,63
4047,whyisdifficult/jiratui/v0.2.0,1367,40,1367,Python,2025-08-04T15:10:45Z,2026-01-09T22:45:43Z,"['atlassian', 'atlassian-jira', 'cli', 'develo...",v0.2.0,## What's Changed\r\n\r\n### Application\r\n* ...,30,1,User,2025-09-13T10:07:39Z,False,False,125


In [32]:
release_df_original.dtypes

full_name              object
repo_stars              int64
repo_forks              int64
repo_watchers           int64
language               object
repo_created_at        object
repo_updated_at        object
topics                 object
release_name           object
release_body           object
author_followers        int64
author_public_repos     int64
author_type            object
published_at           object
prerelease               bool
draft                    bool
first_week_star         int64
dtype: object

In [33]:
date_cols = ['repo_created_at', 'repo_updated_at', 'published_at']
for col in date_cols:
    release_df_original[col] = pd.to_datetime(release_df_original[col])
release_df_original.dtypes

full_name                           object
repo_stars                           int64
repo_forks                           int64
repo_watchers                        int64
language                            object
repo_created_at        datetime64[ns, UTC]
repo_updated_at        datetime64[ns, UTC]
topics                              object
release_name                        object
release_body                        object
author_followers                     int64
author_public_repos                  int64
author_type                         object
published_at           datetime64[ns, UTC]
prerelease                            bool
draft                                 bool
first_week_star                      int64
dtype: object

## Feature Engineering

In [76]:
# almost all False in the dataset
release_df = release_df_original.drop(columns=['prerelease', 'draft'])
release_df

Unnamed: 0,full_name,repo_stars,repo_forks,repo_watchers,language,repo_created_at,repo_updated_at,topics,release_name,release_body,author_followers,author_public_repos,author_type,published_at,first_week_star
0,HANCORE-linux/waybar-themes/v2.1.4,113,1,113,CSS,2025-09-06 22:08:06+00:00,2026-01-09 15:00:36+00:00,[],v2.1.4,"# ðŸŽ‰ New Waybar Release ""V2.1a & V2.1b â€“ Oxoca...",52,21,User,2026-01-08 20:07:56+00:00,3
1,HANCORE-linux/waybar-themes/v2.1.3,113,1,113,CSS,2025-09-06 22:08:06+00:00,2026-01-09 15:00:36+00:00,[],v2.1.3,## ðŸ”§ Changes\r\n### V3-min3\r\n- Adjusted back...,52,21,User,2026-01-05 23:56:48+00:00,11
2,HANCORE-linux/waybar-themes/v2.1.2,113,1,113,CSS,2025-09-06 22:08:06+00:00,2026-01-09 15:00:36+00:00,[],v2.1.2,# Changes\r\n#### V3-min3\r\n- reworked entire...,52,21,User,2026-01-03 22:20:14+00:00,12
3,HANCORE-linux/waybar-themes/v2.1.1,113,1,113,CSS,2025-09-06 22:08:06+00:00,2026-01-09 15:00:36+00:00,[],v2.1.1,"# ðŸŽ‰ New Waybar Release ""V6.fa""\r\n- Changed f...",52,21,User,2026-01-02 20:03:09+00:00,14
4,HANCORE-linux/waybar-themes/v2.1.0,113,1,113,CSS,2025-09-06 22:08:06+00:00,2026-01-09 15:00:36+00:00,[],v2.1.0,"# ðŸŽ‰ New Waybar Release ""V6.f""\r\n- Cycles thr...",52,21,User,2026-01-02 14:25:38+00:00,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4044,whyisdifficult/jiratui/v1.2.0,1367,40,1367,Python,2025-08-04 15:10:45+00:00,2026-01-09 22:45:43+00:00,"['atlassian', 'atlassian-jira', 'cli', 'develo...",v1.2.0,## What's Changed\r\n\r\n* Add completions Sub...,30,1,User,2025-10-04 11:01:46+00:00,61
4045,whyisdifficult/jiratui/v1.1.0,1367,40,1367,Python,2025-08-04 15:10:45+00:00,2026-01-09 22:45:43+00:00,"['atlassian', 'atlassian-jira', 'cli', 'develo...",v1.1.0,## What's Changed\r\n\r\n## Added\r\n\r\n* Add...,30,1,User,2025-09-20 13:40:38+00:00,32
4046,whyisdifficult/jiratui/v1.0.0,1367,40,1367,Python,2025-08-04 15:10:45+00:00,2026-01-09 22:45:43+00:00,"['atlassian', 'atlassian-jira', 'cli', 'develo...",v1.0.0,## What's Changed\r\n\r\n### Breaking Changes\...,30,1,User,2025-09-16 17:21:47+00:00,63
4047,whyisdifficult/jiratui/v0.2.0,1367,40,1367,Python,2025-08-04 15:10:45+00:00,2026-01-09 22:45:43+00:00,"['atlassian', 'atlassian-jira', 'cli', 'develo...",v0.2.0,## What's Changed\r\n\r\n### Application\r\n* ...,30,1,User,2025-09-13 10:07:39+00:00,125


In [77]:
# Trending feature
def parse_topics(topic_str):
    if ',' in topic_str:
        return [t.strip() for t in topic_str.split(',')]
    else:
        return [topic_str.strip()]

release_df['topics'] = release_df['topics'].apply(parse_topics)
release_df

Unnamed: 0,full_name,repo_stars,repo_forks,repo_watchers,language,repo_created_at,repo_updated_at,topics,release_name,release_body,author_followers,author_public_repos,author_type,published_at,first_week_star
0,HANCORE-linux/waybar-themes/v2.1.4,113,1,113,CSS,2025-09-06 22:08:06+00:00,2026-01-09 15:00:36+00:00,[[]],v2.1.4,"# ðŸŽ‰ New Waybar Release ""V2.1a & V2.1b â€“ Oxoca...",52,21,User,2026-01-08 20:07:56+00:00,3
1,HANCORE-linux/waybar-themes/v2.1.3,113,1,113,CSS,2025-09-06 22:08:06+00:00,2026-01-09 15:00:36+00:00,[[]],v2.1.3,## ðŸ”§ Changes\r\n### V3-min3\r\n- Adjusted back...,52,21,User,2026-01-05 23:56:48+00:00,11
2,HANCORE-linux/waybar-themes/v2.1.2,113,1,113,CSS,2025-09-06 22:08:06+00:00,2026-01-09 15:00:36+00:00,[[]],v2.1.2,# Changes\r\n#### V3-min3\r\n- reworked entire...,52,21,User,2026-01-03 22:20:14+00:00,12
3,HANCORE-linux/waybar-themes/v2.1.1,113,1,113,CSS,2025-09-06 22:08:06+00:00,2026-01-09 15:00:36+00:00,[[]],v2.1.1,"# ðŸŽ‰ New Waybar Release ""V6.fa""\r\n- Changed f...",52,21,User,2026-01-02 20:03:09+00:00,14
4,HANCORE-linux/waybar-themes/v2.1.0,113,1,113,CSS,2025-09-06 22:08:06+00:00,2026-01-09 15:00:36+00:00,[[]],v2.1.0,"# ðŸŽ‰ New Waybar Release ""V6.f""\r\n- Cycles thr...",52,21,User,2026-01-02 14:25:38+00:00,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4044,whyisdifficult/jiratui/v1.2.0,1367,40,1367,Python,2025-08-04 15:10:45+00:00,2026-01-09 22:45:43+00:00,"[['atlassian', 'atlassian-jira', 'cli', 'devel...",v1.2.0,## What's Changed\r\n\r\n* Add completions Sub...,30,1,User,2025-10-04 11:01:46+00:00,61
4045,whyisdifficult/jiratui/v1.1.0,1367,40,1367,Python,2025-08-04 15:10:45+00:00,2026-01-09 22:45:43+00:00,"[['atlassian', 'atlassian-jira', 'cli', 'devel...",v1.1.0,## What's Changed\r\n\r\n## Added\r\n\r\n* Add...,30,1,User,2025-09-20 13:40:38+00:00,32
4046,whyisdifficult/jiratui/v1.0.0,1367,40,1367,Python,2025-08-04 15:10:45+00:00,2026-01-09 22:45:43+00:00,"[['atlassian', 'atlassian-jira', 'cli', 'devel...",v1.0.0,## What's Changed\r\n\r\n### Breaking Changes\...,30,1,User,2025-09-16 17:21:47+00:00,63
4047,whyisdifficult/jiratui/v0.2.0,1367,40,1367,Python,2025-08-04 15:10:45+00:00,2026-01-09 22:45:43+00:00,"[['atlassian', 'atlassian-jira', 'cli', 'devel...",v0.2.0,## What's Changed\r\n\r\n### Application\r\n* ...,30,1,User,2025-09-13 10:07:39+00:00,125


In [78]:
trending_df = trending_fg.read()
trending_df

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (2.65s) 


Unnamed: 0,date,all_topics
0,2025-05-09 00:00:00+00:00,"['chatgpt', 'domain', 'github', [], 'node-base..."
1,2025-09-16 00:00:00+00:00,"['spec', 'tts', 'chatgpt', 'generative-ai', 'd..."
2,2025-01-07 00:00:00+00:00,"['obsidian', 'artificial-intelligence', 'image..."
3,2025-06-19 00:00:00+00:00,"['shadcn-ui', 'etl-framework', 'claude-usage',..."
4,2025-10-03 00:00:00+00:00,"['decentralized', 'enterprise', 'workflow-auto..."
...,...,...
370,2025-10-20 00:00:00+00:00,"['chineseocr', 'pp-ocr', 'spec', 'insomnia-alt..."
371,2025-12-17 00:00:00+00:00,"['artificial-intelligence', 'daisydisk', 'phon..."
372,2025-06-27 00:00:00+00:00,"['hacktoberfest', 'web'], 'chatgpt', 'browser'..."
373,2025-09-29 00:00:00+00:00,"['spec', 'chatgpt', 'windows'], 'dba-roadmap',..."


In [79]:
release_topics_df = release_df.copy()
trending_topics_df = trending_df.copy()
release_topics_df['date_only'] = pd.to_datetime(release_df['published_at']).dt.date
trending_topics_df['date'] = pd.to_datetime(trending_topics_df['date']).dt.date

topic_map = {row['date']: set(row['all_topics']) for _, row in trending_topics_df.iterrows()}

release_topics_df['is_trending'] = release_topics_df.apply(
    lambda row: bool(set(row['topics']) & topic_map.get(row['date_only'], set())),
    axis=1
)

print(f"Success Matching: {release_topics_df['is_trending'].sum()} / {len(release_topics_df)}")
release_topics_df

Success Matching: 2496 / 4049


Unnamed: 0,full_name,repo_stars,repo_forks,repo_watchers,language,repo_created_at,repo_updated_at,topics,release_name,release_body,author_followers,author_public_repos,author_type,published_at,first_week_star,date_only,is_trending
0,HANCORE-linux/waybar-themes/v2.1.4,113,1,113,CSS,2025-09-06 22:08:06+00:00,2026-01-09 15:00:36+00:00,[[]],v2.1.4,"# ðŸŽ‰ New Waybar Release ""V2.1a & V2.1b â€“ Oxoca...",52,21,User,2026-01-08 20:07:56+00:00,3,2026-01-08,True
1,HANCORE-linux/waybar-themes/v2.1.3,113,1,113,CSS,2025-09-06 22:08:06+00:00,2026-01-09 15:00:36+00:00,[[]],v2.1.3,## ðŸ”§ Changes\r\n### V3-min3\r\n- Adjusted back...,52,21,User,2026-01-05 23:56:48+00:00,11,2026-01-05,True
2,HANCORE-linux/waybar-themes/v2.1.2,113,1,113,CSS,2025-09-06 22:08:06+00:00,2026-01-09 15:00:36+00:00,[[]],v2.1.2,# Changes\r\n#### V3-min3\r\n- reworked entire...,52,21,User,2026-01-03 22:20:14+00:00,12,2026-01-03,True
3,HANCORE-linux/waybar-themes/v2.1.1,113,1,113,CSS,2025-09-06 22:08:06+00:00,2026-01-09 15:00:36+00:00,[[]],v2.1.1,"# ðŸŽ‰ New Waybar Release ""V6.fa""\r\n- Changed f...",52,21,User,2026-01-02 20:03:09+00:00,14,2026-01-02,True
4,HANCORE-linux/waybar-themes/v2.1.0,113,1,113,CSS,2025-09-06 22:08:06+00:00,2026-01-09 15:00:36+00:00,[[]],v2.1.0,"# ðŸŽ‰ New Waybar Release ""V6.f""\r\n- Cycles thr...",52,21,User,2026-01-02 14:25:38+00:00,15,2026-01-02,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4044,whyisdifficult/jiratui/v1.2.0,1367,40,1367,Python,2025-08-04 15:10:45+00:00,2026-01-09 22:45:43+00:00,"[['atlassian', 'atlassian-jira', 'cli', 'devel...",v1.2.0,## What's Changed\r\n\r\n* Add completions Sub...,30,1,User,2025-10-04 11:01:46+00:00,61,2025-10-04,True
4045,whyisdifficult/jiratui/v1.1.0,1367,40,1367,Python,2025-08-04 15:10:45+00:00,2026-01-09 22:45:43+00:00,"[['atlassian', 'atlassian-jira', 'cli', 'devel...",v1.1.0,## What's Changed\r\n\r\n## Added\r\n\r\n* Add...,30,1,User,2025-09-20 13:40:38+00:00,32,2025-09-20,False
4046,whyisdifficult/jiratui/v1.0.0,1367,40,1367,Python,2025-08-04 15:10:45+00:00,2026-01-09 22:45:43+00:00,"[['atlassian', 'atlassian-jira', 'cli', 'devel...",v1.0.0,## What's Changed\r\n\r\n### Breaking Changes\...,30,1,User,2025-09-16 17:21:47+00:00,63,2025-09-16,True
4047,whyisdifficult/jiratui/v0.2.0,1367,40,1367,Python,2025-08-04 15:10:45+00:00,2026-01-09 22:45:43+00:00,"[['atlassian', 'atlassian-jira', 'cli', 'devel...",v0.2.0,## What's Changed\r\n\r\n### Application\r\n* ...,30,1,User,2025-09-13 10:07:39+00:00,125,2025-09-13,True


In [81]:
release_topics_df = release_topics_df.drop(columns=['topics'])
release_df = release_topics_df
release_df

Unnamed: 0,full_name,repo_stars,repo_forks,repo_watchers,language,repo_created_at,repo_updated_at,release_name,release_body,author_followers,author_public_repos,author_type,published_at,first_week_star,date_only,is_trending
0,HANCORE-linux/waybar-themes/v2.1.4,113,1,113,CSS,2025-09-06 22:08:06+00:00,2026-01-09 15:00:36+00:00,v2.1.4,"# ðŸŽ‰ New Waybar Release ""V2.1a & V2.1b â€“ Oxoca...",52,21,User,2026-01-08 20:07:56+00:00,3,2026-01-08,True
1,HANCORE-linux/waybar-themes/v2.1.3,113,1,113,CSS,2025-09-06 22:08:06+00:00,2026-01-09 15:00:36+00:00,v2.1.3,## ðŸ”§ Changes\r\n### V3-min3\r\n- Adjusted back...,52,21,User,2026-01-05 23:56:48+00:00,11,2026-01-05,True
2,HANCORE-linux/waybar-themes/v2.1.2,113,1,113,CSS,2025-09-06 22:08:06+00:00,2026-01-09 15:00:36+00:00,v2.1.2,# Changes\r\n#### V3-min3\r\n- reworked entire...,52,21,User,2026-01-03 22:20:14+00:00,12,2026-01-03,True
3,HANCORE-linux/waybar-themes/v2.1.1,113,1,113,CSS,2025-09-06 22:08:06+00:00,2026-01-09 15:00:36+00:00,v2.1.1,"# ðŸŽ‰ New Waybar Release ""V6.fa""\r\n- Changed f...",52,21,User,2026-01-02 20:03:09+00:00,14,2026-01-02,True
4,HANCORE-linux/waybar-themes/v2.1.0,113,1,113,CSS,2025-09-06 22:08:06+00:00,2026-01-09 15:00:36+00:00,v2.1.0,"# ðŸŽ‰ New Waybar Release ""V6.f""\r\n- Cycles thr...",52,21,User,2026-01-02 14:25:38+00:00,15,2026-01-02,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4044,whyisdifficult/jiratui/v1.2.0,1367,40,1367,Python,2025-08-04 15:10:45+00:00,2026-01-09 22:45:43+00:00,v1.2.0,## What's Changed\r\n\r\n* Add completions Sub...,30,1,User,2025-10-04 11:01:46+00:00,61,2025-10-04,True
4045,whyisdifficult/jiratui/v1.1.0,1367,40,1367,Python,2025-08-04 15:10:45+00:00,2026-01-09 22:45:43+00:00,v1.1.0,## What's Changed\r\n\r\n## Added\r\n\r\n* Add...,30,1,User,2025-09-20 13:40:38+00:00,32,2025-09-20,False
4046,whyisdifficult/jiratui/v1.0.0,1367,40,1367,Python,2025-08-04 15:10:45+00:00,2026-01-09 22:45:43+00:00,v1.0.0,## What's Changed\r\n\r\n### Breaking Changes\...,30,1,User,2025-09-16 17:21:47+00:00,63,2025-09-16,True
4047,whyisdifficult/jiratui/v0.2.0,1367,40,1367,Python,2025-08-04 15:10:45+00:00,2026-01-09 22:45:43+00:00,v0.2.0,## What's Changed\r\n\r\n### Application\r\n* ...,30,1,User,2025-09-13 10:07:39+00:00,125,2025-09-13,True


In [82]:
# Calculate the duration of a repository
release_df['repo_duration'] = (release_df['repo_updated_at'] - release_df['repo_created_at']).dt.days
release_df = release_df.drop(columns=['repo_updated_at', 'repo_created_at'])
release_df

Unnamed: 0,full_name,repo_stars,repo_forks,repo_watchers,language,release_name,release_body,author_followers,author_public_repos,author_type,published_at,first_week_star,date_only,is_trending,repo_duration
0,HANCORE-linux/waybar-themes/v2.1.4,113,1,113,CSS,v2.1.4,"# ðŸŽ‰ New Waybar Release ""V2.1a & V2.1b â€“ Oxoca...",52,21,User,2026-01-08 20:07:56+00:00,3,2026-01-08,True,124
1,HANCORE-linux/waybar-themes/v2.1.3,113,1,113,CSS,v2.1.3,## ðŸ”§ Changes\r\n### V3-min3\r\n- Adjusted back...,52,21,User,2026-01-05 23:56:48+00:00,11,2026-01-05,True,124
2,HANCORE-linux/waybar-themes/v2.1.2,113,1,113,CSS,v2.1.2,# Changes\r\n#### V3-min3\r\n- reworked entire...,52,21,User,2026-01-03 22:20:14+00:00,12,2026-01-03,True,124
3,HANCORE-linux/waybar-themes/v2.1.1,113,1,113,CSS,v2.1.1,"# ðŸŽ‰ New Waybar Release ""V6.fa""\r\n- Changed f...",52,21,User,2026-01-02 20:03:09+00:00,14,2026-01-02,True,124
4,HANCORE-linux/waybar-themes/v2.1.0,113,1,113,CSS,v2.1.0,"# ðŸŽ‰ New Waybar Release ""V6.f""\r\n- Cycles thr...",52,21,User,2026-01-02 14:25:38+00:00,15,2026-01-02,True,124
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4044,whyisdifficult/jiratui/v1.2.0,1367,40,1367,Python,v1.2.0,## What's Changed\r\n\r\n* Add completions Sub...,30,1,User,2025-10-04 11:01:46+00:00,61,2025-10-04,True,158
4045,whyisdifficult/jiratui/v1.1.0,1367,40,1367,Python,v1.1.0,## What's Changed\r\n\r\n## Added\r\n\r\n* Add...,30,1,User,2025-09-20 13:40:38+00:00,32,2025-09-20,False,158
4046,whyisdifficult/jiratui/v1.0.0,1367,40,1367,Python,v1.0.0,## What's Changed\r\n\r\n### Breaking Changes\...,30,1,User,2025-09-16 17:21:47+00:00,63,2025-09-16,True,158
4047,whyisdifficult/jiratui/v0.2.0,1367,40,1367,Python,v0.2.0,## What's Changed\r\n\r\n### Application\r\n* ...,30,1,User,2025-09-13 10:07:39+00:00,125,2025-09-13,True,158


In [83]:
# one-hot author type
release_df['org_author'] = (release_df['author_type'] == 'Organization').astype(int)
release_df['user_author'] = (release_df['author_type'] == 'User').astype(int)
release_df = release_df.drop(columns=['author_type'])
release_df

Unnamed: 0,full_name,repo_stars,repo_forks,repo_watchers,language,release_name,release_body,author_followers,author_public_repos,published_at,first_week_star,date_only,is_trending,repo_duration,org_author,user_author
0,HANCORE-linux/waybar-themes/v2.1.4,113,1,113,CSS,v2.1.4,"# ðŸŽ‰ New Waybar Release ""V2.1a & V2.1b â€“ Oxoca...",52,21,2026-01-08 20:07:56+00:00,3,2026-01-08,True,124,0,1
1,HANCORE-linux/waybar-themes/v2.1.3,113,1,113,CSS,v2.1.3,## ðŸ”§ Changes\r\n### V3-min3\r\n- Adjusted back...,52,21,2026-01-05 23:56:48+00:00,11,2026-01-05,True,124,0,1
2,HANCORE-linux/waybar-themes/v2.1.2,113,1,113,CSS,v2.1.2,# Changes\r\n#### V3-min3\r\n- reworked entire...,52,21,2026-01-03 22:20:14+00:00,12,2026-01-03,True,124,0,1
3,HANCORE-linux/waybar-themes/v2.1.1,113,1,113,CSS,v2.1.1,"# ðŸŽ‰ New Waybar Release ""V6.fa""\r\n- Changed f...",52,21,2026-01-02 20:03:09+00:00,14,2026-01-02,True,124,0,1
4,HANCORE-linux/waybar-themes/v2.1.0,113,1,113,CSS,v2.1.0,"# ðŸŽ‰ New Waybar Release ""V6.f""\r\n- Cycles thr...",52,21,2026-01-02 14:25:38+00:00,15,2026-01-02,True,124,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4044,whyisdifficult/jiratui/v1.2.0,1367,40,1367,Python,v1.2.0,## What's Changed\r\n\r\n* Add completions Sub...,30,1,2025-10-04 11:01:46+00:00,61,2025-10-04,True,158,0,1
4045,whyisdifficult/jiratui/v1.1.0,1367,40,1367,Python,v1.1.0,## What's Changed\r\n\r\n## Added\r\n\r\n* Add...,30,1,2025-09-20 13:40:38+00:00,32,2025-09-20,False,158,0,1
4046,whyisdifficult/jiratui/v1.0.0,1367,40,1367,Python,v1.0.0,## What's Changed\r\n\r\n### Breaking Changes\...,30,1,2025-09-16 17:21:47+00:00,63,2025-09-16,True,158,0,1
4047,whyisdifficult/jiratui/v0.2.0,1367,40,1367,Python,v0.2.0,## What's Changed\r\n\r\n### Application\r\n* ...,30,1,2025-09-13 10:07:39+00:00,125,2025-09-13,True,158,0,1


In [84]:
# whether release publish date is weekday or weekend
release_df['publish_is_weekday'] = (release_df['published_at'].dt.dayofweek <= 4).astype(int)
release_df

Unnamed: 0,full_name,repo_stars,repo_forks,repo_watchers,language,release_name,release_body,author_followers,author_public_repos,published_at,first_week_star,date_only,is_trending,repo_duration,org_author,user_author,publish_is_weekday
0,HANCORE-linux/waybar-themes/v2.1.4,113,1,113,CSS,v2.1.4,"# ðŸŽ‰ New Waybar Release ""V2.1a & V2.1b â€“ Oxoca...",52,21,2026-01-08 20:07:56+00:00,3,2026-01-08,True,124,0,1,1
1,HANCORE-linux/waybar-themes/v2.1.3,113,1,113,CSS,v2.1.3,## ðŸ”§ Changes\r\n### V3-min3\r\n- Adjusted back...,52,21,2026-01-05 23:56:48+00:00,11,2026-01-05,True,124,0,1,1
2,HANCORE-linux/waybar-themes/v2.1.2,113,1,113,CSS,v2.1.2,# Changes\r\n#### V3-min3\r\n- reworked entire...,52,21,2026-01-03 22:20:14+00:00,12,2026-01-03,True,124,0,1,0
3,HANCORE-linux/waybar-themes/v2.1.1,113,1,113,CSS,v2.1.1,"# ðŸŽ‰ New Waybar Release ""V6.fa""\r\n- Changed f...",52,21,2026-01-02 20:03:09+00:00,14,2026-01-02,True,124,0,1,1
4,HANCORE-linux/waybar-themes/v2.1.0,113,1,113,CSS,v2.1.0,"# ðŸŽ‰ New Waybar Release ""V6.f""\r\n- Cycles thr...",52,21,2026-01-02 14:25:38+00:00,15,2026-01-02,True,124,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4044,whyisdifficult/jiratui/v1.2.0,1367,40,1367,Python,v1.2.0,## What's Changed\r\n\r\n* Add completions Sub...,30,1,2025-10-04 11:01:46+00:00,61,2025-10-04,True,158,0,1,0
4045,whyisdifficult/jiratui/v1.1.0,1367,40,1367,Python,v1.1.0,## What's Changed\r\n\r\n## Added\r\n\r\n* Add...,30,1,2025-09-20 13:40:38+00:00,32,2025-09-20,False,158,0,1,0
4046,whyisdifficult/jiratui/v1.0.0,1367,40,1367,Python,v1.0.0,## What's Changed\r\n\r\n### Breaking Changes\...,30,1,2025-09-16 17:21:47+00:00,63,2025-09-16,True,158,0,1,1
4047,whyisdifficult/jiratui/v0.2.0,1367,40,1367,Python,v0.2.0,## What's Changed\r\n\r\n### Application\r\n* ...,30,1,2025-09-13 10:07:39+00:00,125,2025-09-13,True,158,0,1,0


In [85]:
# one-hot programming language
top_n = 20
top_languages = release_df['language'].value_counts().head(top_n).index
release_df['language_group'] = release_df['language'].where(release_df['language'].isin(top_languages), 'Other')
print("Language Distribution:")
print(release_df['language_group'].value_counts(normalize=True).head(15))

release_df = pd.concat([release_df.drop(['language', 'language_group'], axis=1), 
                pd.get_dummies(release_df['language_group'], prefix='language_')], axis=1)
release_df.columns = release_df.columns.str.replace('C#', 'Csharp').str.replace('C++', 'Cplusplus')
release_df

Language Distribution:
language_group
TypeScript    0.298345
Python        0.214868
Go            0.076809
JavaScript    0.067177
Rust          0.065942
Other         0.048160
C#            0.036058
Kotlin        0.035564
Swift         0.021487
C             0.019017
Vue           0.016794
Java          0.016547
PHP           0.012349
Shell         0.010620
C++           0.010373
Name: proportion, dtype: float64


Unnamed: 0,full_name,repo_stars,repo_forks,repo_watchers,release_name,release_body,author_followers,author_public_repos,published_at,first_week_star,...,language__Other,language__PHP,language__Python,language__QML,language__Rust,language__Shell,language__Svelte,language__Swift,language__TypeScript,language__Vue
0,HANCORE-linux/waybar-themes/v2.1.4,113,1,113,v2.1.4,"# ðŸŽ‰ New Waybar Release ""V2.1a & V2.1b â€“ Oxoca...",52,21,2026-01-08 20:07:56+00:00,3,...,False,False,False,False,False,False,False,False,False,False
1,HANCORE-linux/waybar-themes/v2.1.3,113,1,113,v2.1.3,## ðŸ”§ Changes\r\n### V3-min3\r\n- Adjusted back...,52,21,2026-01-05 23:56:48+00:00,11,...,False,False,False,False,False,False,False,False,False,False
2,HANCORE-linux/waybar-themes/v2.1.2,113,1,113,v2.1.2,# Changes\r\n#### V3-min3\r\n- reworked entire...,52,21,2026-01-03 22:20:14+00:00,12,...,False,False,False,False,False,False,False,False,False,False
3,HANCORE-linux/waybar-themes/v2.1.1,113,1,113,v2.1.1,"# ðŸŽ‰ New Waybar Release ""V6.fa""\r\n- Changed f...",52,21,2026-01-02 20:03:09+00:00,14,...,False,False,False,False,False,False,False,False,False,False
4,HANCORE-linux/waybar-themes/v2.1.0,113,1,113,v2.1.0,"# ðŸŽ‰ New Waybar Release ""V6.f""\r\n- Cycles thr...",52,21,2026-01-02 14:25:38+00:00,15,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4044,whyisdifficult/jiratui/v1.2.0,1367,40,1367,v1.2.0,## What's Changed\r\n\r\n* Add completions Sub...,30,1,2025-10-04 11:01:46+00:00,61,...,False,False,True,False,False,False,False,False,False,False
4045,whyisdifficult/jiratui/v1.1.0,1367,40,1367,v1.1.0,## What's Changed\r\n\r\n## Added\r\n\r\n* Add...,30,1,2025-09-20 13:40:38+00:00,32,...,False,False,True,False,False,False,False,False,False,False
4046,whyisdifficult/jiratui/v1.0.0,1367,40,1367,v1.0.0,## What's Changed\r\n\r\n### Breaking Changes\...,30,1,2025-09-16 17:21:47+00:00,63,...,False,False,True,False,False,False,False,False,False,False
4047,whyisdifficult/jiratui/v0.2.0,1367,40,1367,v0.2.0,## What's Changed\r\n\r\n### Application\r\n* ...,30,1,2025-09-13 10:07:39+00:00,125,...,False,False,True,False,False,False,False,False,False,False


In [None]:
# # Normalization
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# need_scaling = [
#     'repo_stars',
#     'repo_forks',
#     'repo_watchers',
#     'author_followers',
#     'author_public_repos',
#     'repo_duration'
# ]
# scaled_features = scaler.fit_transform(release_df[need_scaling])
# release_df_scaled = pd.DataFrame(scaled_features, columns=need_scaling, index=release_df.index)
# for col in need_scaling:
#     release_df[col] = release_df_scaled[col]
# release_df

Unnamed: 0,full_name,repo_stars,repo_forks,repo_watchers,topics,release_name,release_body,author_followers,author_public_repos,published_at,...,language__Other,language__PHP,language__Python,language__QML,language__Rust,language__Shell,language__Svelte,language__Swift,language__TypeScript,language__Vue
0,HANCORE-linux/waybar-themes/v2.1.4,-1.824037,-0.368420,-1.824037,[],v2.1.4,"# ðŸŽ‰ New Waybar Release ""V2.1a & V2.1b â€“ Oxoca...",-0.235755,-0.149384,2026-01-08 20:07:56+00:00,...,False,False,False,False,False,False,False,False,False,False
1,HANCORE-linux/waybar-themes/v2.1.3,-1.824037,-0.368420,-1.824037,[],v2.1.3,## ðŸ”§ Changes\r\n### V3-min3\r\n- Adjusted back...,-0.235755,-0.149384,2026-01-05 23:56:48+00:00,...,False,False,False,False,False,False,False,False,False,False
2,HANCORE-linux/waybar-themes/v2.1.2,-1.824037,-0.368420,-1.824037,[],v2.1.2,# Changes\r\n#### V3-min3\r\n- reworked entire...,-0.235755,-0.149384,2026-01-03 22:20:14+00:00,...,False,False,False,False,False,False,False,False,False,False
3,HANCORE-linux/waybar-themes/v2.1.1,-1.824037,-0.368420,-1.824037,[],v2.1.1,"# ðŸŽ‰ New Waybar Release ""V6.fa""\r\n- Changed f...",-0.235755,-0.149384,2026-01-02 20:03:09+00:00,...,False,False,False,False,False,False,False,False,False,False
4,HANCORE-linux/waybar-themes/v2.1.0,-1.824037,-0.368420,-1.824037,[],v2.1.0,"# ðŸŽ‰ New Waybar Release ""V6.f""\r\n- Cycles thr...",-0.235755,-0.149384,2026-01-02 14:25:38+00:00,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4044,whyisdifficult/jiratui/v1.2.0,-0.485108,-0.307315,-0.485108,"['atlassian', 'atlassian-jira', 'cli', 'develo...",v1.2.0,## What's Changed\r\n\r\n* Add completions Sub...,-0.237363,-0.178537,2025-10-04 11:01:46+00:00,...,False,False,True,False,False,False,False,False,False,False
4045,whyisdifficult/jiratui/v1.1.0,-0.485108,-0.307315,-0.485108,"['atlassian', 'atlassian-jira', 'cli', 'develo...",v1.1.0,## What's Changed\r\n\r\n## Added\r\n\r\n* Add...,-0.237363,-0.178537,2025-09-20 13:40:38+00:00,...,False,False,True,False,False,False,False,False,False,False
4046,whyisdifficult/jiratui/v1.0.0,-0.485108,-0.307315,-0.485108,"['atlassian', 'atlassian-jira', 'cli', 'develo...",v1.0.0,## What's Changed\r\n\r\n### Breaking Changes\...,-0.237363,-0.178537,2025-09-16 17:21:47+00:00,...,False,False,True,False,False,False,False,False,False,False
4047,whyisdifficult/jiratui/v0.2.0,-0.485108,-0.307315,-0.485108,"['atlassian', 'atlassian-jira', 'cli', 'develo...",v0.2.0,## What's Changed\r\n\r\n### Application\r\n* ...,-0.237363,-0.178537,2025-09-13 10:07:39+00:00,...,False,False,True,False,False,False,False,False,False,False


In [86]:
# Check missing values
missing_values = release_df.isnull().sum()
missing_info = pd.DataFrame({
    'Missing nums': missing_values
})
print(missing_info[missing_info['Missing nums'] > 0])

              Missing nums
release_name            73
release_body           172


In [87]:
# Fill missing values
release_df_filled = release_df.copy()
release_df_filled['release_name'] = release_df_filled['release_name'].fillna('')
release_df_filled['release_body'] = release_df_filled['release_body'].fillna('')
release_df_filled

Unnamed: 0,full_name,repo_stars,repo_forks,repo_watchers,release_name,release_body,author_followers,author_public_repos,published_at,first_week_star,...,language__Other,language__PHP,language__Python,language__QML,language__Rust,language__Shell,language__Svelte,language__Swift,language__TypeScript,language__Vue
0,HANCORE-linux/waybar-themes/v2.1.4,113,1,113,v2.1.4,"# ðŸŽ‰ New Waybar Release ""V2.1a & V2.1b â€“ Oxoca...",52,21,2026-01-08 20:07:56+00:00,3,...,False,False,False,False,False,False,False,False,False,False
1,HANCORE-linux/waybar-themes/v2.1.3,113,1,113,v2.1.3,## ðŸ”§ Changes\r\n### V3-min3\r\n- Adjusted back...,52,21,2026-01-05 23:56:48+00:00,11,...,False,False,False,False,False,False,False,False,False,False
2,HANCORE-linux/waybar-themes/v2.1.2,113,1,113,v2.1.2,# Changes\r\n#### V3-min3\r\n- reworked entire...,52,21,2026-01-03 22:20:14+00:00,12,...,False,False,False,False,False,False,False,False,False,False
3,HANCORE-linux/waybar-themes/v2.1.1,113,1,113,v2.1.1,"# ðŸŽ‰ New Waybar Release ""V6.fa""\r\n- Changed f...",52,21,2026-01-02 20:03:09+00:00,14,...,False,False,False,False,False,False,False,False,False,False
4,HANCORE-linux/waybar-themes/v2.1.0,113,1,113,v2.1.0,"# ðŸŽ‰ New Waybar Release ""V6.f""\r\n- Cycles thr...",52,21,2026-01-02 14:25:38+00:00,15,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4044,whyisdifficult/jiratui/v1.2.0,1367,40,1367,v1.2.0,## What's Changed\r\n\r\n* Add completions Sub...,30,1,2025-10-04 11:01:46+00:00,61,...,False,False,True,False,False,False,False,False,False,False
4045,whyisdifficult/jiratui/v1.1.0,1367,40,1367,v1.1.0,## What's Changed\r\n\r\n## Added\r\n\r\n* Add...,30,1,2025-09-20 13:40:38+00:00,32,...,False,False,True,False,False,False,False,False,False,False
4046,whyisdifficult/jiratui/v1.0.0,1367,40,1367,v1.0.0,## What's Changed\r\n\r\n### Breaking Changes\...,30,1,2025-09-16 17:21:47+00:00,63,...,False,False,True,False,False,False,False,False,False,False
4047,whyisdifficult/jiratui/v0.2.0,1367,40,1367,v0.2.0,## What's Changed\r\n\r\n### Application\r\n* ...,30,1,2025-09-13 10:07:39+00:00,125,...,False,False,True,False,False,False,False,False,False,False


## Create Feature Groups and insert DataFrames

In [88]:
fs = project.get_feature_store()

In [89]:
release_fg = fs.get_or_create_feature_group(
    name='release_info',
    description='Github release information',
    version=1,
    primary_key=['full_name'],
    event_time="published_at"
)

In [90]:
release_fg.insert(release_df_filled)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1342613/fs/1331268/fg/1911315


Uploading Dataframe: 100.00% |â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| Rows 4049/4049 | Elapsed Time: 00:05 | Remaining Time: 00:00


Launching job: release_info_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1342613/jobs/named/release_info_1_offline_fg_materialization/executions


(Job('release_info_1_offline_fg_materialization', 'SPARK'), None)