In [1]:
import pandas as pd
import json

In [2]:
# Load the per-entry feature lists. The context manager closes the file handle
# deterministically instead of leaving it open until garbage collection.
with open('musae_git_features.json') as features_file:
    id_to_features = json.load(features_file)

# One row per JSON entry; each cell holds that entry's list of features.
df = pd.DataFrame({"features": list(id_to_features.values())})

# Count how often each feature occurs in every row: value_counts() produces one
# Series per row, apply() aligns those Series into columns, and features that a
# row does not contain (NaN after alignment) are turned into 0.
# NOTE(review): one Series is built per row, which may be slow on large inputs.
matrix = df['features'].apply(lambda x: pd.Series(x).value_counts()).fillna(0).astype(int)

# Label the rows with the JSON keys (same order as the values used above).
ids = list(id_to_features.keys())
matrix.index = ids

# Put the feature columns in sorted label order.
matrix = matrix.reindex(sorted(matrix.columns), axis=1)

In [3]:
# Read the edge list; its 'id_1' and 'id_2' columns are used further down to
# collect the neighbour ids attached to each row of the table.
edges = pd.read_csv('musae_git_edges.csv')

In [5]:
# Read the display name and label columns from the target file.
target = pd.read_csv('musae_git_target.csv', usecols=['name', 'ml_target'])

# Give both frames a plain 0..n-1 index so concat pairs their rows by position.
# NOTE(review): this assumes the JSON entries and the target rows are stored in
# the same order -- confirm, or join on an explicit id column instead.
matrix = matrix.reset_index(drop=True)
target = target.reset_index(drop=True)
table = pd.concat([matrix, target], axis=1)

# Build id_1 -> [id_2, ...] in a single pass over the edge list. Filtering the
# whole edge frame once per table row costs rows x edges comparisons; grouping
# up front yields the same lists, in the same file order, in linear time.
adjacency = {}
for source_id, neighbour_id in zip(edges['id_1'].tolist(), edges['id_2'].tolist()):
    adjacency.setdefault(source_id, []).append(neighbour_id)

# Each row label is looked up as an id_1 value; rows that never appear as id_1
# get their own fresh empty list.
# NOTE(review): only the id_1 -> id_2 direction is recorded. If the edges are
# meant to be undirected, rows that appear only as id_2 end up with [] -- confirm.
table['edges'] = [adjacency.get(row_id, []) for row_id in table.index]
table

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3998,3999,4000,4001,4002,4003,4004,name,ml_target,edges
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,Eiryyy,0,[23977]
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,shawflying,0,"[34526, 2370, 14683, 29982, 21142, 20363, 2383..."
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,JpMCarrilho,1,[3812]
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,SuhwanCha,0,"[4950, 18029, 3358, 34935, 5916]"
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,sunilangadi2,1,"[2865, 9342]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37695,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,shawnwanderson,1,[]
37696,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,kris-ipeh,0,[]
37697,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,qpautrat,0,[]
37698,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,Injabie3,1,[]


In [16]:
from sklearn.model_selection import train_test_split

# Separate the label from the inputs; 'name' is dropped from the inputs too,
# while every remaining column (including 'edges') stays in X.
y = table.loc[:, 'ml_target']
X = table.drop(['name', 'ml_target'], axis=1)

# First cut: 80% for training, 20% held back.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Second cut: halve the held-back part into validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)


In [17]:
# Show the validation features exactly as train_test_split returned them.
X_val

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3996,3997,3998,3999,4000,4001,4002,4003,4004,edges
34581,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[]
26433,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[]
35120,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[]
6425,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[30598, 15936, 36652, 22715, 13566, 18928, 318..."
24182,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,[28957]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23006,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[25748]
34431,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[]
30494,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[35570, 33243, 26788, 34718, 36732, 12354]"
13949,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[34975]


In [18]:
# Re-attach the labels to each partition and tag every row with the name of the
# partition it belongs to.
train_data = pd.concat([X_train, y_train], axis=1).assign(split='train')
val_data = pd.concat([X_val, y_val], axis=1).assign(split='val')
test_data = pd.concat([X_test, y_test], axis=1).assign(split='test')

# Stack the partitions (train, then val, then test) and persist them in one
# file. index=False means the row labels are not written out.
all_data = pd.concat([train_data, val_data, test_data])
all_data.to_csv('all_splits.csv', index=False)

In [19]:
# Reload the persisted splits. The CSV holds each 'edges' list as its text form
# (e.g. "[1, 2]"), so decode it back into a list while reading; without the
# converter the column would come back as plain strings. An empty cell is
# treated as "no edges".
all_data = pd.read_csv(
    'all_splits.csv',
    converters={'edges': lambda cell: json.loads(cell) if cell else []},
)

# Recover each partition from the 'split' marker column, then drop the marker.
train_data = all_data[all_data['split'] == 'train'].drop(columns=['split'])
val_data = all_data[all_data['split'] == 'val'].drop(columns=['split'])
test_data = all_data[all_data['split'] == 'test'].drop(columns=['split'])

# Separate the inputs from the 'ml_target' label for every partition.
X_train = train_data.drop(columns=['ml_target'])
y_train = train_data['ml_target']
X_test = test_data.drop(columns=['ml_target'])
y_test = test_data['ml_target']
X_val = val_data.drop(columns=['ml_target'])
y_val = val_data['ml_target']

In [20]:
# Show the validation features rebuilt from 'all_splits.csv'.
X_val

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3996,3997,3998,3999,4000,4001,4002,4003,4004,edges
30160,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[]
30161,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[]
30162,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[]
30163,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[30598, 15936, 36652, 22715, 13566, 18928, 318..."
30164,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,[28957]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33925,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[25748]
33926,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[]
33927,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[35570, 33243, 26788, 34718, 36732, 12354]"
33928,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[34975]
