In [1]:
import json
from collections import Counter

import numpy as np
import pandas as pd

In [16]:
# Load the node-id -> feature-list mapping. The context manager closes the
# file handle deterministically instead of leaving it open until garbage
# collection.
with open('musae_git_features.json', encoding='utf-8') as features_file:
    id_to_features = json.load(features_file)

# Node ids in the order the JSON object lists them; they label the matrix rows.
ids = list(id_to_features.keys())

# One-column frame holding each node's raw feature list (kept so that any cell
# still referring to `df` keeps working).
df = pd.DataFrame({"features": list(id_to_features.values())})

# Count feature occurrences per node with plain Counters and build the frame
# in a single constructor call. This yields the same counts as calling
# pd.Series(x).value_counts() on every row, without creating one pandas Series
# per node. Features a node does not have come out as NaN, hence fillna(0)
# before the integer cast.
matrix = (
    pd.DataFrame([Counter(features) for features in df['features']], index=ids)
    .fillna(0)
    .astype(int)
)

# Put the feature columns in sorted label order
matrix = matrix.reindex(sorted(matrix.columns), axis=1)

In [17]:
# Load the per-row `name` and `ml_target` columns from the target file.
target = pd.read_csv('musae_git_target.csv', usecols=['name', 'ml_target'])

# pd.concat(axis=1) aligns rows on the index, so give both frames the same
# positional 0..n-1 index first. Rebinding (instead of inplace=True) keeps the
# step explicit; `matrix` ends up with a positional index either way.
matrix = matrix.reset_index(drop=True)
target = target.reset_index(drop=True)

# Rows are paired purely by position. With different lengths concat would pad
# the shorter frame with NaN rows without raising, so fail loudly instead.
# NOTE(review): this also assumes the CSV rows are in the same order as the
# JSON keys used to build `matrix` -- confirm against the data files.
if len(matrix) != len(target):
    raise ValueError(
        f"Row count mismatch: {len(matrix)} feature rows vs {len(target)} target rows"
    )

table = pd.concat([matrix, target], axis=1)
table

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3997,3998,3999,4000,4001,4002,4003,4004,name,ml_target
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Eiryyy,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,shawflying,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,JpMCarrilho,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,SuhwanCha,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,sunilangadi2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37695,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,shawnwanderson,1
37696,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,kris-ipeh,0
37697,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,qpautrat,0
37698,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Injabie3,1


In [18]:
# Count how many distinct values each column holds
table.apply(pd.Series.nunique)

Unnamed: 0,0
0,2
1,2
2,2
3,2
4,2
...,...
4002,2
4003,2
4004,2
name,37700


In [19]:
# List the distinct values that occur in each column
table.apply(pd.Series.unique)

Unnamed: 0,0
0,"[0, 1]"
1,"[0, 1]"
2,"[0, 1]"
3,"[0, 1]"
4,"[0, 1]"
...,...
4002,"[0, 1]"
4003,"[0, 1]"
4004,"[0, 1]"
name,"[Eiryyy, shawflying, JpMCarrilho, SuhwanCha, s..."


In [21]:
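# Fill numerical columns with the median
# NOTE(review): the imputation below is commented out and never runs. The
# missing-value check in this notebook prints 0, so it looks unnecessary for
# this data -- either delete this cell or re-enable it deliberately.
#numerical_columns = table.select_dtypes(include=[np.number]).columns
#for col in numerical_columns:
    #table[col] = table[col].fillna(table[col].median())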

In [24]:
# Fill missing values in object/category columns with each column's most
# frequent value.
categorical_columns = table.select_dtypes(include=['object', 'category']).columns
for col in categorical_columns:
    modes = table[col].mode()
    # mode() ignores NaN, so it is empty for a column with no non-missing
    # values; indexing [0] would raise there. Such a column has no sensible
    # fill value and is left untouched.
    if modes.empty:
        continue
    # When several values tie, mode() returns all of them; take the first.
    table[col] = table[col].fillna(modes.iloc[0])

In [20]:
# Total number of missing cells across the whole table
missing_values = table.isna().to_numpy().sum()
print(missing_values)

0


In [7]:
# The feature count is the column count of the feature matrix
_, num_features = matrix.shape
print(f"Number of features: {num_features}")

Number of features: 4005


In [22]:
# Plain-text preview of the first five rows of the combined table
first_rows = table.head(n=5)
print(first_rows)

   0  1  2  3  4  5  6  7  8  9  ...  3997  3998  3999  4000  4001  4002  \
0  0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
1  0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
2  0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
3  0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
4  0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   

   4003  4004          name  ml_target  
0     0     0        Eiryyy          0  
1     0     0    shawflying          0  
2     0     0   JpMCarrilho          1  
3     1     0     SuhwanCha          0  
4     0     0  sunilangadi2          1  

[5 rows x 4007 columns]


In [8]:
from sklearn.preprocessing import StandardScaler

In [12]:
# Work on a copy so the unscaled `table` stays available
table_standardized = table.copy()

# Make every column label a string so the frame has uniformly typed labels
table_standardized.columns = table_standardized.columns.astype(str)

# Select numerical columns (after ensuring column names are strings)
numerical_columns = table_standardized.select_dtypes(include=[np.number]).columns

# `ml_target` is numeric too, but it is the label rather than a feature:
# scaling it would replace the class values with floats. Keep it out of the
# scaled set (errors='ignore' keeps this a no-op if the column is absent).
feature_columns = numerical_columns.drop('ml_target', errors='ignore')

# The notebook imports StandardScaler, but no visible cell instantiates it;
# create the scaler here so this cell also runs on a fresh kernel.
scaler = StandardScaler()

# Standardize the feature columns: per column, subtract the mean and divide by
# the standard deviation
table_standardized[feature_columns] = scaler.fit_transform(table_standardized[feature_columns])

# Check the standardized table
print(table_standardized.head())


          0        1        2         3         4         5         6  \
0 -0.019951 -0.03457 -0.02124 -0.011517 -0.012617 -0.017844 -0.021856   
1 -0.019951 -0.03457 -0.02124 -0.011517 -0.012617 -0.017844 -0.021856   
2 -0.019951 -0.03457 -0.02124 -0.011517 -0.012617 -0.017844 -0.021856   
3 -0.019951 -0.03457 -0.02124 -0.011517 -0.012617 -0.017844 -0.021856   
4 -0.019951 -0.03457 -0.02124 -0.011517 -0.012617 -0.017844 -0.021856   

          7         8         9  ...      3997      3998     3999      4000  \
0 -0.013628 -0.063835 -0.017844  ... -0.037165 -0.020605 -0.03259 -0.105244   
1 -0.013628 -0.063835 -0.017844  ... -0.037165 -0.020605 -0.03259 -0.105244   
2 -0.013628 -0.063835 -0.017844  ... -0.037165 -0.020605 -0.03259 -0.105244   
3 -0.013628 -0.063835 -0.017844  ... -0.037165 -0.020605 -0.03259 -0.105244   
4 -0.013628 -0.063835 -0.017844  ... -0.037165 -0.020605 -0.03259 -0.105244   

       4001      4002      4003      4004          name  ml_target  
0 -0.007284 -0.03