In [40]:
import pandas as pd
import json

In [41]:
# Load the JSON mapping of node id -> list of feature ids.
# A context manager closes the file handle as soon as parsing is done
# (the bare open() call left it to the garbage collector).
with open('musae_git_features.json', encoding='utf-8') as features_file:
    id_to_features = json.load(features_file)

# One row per node, holding that node's raw feature list.
df = pd.DataFrame({"features": list(id_to_features.values())})

# Count how often each feature id occurs in every node's list; features a
# node does not have come back as NaN and are turned into 0.
matrix = df['features'].apply(lambda x: pd.Series(x).value_counts()).fillna(0).astype(int)

# Label the rows with the JSON keys (the node ids).
ids = list(id_to_features.keys())
matrix.index = ids

# Put the feature columns in sorted order.
matrix = matrix.reindex(sorted(matrix.columns), axis=1)

In [42]:
# Edge list of the graph; the edge-count step further down relies on its 'id_1' column.
edges = pd.read_csv('musae_git_edges.csv')

In [43]:
# Account name and label for every node.
target = pd.read_csv('musae_git_target.csv', usecols=['name', 'ml_target'])

# Features and labels are joined purely by row position, so both frames get a
# fresh 0..n-1 index first.
# NOTE(review): this assumes the JSON keys and the CSV rows are in the same
# node order - confirm, or join on the node id instead.
matrix.reset_index(drop=True, inplace=True)
target.reset_index(drop=True, inplace=True)
table = pd.concat([matrix, target], axis=1)

# num_edges = number of rows in `edges` whose 'id_1' equals the row position.
# A single value_counts pass replaces the per-row scan of the whole edge
# list; nodes that never appear as 'id_1' get 0.
# NOTE(review): edges where the node only appears as 'id_2' are not counted;
# if the edge list is undirected this is not the full degree - confirm intent.
edge_counts = edges['id_1'].value_counts()
table['num_edges'] = edge_counts.reindex(table.index, fill_value=0).to_numpy()

table

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3998,3999,4000,4001,4002,4003,4004,name,ml_target,num_edges
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,Eiryyy,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,shawflying,0,8
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,JpMCarrilho,1,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,SuhwanCha,0,5
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,sunilangadi2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37695,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,shawnwanderson,1,0
37696,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,kris-ipeh,0,0
37697,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,qpautrat,0,0
37698,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,Injabie3,1,0


In [44]:
from sklearn.model_selection import train_test_split

# The label column becomes y; everything except it and the account name is a feature.
y = table['ml_target']
X = table.drop(columns=['name', 'ml_target'])

# First hold out 20% of the rows, then cut that hold-out in half to get the
# validation and test sets. Both calls use the same fixed seed.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)


In [45]:
# Glue each feature split back onto its labels and tag every row with the
# name of the split it belongs to.
train_data = pd.concat([X_train, y_train], axis=1).assign(split='train')
val_data = pd.concat([X_val, y_val], axis=1).assign(split='val')
test_data = pd.concat([X_test, y_test], axis=1).assign(split='test')

# All three splits stacked into one file; the 'split' column tells them apart.
all_data = pd.concat([train_data, val_data, test_data])
all_data.to_csv('all_splits.csv', index=False)

# The test rows on their own, without the 'split' tag.
test2 = test_data.drop(columns=["split"])
test2.to_csv('test.csv', index=False)

In [None]:
# Read the combined file back and rebuild the three splits from the 'split' tag.
all_data = pd.read_csv('all_splits.csv')

split_tag = all_data['split']
train_data = all_data.loc[split_tag == 'train'].drop(columns=['split'])
val_data = all_data.loc[split_tag == 'val'].drop(columns=['split'])
test_data = all_data.loc[split_tag == 'test'].drop(columns=['split'])

# Separate each split into its feature frame and its 'ml_target' label series.
X_train, y_train = train_data.drop(columns=['ml_target']), train_data['ml_target']
X_test, y_test = test_data.drop(columns=['ml_target']), test_data['ml_target']
X_val, y_val = val_data.drop(columns=['ml_target']), val_data['ml_target']

In [None]:
import onnxruntime as ort
import numpy as np
from flask import Flask, request, jsonify

app = Flask(__name__)

# Where each exported ONNX model lives on disk.
logRegPath = './models/logreg.onnx'
mlpPath = './models/mlp.onnx'
svmPath = './models/svm.onnx'
knnPath = './models/knn.onnx'
xgbPath = './models/xgb.onnx'

# Open one inference session per model, once, so requests can reuse them.
logReg, mlp, svm, knn, xgb = (
    ort.InferenceSession(model_path)
    for model_path in (logRegPath, mlpPath, svmPath, knnPath, xgbPath)
)

@app.route('/prediction', methods=['GET', 'POST'])
def predict():
    """Run one of the loaded ONNX models on the values in the JSON request body.

    Expected JSON body:
        model:  one of 'logreg', 'mlp', 'svm', 'knn', 'xgbBoost'.
        values: a flat list of numbers (one sample) or a list of such lists.

    Returns:
        200 with {'prediction': [...]} holding the model's first output, or
        400 with {'error': ...} when the body is not a JSON object, 'values'
        is missing or not numeric, or 'model' is missing or unknown.

    POST is accepted in addition to GET because some clients and proxies do
    not forward a body on GET requests.
    """
    # silent=True yields None instead of raising on a missing or invalid body.
    data = request.get_json(silent=True)
    if not isinstance(data, dict) or 'values' not in data:
        return jsonify({'error': 'Values not provided'}), 400

    # Request name -> (label printed to the server output, ONNX session).
    sessions = {
        'logreg': ('logReg', logReg),
        'mlp': ('mlp', mlp),
        'svm': ('svm', svm),
        'knn': ('knn', knn),
        'xgbBoost': ('xgbBoost', xgb),
    }
    model = data.get('model')
    if not isinstance(model, str) or model not in sessions:
        return jsonify({'error': 'Unknown or missing model'}), 400
    label, session = sessions[model]
    print(label)

    try:
        input_data = np.array(data['values'], dtype=np.float32)
    except (TypeError, ValueError):
        return jsonify({'error': 'Values must be numeric'}), 400

    if model == 'svm':
        # The SVM request drops the trailing feature of every sample. Slicing
        # the last axis keeps this correct for a batch as well as for a single
        # flat sample (a plain [:-1] would drop the last sample of a batch).
        # NOTE(review): presumably the SVM was exported without the last
        # column - confirm against the training code.
        input_data = input_data[..., :-1]

    # A single flat sample becomes a batch of one.
    if len(input_data.shape) == 1:
        input_data = np.expand_dims(input_data, axis=0)

    # Ask the session for its input name rather than hard-coding it, and run
    # the model once.
    input_name = session.get_inputs()[0].name
    result = session.run(None, {input_name: input_data})

    return jsonify({'prediction': result[0].tolist()})


# Start the Flask server when this code runs as the main module.
# host='0.0.0.0' listens on every network interface; the port is 5000.
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://192.168.8.170:5000
Press CTRL+C to quit


svm


127.0.0.1 - - [21/Dec/2024 23:28:50] "GET /prediction HTTP/1.1" 200 -


knn


127.0.0.1 - - [21/Dec/2024 23:28:56] "GET /prediction HTTP/1.1" 200 -
127.0.0.1 - - [21/Dec/2024 23:29:05] "GET /prediction HTTP/1.1" 200 -


xgbBoost


127.0.0.1 - - [21/Dec/2024 23:29:11] "GET /prediction HTTP/1.1" 200 -


mlp


127.0.0.1 - - [21/Dec/2024 23:29:15] "GET /prediction HTTP/1.1" 200 -


logReg
