In [1]:
import ast
import json
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

Part 1: Data Parsing (flatten JSON to CSV)

In [2]:
# Data directory, given relative to the notebook's working directory.
# The loops below read their input files from it and write the generated
# .csv files back into the same directory.
data_folder = "../SGNexData"

In [4]:
# Flatten every line-delimited JSON file in data_folder into a long-format CSV
# with one row per (ENST_ID, Position, Key) and the raw value list kept as a
# string. Each CSV is written next to its source file, with the same base name.
for filename in os.listdir(data_folder):
    if not filename.endswith('.json'):
        continue

    # Construct full path for each JSON file
    json_file_path = os.path.join(data_folder, filename)

    # Prepare a list to store the flattened data
    flattened_data = []

    # Flatten each record as soon as it is parsed so the fully parsed file
    # never has to be held in memory alongside the flattened rows.
    # JSON is UTF-8 by specification, so do not rely on the platform default.
    with open(json_file_path, 'r', encoding='utf-8') as json_file:
        for line in json_file:
            if not line.strip():  # Skip empty lines
                continue
            entry = json.loads(line)
            for enst_id, positions in entry.items():
                for position, sub_dict in positions.items():
                    for key, values in sub_dict.items():
                        flattened_data.append({
                            'ENST_ID': enst_id,
                            'Position': position,
                            'Key': key,
                            'Values': str(values),  # Store the list as a string
                        })

    # Convert the list of dictionaries to a DataFrame. Naming the columns
    # explicitly means an empty input still yields a CSV with a header row.
    df = pd.DataFrame(flattened_data, columns=['ENST_ID', 'Position', 'Key', 'Values'])

    # Swap only the trailing extension; str.replace would also rewrite any
    # earlier '.json' that happens to appear inside the file name.
    csv_file_name = os.path.splitext(filename)[0] + '.csv'
    csv_file_path = os.path.join(data_folder, csv_file_name)

    df.to_csv(csv_file_path, index=False)  # Save df to csv file

../SGNexData/Hct116R4R3.json
../SGNexData/HepG2R5R2.json


Part 2: Data Processing

In [3]:
# helper functions
def calculate_average(values):
    """Average ``values`` along its first axis and return plain Python data.

    Args:
        values: Array-like of numbers. For a list of equal-length rows the
            result is the per-column mean; for a flat list it is one number.

    Returns:
        The mean converted with ``ndarray.tolist()``: a list for 2-D input,
        a float for 1-D input.
    """
    column_means = np.mean(np.array(values), axis=0)
    return column_means.tolist()

def convert_to_float(value):
    """Parse the string form of a nested list into a list of lists of floats.

    Args:
        value: Text such as ``"[[1, 2.5], [3, 4]]"``, i.e. the ``str()`` of a
            list of lists of numbers.

    Returns:
        A list of lists with every element converted to ``float``.

    Raises:
        ValueError, SyntaxError: If ``value`` is not a plain Python literal.
        TypeError: If the parsed literal is not a sequence of sequences.
    """
    # SECURITY: the text comes from a file on disk, so it must never go through
    # eval(), which would run arbitrary code embedded in a cell.
    # ast.literal_eval only accepts literals (numbers, lists, tuples, ...).
    parsed = ast.literal_eval(value)
    return [[float(elem) for elem in inner] for inner in parsed]

In [None]:
# For each flattened CSV in data_folder: parse the stringified value lists,
# average them, spread the averages over Value_1..Value_n columns, shuffle the
# rows, label-encode the string columns and save "<name>_processed.csv".
processed_suffix = '_processed.csv'

for filename in os.listdir(data_folder):
    # The outputs of this cell also end in '.csv'. Skip them so that running
    # the cell again does not try to re-process its own results (they no
    # longer have a 'Values' column).
    if not filename.endswith('.csv') or filename.endswith(processed_suffix):
        continue

    # Construct full path for each dataframe
    df_file_path = os.path.join(data_folder, filename)
    print(df_file_path)
    df = pd.read_csv(df_file_path)

    # Convert values from str to float
    df['Values'] = df['Values'].apply(convert_to_float)

    # Get the avg of all values
    df['Values'] = df['Values'].apply(calculate_average)

    # Split values into individual columns. Building the frame straight from
    # the list of averages avoids creating one pd.Series per row.
    values_expanded = pd.DataFrame(df['Values'].tolist(), index=df.index)
    values_expanded.columns = [f'Value_{i+1}' for i in range(values_expanded.shape[1])]
    clean_df = pd.concat([df.drop(columns=['Values']), values_expanded], axis=1)

    # Shuffle the entire dataset (fixed seed keeps the order reproducible)
    clean_df = clean_df.sample(frac=1, random_state=42).reset_index(drop=True)

    # Convert strings to numerical value.
    # NOTE(review): the encoder is refit for every column and every file, so
    # the integer codes are presumably not comparable across files - confirm
    # that downstream code does not rely on that.
    label_encoder = LabelEncoder()
    clean_df['ENST_ID_encoded'] = label_encoder.fit_transform(clean_df['ENST_ID'])
    clean_df['Key_encoded'] = label_encoder.fit_transform(clean_df['Key'])

    # Save next to the input as "<name>_processed.csv"
    csv_file_name, file_extension = os.path.splitext(df_file_path)
    new_file_path = f"{csv_file_name}_processed{file_extension}"
    clean_df.to_csv(new_file_path, index=False)


../SGNexData/HepG2R5R2.csv
../SGNexData/Hct116R4R3.csv
../SGNexData/K562R5R1.csv
../SGNexData/Hct116R3R4.csv
../SGNexData/HepG2R6R1.csv
