In [3]:
import pandas as pd
import numpy as np
import ast
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score

In [4]:
df = pd.read_csv('Data/parsed_json.csv')
test_data = pd.read_csv('Data/test_data.csv')
train_data = pd.read_csv('Data/train_data.csv')
val_data = pd.read_csv('Data/val_data.csv')

In [5]:
df

Unnamed: 0,ENST_ID,Position,Key,Values,gene_id,transcript_id,transcript_position,label
0,ENST00000000233,244,AAGACCA,"[[0.00299, 2.06, 125.0, 0.0177, 10.4, 122.0, 0...",ENSG00000004059,ENST00000000233,244,0
1,ENST00000000233,261,CAAACTG,"[[0.0126, 1.95, 111.0, 0.0125, 1.27, 108.0, 0....",ENSG00000004059,ENST00000000233,261,0
2,ENST00000000233,316,GAAACAG,"[[0.00432, 2.02, 104.0, 0.00299, 3.56, 99.3, 0...",ENSG00000004059,ENST00000000233,316,0
3,ENST00000000233,332,AGAACAT,"[[0.0134, 4.71, 132.0, 0.00447, 4.24, 98.8, 0....",ENSG00000004059,ENST00000000233,332,0
4,ENST00000000233,368,AGGACAA,"[[0.015, 6.97, 118.0, 0.0106, 3.04, 123.0, 0.0...",ENSG00000004059,ENST00000000233,368,0
...,...,...,...,...,...,...,...,...
121833,ENST00000641834,1348,GGGACAT,"[[0.0112, 2.96, 116.0, 0.0093, 3.24, 115.0, 0....",ENSG00000167747,ENST00000641834,1348,1
121834,ENST00000641834,1429,CTGACAC,"[[0.00697, 4.25, 112.0, 0.00481, 8.67, 119.0, ...",ENSG00000167747,ENST00000641834,1429,0
121835,ENST00000641834,1531,TGGACAC,"[[0.00996, 3.12, 112.0, 0.00432, 4.5, 115.0, 0...",ENSG00000167747,ENST00000641834,1531,1
121836,ENST00000641834,1537,CTGACCA,"[[0.00396, 3.14, 108.0, 0.00747, 5.79, 125.0, ...",ENSG00000167747,ENST00000641834,1537,0


In [6]:
df.columns.tolist()

['ENST_ID',
 'Position',
 'Key',
 'Values',
 'gene_id',
 'transcript_id',
 'transcript_position',
 'label']

In [7]:
df[df['transcript_id'] != df['ENST_ID']]

Unnamed: 0,ENST_ID,Position,Key,Values,gene_id,transcript_id,transcript_position,label


In [8]:
new_df = df.drop(columns=['transcript_position', 'transcript_id'])

In [9]:
new_df

Unnamed: 0,ENST_ID,Position,Key,Values,gene_id,label
0,ENST00000000233,244,AAGACCA,"[[0.00299, 2.06, 125.0, 0.0177, 10.4, 122.0, 0...",ENSG00000004059,0
1,ENST00000000233,261,CAAACTG,"[[0.0126, 1.95, 111.0, 0.0125, 1.27, 108.0, 0....",ENSG00000004059,0
2,ENST00000000233,316,GAAACAG,"[[0.00432, 2.02, 104.0, 0.00299, 3.56, 99.3, 0...",ENSG00000004059,0
3,ENST00000000233,332,AGAACAT,"[[0.0134, 4.71, 132.0, 0.00447, 4.24, 98.8, 0....",ENSG00000004059,0
4,ENST00000000233,368,AGGACAA,"[[0.015, 6.97, 118.0, 0.0106, 3.04, 123.0, 0.0...",ENSG00000004059,0
...,...,...,...,...,...,...
121833,ENST00000641834,1348,GGGACAT,"[[0.0112, 2.96, 116.0, 0.0093, 3.24, 115.0, 0....",ENSG00000167747,1
121834,ENST00000641834,1429,CTGACAC,"[[0.00697, 4.25, 112.0, 0.00481, 8.67, 119.0, ...",ENSG00000167747,0
121835,ENST00000641834,1531,TGGACAC,"[[0.00996, 3.12, 112.0, 0.00432, 4.5, 115.0, 0...",ENSG00000167747,1
121836,ENST00000641834,1537,CTGACCA,"[[0.00396, 3.14, 108.0, 0.00747, 5.79, 125.0, ...",ENSG00000167747,0


In [10]:
def calculate_average(values):
    arr = np.array(values)
    return arr.mean(axis=0).tolist()
# Function to convert string representation of list of lists to actual float lists
def convert_to_float(value):
        # Evaluate the string as a list
        value = eval(value)
        # Convert all elements to float
        return [[float(elem) for elem in inner] for inner in value]


In [11]:
temp = new_df.head()
temp

Unnamed: 0,ENST_ID,Position,Key,Values,gene_id,label
0,ENST00000000233,244,AAGACCA,"[[0.00299, 2.06, 125.0, 0.0177, 10.4, 122.0, 0...",ENSG00000004059,0
1,ENST00000000233,261,CAAACTG,"[[0.0126, 1.95, 111.0, 0.0125, 1.27, 108.0, 0....",ENSG00000004059,0
2,ENST00000000233,316,GAAACAG,"[[0.00432, 2.02, 104.0, 0.00299, 3.56, 99.3, 0...",ENSG00000004059,0
3,ENST00000000233,332,AGAACAT,"[[0.0134, 4.71, 132.0, 0.00447, 4.24, 98.8, 0....",ENSG00000004059,0
4,ENST00000000233,368,AGGACAA,"[[0.015, 6.97, 118.0, 0.0106, 3.04, 123.0, 0.0...",ENSG00000004059,0


In [12]:
temp['Values'] = temp['Values'].apply(convert_to_float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['Values'] = temp['Values'].apply(convert_to_float)


In [13]:
for i, inner_list in enumerate(temp['Values'][0]):
    for j, element in enumerate(inner_list):
        print(f"Element at [{i}][{j}] is of type: {type(element)}")

Element at [0][0] is of type: <class 'float'>
Element at [0][1] is of type: <class 'float'>
Element at [0][2] is of type: <class 'float'>
Element at [0][3] is of type: <class 'float'>
Element at [0][4] is of type: <class 'float'>
Element at [0][5] is of type: <class 'float'>
Element at [0][6] is of type: <class 'float'>
Element at [0][7] is of type: <class 'float'>
Element at [0][8] is of type: <class 'float'>
Element at [1][0] is of type: <class 'float'>
Element at [1][1] is of type: <class 'float'>
Element at [1][2] is of type: <class 'float'>
Element at [1][3] is of type: <class 'float'>
Element at [1][4] is of type: <class 'float'>
Element at [1][5] is of type: <class 'float'>
Element at [1][6] is of type: <class 'float'>
Element at [1][7] is of type: <class 'float'>
Element at [1][8] is of type: <class 'float'>
Element at [2][0] is of type: <class 'float'>
Element at [2][1] is of type: <class 'float'>
Element at [2][2] is of type: <class 'float'>
Element at [2][3] is of type: <cla

In [14]:
temp['Values'] = temp['Values'].apply(calculate_average)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['Values'] = temp['Values'].apply(calculate_average)


In [15]:
temp

Unnamed: 0,ENST_ID,Position,Key,Values,gene_id,label
0,ENST00000000233,244,AAGACCA,"[0.008264378378378385, 4.223783783783786, 123....",ENSG00000004059,0
1,ENST00000000233,261,CAAACTG,"[0.006609244186046515, 3.2164244186046504, 109...",ENSG00000004059,0
2,ENST00000000233,316,GAAACAG,"[0.0075699999999999995, 2.94054054054054, 105....",ENSG00000004059,0
3,ENST00000000233,332,AGAACAT,"[0.010620250000000005, 6.4763499999999965, 129...",ENSG00000004059,0
4,ENST00000000233,368,AGGACAA,"[0.010700505050505058, 6.415050505050505, 117....",ENSG00000004059,0


In [16]:
#convert from str to float
new_df['Values'] = new_df['Values'].apply(convert_to_float)
#get the avg
new_df['Values'] = new_df['Values'].apply(calculate_average)

In [17]:
new_df

Unnamed: 0,ENST_ID,Position,Key,Values,gene_id,label
0,ENST00000000233,244,AAGACCA,"[0.008264378378378385, 4.223783783783786, 123....",ENSG00000004059,0
1,ENST00000000233,261,CAAACTG,"[0.006609244186046515, 3.2164244186046504, 109...",ENSG00000004059,0
2,ENST00000000233,316,GAAACAG,"[0.0075699999999999995, 2.94054054054054, 105....",ENSG00000004059,0
3,ENST00000000233,332,AGAACAT,"[0.010620250000000005, 6.4763499999999965, 129...",ENSG00000004059,0
4,ENST00000000233,368,AGGACAA,"[0.010700505050505058, 6.415050505050505, 117....",ENSG00000004059,0
...,...,...,...,...,...,...
121833,ENST00000641834,1348,GGGACAT,"[0.009593835616438357, 3.294164383561644, 118....",ENSG00000167747,1
121834,ENST00000641834,1429,CTGACAC,"[0.0083927536231884, 4.511014492753626, 110.96...",ENSG00000167747,0
121835,ENST00000641834,1531,TGGACAC,"[0.008160624999999996, 3.9184375, 113.96875, 0...",ENSG00000167747,1
121836,ENST00000641834,1537,CTGACCA,"[0.008043859649122805, 3.191228070175438, 109....",ENSG00000167747,0


In [18]:
values_expanded = new_df['Values'].apply(pd.Series)
values_expanded.columns = [f'Value_{i+1}' for i in range(values_expanded.shape[1])]
clean_df = pd.concat([new_df, values_expanded], axis=1)

In [19]:
clean_df.drop(columns=['Values'], inplace=True)

In [20]:
clean_df

Unnamed: 0,ENST_ID,Position,Key,gene_id,label,Value_1,Value_2,Value_3,Value_4,Value_5,Value_6,Value_7,Value_8,Value_9
0,ENST00000000233,244,AAGACCA,ENSG00000004059,0,0.008264,4.223784,123.702703,0.009373,7.382162,125.913514,0.007345,4.386989,80.570270
1,ENST00000000233,261,CAAACTG,ENSG00000004059,0,0.006609,3.216424,109.681395,0.006813,3.226535,107.889535,0.007710,3.016599,94.290698
2,ENST00000000233,316,GAAACAG,ENSG00000004059,0,0.007570,2.940541,105.475676,0.007416,3.642703,98.947027,0.007555,2.087146,89.364324
3,ENST00000000233,332,AGAACAT,ENSG00000004059,0,0.010620,6.476350,129.355000,0.008632,2.899200,97.836500,0.006102,2.236520,89.154000
4,ENST00000000233,368,AGGACAA,ENSG00000004059,0,0.010701,6.415051,117.924242,0.011479,5.870303,121.954545,0.010019,4.260253,85.178788
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121833,ENST00000641834,1348,GGGACAT,ENSG00000167747,1,0.009594,3.294164,118.232877,0.007300,4.929726,116.342466,0.006555,4.005616,82.004110
121834,ENST00000641834,1429,CTGACAC,ENSG00000167747,0,0.008393,4.511014,110.969565,0.010305,9.105797,114.927536,0.005568,3.644638,80.497101
121835,ENST00000641834,1531,TGGACAC,ENSG00000167747,1,0.008161,3.918438,113.968750,0.006877,4.759687,113.562500,0.006410,2.181562,84.190625
121836,ENST00000641834,1537,CTGACCA,ENSG00000167747,0,0.008044,3.191228,109.354386,0.007419,6.552982,123.263158,0.006472,2.540877,82.289474


In [21]:
#check for null values
clean_df.isnull().sum()

ENST_ID     0
Position    0
Key         0
gene_id     0
label       0
Value_1     0
Value_2     0
Value_3     0
Value_4     0
Value_5     0
Value_6     0
Value_7     0
Value_8     0
Value_9     0
dtype: int64

In [22]:
#number of labels
print(clean_df['label'].value_counts())

label
0    116363
1      5475
Name: count, dtype: int64


In [23]:
# Shuffle the entire dataset
clean_df = clean_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [24]:
label_encoder = LabelEncoder()
clean_df['ENST_ID_encoded'] = label_encoder.fit_transform(clean_df['ENST_ID'])
clean_df['Key_encoded'] = label_encoder.fit_transform(clean_df['Key'])
clean_df['gene_id_encoded'] = label_encoder.fit_transform(clean_df['gene_id'])

In [25]:
clean_df = clean_df.drop(columns='label').join(clean_df['label'])

In [26]:
X = clean_df.drop(columns=['label'])
y = clean_df['label']

In [27]:
clean_df

Unnamed: 0,ENST_ID,Position,Key,gene_id,Value_1,Value_2,Value_3,Value_4,Value_5,Value_6,Value_7,Value_8,Value_9,ENST_ID_encoded,Key_encoded,gene_id_encoded,label
0,ENST00000514332,579,CAAACAA,ENSG00000169045,0.007098,2.389750,104.571429,0.006643,2.526429,98.371429,0.005408,2.419286,89.453571,4374,72,2833,0
1,ENST00000374902,2482,CTGACAG,ENSG00000136891,0.007560,3.764643,106.964286,0.005346,8.643929,113.821429,0.007909,2.611071,80.214286,2968,134,1861,0
2,ENST00000217026,1738,CAGACCC,ENSG00000101057,0.009036,8.260000,108.836538,0.011480,5.427885,128.096154,0.005846,3.924615,79.675000,151,89,632,0
3,ENST00000394803,1321,AGAACAG,ENSG00000109332,0.008855,7.054000,128.133333,0.008535,5.120667,95.853333,0.006006,3.055000,88.960000,3364,26,953,0
4,ENST00000249014,598,GGGACAC,ENSG00000128283,0.009415,4.048154,117.128205,0.009710,7.149872,117.666667,0.007828,3.196282,82.721795,474,181,1534,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121833,ENST00000552551,1370,GGAACAT,ENSG00000170421,0.007304,8.752857,119.823954,0.008027,3.603102,97.095815,0.006014,2.908407,87.478066,4698,171,2887,0
121834,ENST00000621914,5135,GAAACAA,ENSG00000006125,0.009418,3.244868,104.998684,0.007654,3.598947,98.059211,0.005389,3.381447,85.511842,5244,144,31,0
121835,ENST00000511450,415,TTAACTC,ENSG00000174695,0.007323,1.770944,93.327778,0.006851,1.988611,98.191667,0.007017,1.543528,92.108333,4358,273,3018,0
121836,ENST00000158771,1013,GAAACAA,ENSG00000072849,0.011057,3.060000,105.620000,0.006312,2.690667,97.173333,0.006495,2.360167,89.703333,38,144,295,0


In [28]:
# Split into train and temporary sets (70% train, 30% temp)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)

# Now split the temporary set into validation and test sets (50% val, 50% test)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [29]:
# Combine features and labels back for exporting
train_data = pd.concat([X_train, y_train], axis=1)
val_data = pd.concat([X_val, y_val], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

In [30]:
train_data.to_csv('train_data.csv', index=False)
val_data.to_csv('val_data.csv', index=False)
test_data.to_csv('test_data.csv', index=False)

In [31]:
X_train

Unnamed: 0,ENST_ID,Position,Key,gene_id,Value_1,Value_2,Value_3,Value_4,Value_5,Value_6,Value_7,Value_8,Value_9,ENST_ID_encoded,Key_encoded,gene_id_encoded
39690,ENST00000392711,1312,GAAACAT,ENSG00000108946,0.011300,2.925200,105.840000,0.006688,3.947200,99.800000,0.006549,3.206800,88.612000,3286,147,941
32937,ENST00000356708,1133,TAAACAA,ENSG00000165629,0.008892,2.330544,103.184937,0.007565,2.887322,97.920502,0.006400,2.380251,87.730126,2341,216,2659
47579,ENST00000396705,849,GTAACTG,ENSG00000111669,0.007178,3.808339,90.662609,0.006693,3.910683,97.275652,0.008601,3.462987,92.266087,3447,202,1019
58927,ENST00000535149,2371,GAAACTG,ENSG00000147140,0.010850,3.717955,106.727273,0.008118,3.794091,105.302273,0.007403,3.753864,90.725000,4564,154,2192
96928,ENST00000237530,297,GGGACCT,ENSG00000118705,0.010300,3.695294,118.588235,0.007454,9.200196,117.960784,0.006610,3.950980,78.625490,360,187,1264
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110268,ENST00000397977,612,CAGACCA,ENSG00000104613,0.007982,4.961053,109.315789,0.009501,6.775526,123.421053,0.007588,3.980684,75.481579,3488,88,745
119879,ENST00000575882,1449,TGAACTT,ENSG00000141198,0.006239,8.497273,118.000000,0.005295,3.088182,101.854545,0.004758,2.284545,92.413636,4838,251,2001
103694,ENST00000584667,1269,AGGACTG,ENSG00000161526,0.009380,4.903902,117.097561,0.008890,4.656585,119.414634,0.006984,2.984634,86.614634,4892,46,2487
860,ENST00000262982,150,CAAACAC,ENSG00000124207,0.008705,2.543131,103.268687,0.007830,2.772071,97.905051,0.007928,2.216162,89.635354,780,73,1403


In [32]:

X_train = train_data.drop(columns=['label','ENST_ID', 'Key', 'gene_id'])
y_train = train_data['label']

X_val = val_data.drop(columns=['label','ENST_ID', 'Key', 'gene_id'])
y_val = val_data['label']

X_test = test_data.drop(columns=['label','ENST_ID', 'Key', 'gene_id'])
y_test = test_data['label']

In [33]:
X_train

Unnamed: 0,Position,Value_1,Value_2,Value_3,Value_4,Value_5,Value_6,Value_7,Value_8,Value_9,ENST_ID_encoded,Key_encoded,gene_id_encoded
39690,1312,0.011300,2.925200,105.840000,0.006688,3.947200,99.800000,0.006549,3.206800,88.612000,3286,147,941
32937,1133,0.008892,2.330544,103.184937,0.007565,2.887322,97.920502,0.006400,2.380251,87.730126,2341,216,2659
47579,849,0.007178,3.808339,90.662609,0.006693,3.910683,97.275652,0.008601,3.462987,92.266087,3447,202,1019
58927,2371,0.010850,3.717955,106.727273,0.008118,3.794091,105.302273,0.007403,3.753864,90.725000,4564,154,2192
96928,297,0.010300,3.695294,118.588235,0.007454,9.200196,117.960784,0.006610,3.950980,78.625490,360,187,1264
...,...,...,...,...,...,...,...,...,...,...,...,...,...
110268,612,0.007982,4.961053,109.315789,0.009501,6.775526,123.421053,0.007588,3.980684,75.481579,3488,88,745
119879,1449,0.006239,8.497273,118.000000,0.005295,3.088182,101.854545,0.004758,2.284545,92.413636,4838,251,2001
103694,1269,0.009380,4.903902,117.097561,0.008890,4.656585,119.414634,0.006984,2.984634,86.614634,4892,46,2487
860,150,0.008705,2.543131,103.268687,0.007830,2.772071,97.905051,0.007928,2.216162,89.635354,780,73,1403


In [34]:
#training
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [35]:
val_probabilities = model.predict_proba(X_val)[:, 1]

val_predictions = model.predict(X_val)

val_roc_auc = roc_auc_score(y_val, val_probabilities)

val_accuracy = accuracy_score(y_val, val_predictions)

print(f'Validation ROC AUC Score: {val_roc_auc:.2f}')
print(f'Validation Accuracy Score: {val_accuracy:.2f}')

Validation ROC AUC Score: 0.68
Validation Accuracy Score: 0.96


Could be due to class imbalance

In [36]:
test_predictions = model.predict(X_test)
test_probabilities = model.predict_proba(X_test)[:, 1]
test_accuracy = accuracy_score(y_test, test_predictions)
print(f"Test Accuracy: {test_accuracy}")
test_roc_auc = roc_auc_score(y_test, test_probabilities)
print(f"Test ROC AUC: {test_roc_auc}")

Test Accuracy: 0.9535456336178595
Test ROC AUC: 0.675678717136258


In [37]:
clean_df.to_csv('processed_data.csv', index=False)