In [1]:
!ls ../data/raw

field_descriptions.txt   protein_interactions.csv test.csv
label_legend.csv         sample_submission.csv    train.csv


In [2]:
%load_ext autoreload
%autoreload 2
import data_tools

In [129]:
import lightgbm as lgb
import pandas as pd
import numpy as np
import pickle

In [4]:
# Switch selecting the local data layout.
LOCAL = True
if LOCAL:
    # Directory containing the raw CSV files, relative to this notebook.
    # NOTE(review): fpath is only bound when LOCAL is truthy; no other branch
    # is visible here, so later cells need LOCAL = True.
    fpath = "../data/raw/"

# Data Types
There are a few main “types” of features available, listed here:
* ESSENTIAL
* CLASS
* COMPLEX
* PHENOTYPE
* MOTIF
* Chromosome
* NUM INTERACTING WITH FUNCTION (int)
* INTERACTING PROTEIN type
* INTERACTING PROTEIN corr (float)
* Function
* Localization

Pretty much all are categorical except the two numeric groups: NUM INTERACTING WITH FUNCTION (int) and INTERACTING PROTEIN corr (float).

In [96]:
# Parse the field-descriptions file into `fields` using the project helper.
# NOTE(review): presumably one row per dataset column, indexed by column
# number — confirm against data_tools.parse_field_descriptions.
field_descriptions_fpath = data_tools.field_descriptions_fpath
fields = data_tools.parse_field_descriptions(field_descriptions_fpath)
# Preview the first entries of column 0 (the feature names)
fields[[0]].head()

Unnamed: 0,0
0,protein
1,essential
2,class actin related proteins
3,class actins
4,class adaptins


In [155]:
# Datatype Specification
def feat_dtype(col_num, ints, floats):
    """Return the dtype name to use for a column.

    Args:
        col_num: Column number to classify.
        ints: Container of column numbers that hold integer features.
        floats: Container of column numbers that hold float features.

    Returns:
        'int' if col_num is in ints, otherwise 'float' if it is in floats,
        otherwise 'category'.
    """
    # Checked in order, so membership in `ints` wins over `floats`.
    for members, dtype_name in ((ints, 'int'), (floats, 'float')):
        if col_num in members:
            return dtype_name
    # Anything not listed as numeric is treated as categorical.
    return 'category'

# Boolean masks over the feature names stored in fields[0].
float_indices = (
    fields[0].str.contains("interacting protein")
    & fields[0].str.contains("corr")
)
int_indices = fields[0].str.contains("num interacting")

# Index labels of the matching rows; 0 and 2960 are never treated as numeric.
float_feats = set(fields.loc[float_indices].index).difference({0, 2960})
int_feats = set(fields.loc[int_indices].index).difference({0, 2960})

# Map every column number from 1 through 2960 to its dtype name.
dtypes = dict(
    (col_num, feat_dtype(col_num, int_feats, float_feats))
    for col_num in range(1, 2961)
)

In [157]:
# Persist the column -> dtype mapping as a pickle file
with open('../data/intermediate/data_types_dict.pkl', mode='wb') as handle:
    pickle.dump(dtypes, handle, pickle.HIGHEST_PROTOCOL)

Saving the data type dictionary so we can load it later when loading the DataFrame.

## Load Training Dataframe

In [158]:
# Read the raw training CSV. The file has no header row, so columns are
# labelled by integer position.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk; chunked inference is what produces pandas'
# mixed-type DtypeWarning (presumably the warning this cell emitted).
df = pd.read_csv(
    f"{fpath}train.csv",
    header=None,
    low_memory=False,
)
# Missing values are encoded as "?" marks; they still have to be replaced
# with NaN before the columns can be cast to their final dtypes.

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [159]:
# Reload the column -> dtype mapping from the pickle file.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('../data/intermediate/data_types_dict.pkl', 'rb') as handle:
    dtypes = pickle.load(handle)

In [160]:
# "?" marks a missing value: replace it with NaN, then cast every column to
# the dtype recorded for it in `dtypes`.
df = df.replace("?", np.nan).astype(dtypes)
# NOTE: the label column is not dropped here.
# TODO: upsample minority classes

## High-level Info on Dataset

In [135]:
# Dimensions of the training frame as (rows, columns)
df.shape

(862, 2961)

In [136]:
# Preview the first rows of the training frame
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2951,2952,2953,2954,2955,2956,2957,2958,2959,2960
0,P239476,Non-Essential,No,No,No,No,No,No,No,No,...,No,No,Yes,No,No,Yes,No,No,nucleus,0
1,P234427,Non-Essential,No,No,No,No,No,No,No,No,...,No,No,No,No,No,Yes,No,No,nucleus,0
2,P234429,Non-Essential,No,No,No,Yes,No,No,No,No,...,No,No,No,No,No,No,No,No,nucleus,0
3,P234430,Non-Essential,No,No,No,No,No,No,No,No,...,No,Yes,No,No,No,Yes,No,No,nucleus,0
4,P239467,Essential,No,No,No,No,No,No,No,No,...,No,No,No,No,No,Yes,No,No,nucleus,0


In [137]:
# Summary statistics (count, mean, std, quantiles) for the numeric columns
df.describe()

Unnamed: 0,445,446,447,448,449,450,451,452,453,454,...,2928,2930,2932,2934,2936,2938,2940,2942,2944,2960
count,862.0,862.0,862.0,862.0,862.0,862.0,862.0,862.0,862.0,862.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,862.0
mean,0.758701,0.157773,0.132251,0.12529,1.976798,0.545244,0.054524,0.039443,0.287703,0.353828,...,,,,,,,,,,1.829466
std,1.360557,0.5084,0.440309,0.460355,2.260486,1.347659,0.281932,0.222589,0.70063,0.765913,...,,,,,,,,,,2.48665
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,0.0
25%,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,0.0
50%,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,1.0
75%,1.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,3.0
max,17.0,7.0,4.0,5.0,16.0,15.0,3.0,2.0,7.0,5.0,...,,,,,,,,,,14.0


In [139]:
# Index range, per-dtype column counts and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 862 entries, 0 to 861
Columns: 2961 entries, 0 to 2960
dtypes: category(1702), float64(1243), int64(15), object(1)
memory usage: 9.9+ MB


In [138]:
# Frequency of each value in column 2960
# (presumably the label column — TODO confirm)
df[2960].value_counts()

0     366
1     192
2      69
3      58
4      43
5      43
6      35
7      18
8      17
9      10
10      4
11      3
12      2
13      1
14      1
Name: 2960, dtype: int64

Big class imbalance here: class 0 accounts for 366 of the 862 rows, while classes 10–14 have at most 4 rows each. May need some oversampling techniques (e.g. SMOTE, ADASYN).

## Protein Interactions File

In [140]:
# Load the pairwise protein-interaction table. The file has no header row,
# so the column names are supplied explicitly.
interaction_colnames = ['protein1', 'protein2', 'type', 'strength']
df2 = pd.read_csv(
    # Built from fpath, like the train.csv load, so the raw-data directory
    # is configured in a single place.
    f"{fpath}protein_interactions.csv",
    header=None,
    names=interaction_colnames,
    # Parse the interaction type as a categorical column
    dtype={
        'type' : 'category',
    }
)
df2.shape

(910, 4)

In [141]:
# Inspect the column dtypes and non-null counts as parsed from the CSV
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 910 entries, 0 to 909
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   protein1  910 non-null    object  
 1   protein2  910 non-null    object  
 2   type      910 non-null    category
 3   strength  910 non-null    object  
dtypes: category(1), object(3)
memory usage: 22.5+ KB


In [142]:
# The strength values have a stray period at the end, which prevents them
# from being parsed as numeric, and "?" marks a missing value.
# Strip the period once, turn "?" into NaN, then convert to float.
# astype(str) keeps the cell re-runnable: once the column is already float64,
# the .str accessor would otherwise raise.
df2['strength'] = pd.to_numeric(
    df2['strength'].astype(str).str.rstrip('.').replace("?", np.nan)
)

In [143]:
# Re-check dtypes and non-null counts after converting `strength`
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 910 entries, 0 to 909
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   protein1  910 non-null    object  
 1   protein2  910 non-null    object  
 2   type      910 non-null    category
 3   strength  871 non-null    float64 
dtypes: category(1), float64(1), object(2)
memory usage: 22.5+ KB


## Join with Protein Interactions

In [72]:
# Look up the feature name for column 468 via the project helper
data_tools.feature_name(fields, 468)  # Example interaction feature

'interacting protein p239467 corr'

In [60]:
def corr_feat(protein : str):
    """Build the name of the 'corr' interaction feature for a protein.

    The protein id is lower-cased before being inserted, e.g.
    'P235094' -> 'interacting protein p235094 corr'.
    """
    protein_id = protein.lower()
    return f"interacting protein {protein_id} corr"

def type_feat(protein : str):
    """Build the name of the 'type' interaction feature for a protein.

    The protein id is lower-cased before being inserted, e.g.
    'P235094' -> 'interacting protein p235094 type'.
    """
    return f"interacting protein {protein.lower()} type"

In [48]:
# Map from feature name to column,
# built by the project helper from the field-descriptions file
feat_to_col = data_tools.feat_to_col_map(data_tools.field_descriptions_fpath)

In [35]:
# Define interaction_feats here so the cell runs on a fresh kernel; no
# visible earlier cell binds it. It selects every column whose feature name
# contains "interacting protein".
# NOTE(review): assumes the index of `fields` is the column number in `df`,
# as the dtype-specification cell already does — confirm.
interaction_feats = list(
    fields.loc[fields[0].str.contains("interacting protein")].index
)
df[interaction_feats].head()

Unnamed: 0,459,460,461,462,463,464,465,466,467,468,...,2935,2936,2937,2938,2939,2940,2941,2942,2943,2944
0,No,0,No,0,No,0,No,0,No,0,...,,,,,,,,,,
1,No,0,No,0,No,0,No,0,No,0,...,,,,,,,,,,
2,No,0,No,0,No,0,No,0,No,0,...,,,,,,,,,,
3,No,0,No,0,No,0,No,0,No,0,...,,,,,,,,,,
4,No,0,No,0,No,0,No,0,No,0,...,,,,,,,,,,


In [144]:
# Column number of the 'corr' interaction feature for protein P239467
feat_to_col[corr_feat('P239467')]

468

In [149]:
# Rows whose value in column 468 differs from 0, showing columns 0, 467 and 468
df.loc[df[468] != 0, [0, 467, 468]]

Unnamed: 0,0,467,468
138,P238704,Physical,0.723051
185,P238510,Genetic,0.252653
490,P235265,Physical,0.482255
665,P235550,Physical,0.709248
706,P235621,Physical,-0.001239
721,P235639,Physical,-0.483302
750,P235701,Genetic,-0.430352


In [151]:
# Interaction records that involve P239467 as either protein1 or protein2
df2.query('protein1 == "P239467" | protein2 == "P239467"')

Unnamed: 0,protein1,protein2,type,strength
0,P238510,P239467,Genetic,0.252653
1,P235550,P239467,Physical,0.709248
2,P235621,P239467,Physical,-0.001239
3,P235265,P239467,Physical,0.482255
110,P239467,P238704,Physical,0.723051
739,P239467,P235639,Physical,-0.483302
780,P239467,P235701,Genetic,-0.430352


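At least for this protein (P239467), every row in the interactions file already shows up in the main dataframe's type/corr columns, so it looks like all the data is already in the main dataframe. Not really sure what the point of this additional file is, though only one protein was spot-checked here.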

# Saving output

In [161]:
# Final check of the dtype counts before saving
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 862 entries, 0 to 861
Columns: 2961 entries, 0 to 2960
dtypes: category(1703), float64(1243), int64(14), object(1)
memory usage: 9.9+ MB


In [162]:
# Write the DataFrame to a pickle file in the intermediate data directory
df.to_pickle("../data/intermediate/data.pkl")