In [1]:
LOCAL = True
data_fpath = '../data/raw/' if LOCAL else '/kaggle/input/protein-localization/'
out_fpath = '../data/intermediate/' if LOCAL else ''

In [2]:
%load_ext autoreload
%autoreload 2
import data_tools

In [3]:
import lightgbm as lgb
import pandas as pd
import numpy as np
import pickle
import seaborn as sns

In [4]:
from imblearn.over_sampling import RandomOverSampler

# Data Types
There are a few main “types” of features available, listed here:
* ESSENTIAL
* CLASS
* COMPLEX
* PHENOTYPE
* MOTIF
* Chromosome
* NUM INTERACTING WITH FUNCTION (int)
* INTERACTING PROTEIN type
* INTERACTING PROTEIN corr (float)
* Function
* Localization

Pretty much all are categorical except the last one.

In [5]:
field_descriptions_fpath = data_tools.field_descriptions_fpath
fields = data_tools.parse_field_descriptions(field_descriptions_fpath)
fields[[0]].head()

Unnamed: 0,0
0,protein
1,essential
2,class actin related proteins
3,class actins
4,class adaptins


In [6]:
# Datatype Specification
def feat_dtype(col_num, ints, floats):
    if col_num in ints:
        return 'int'
    if col_num in floats:
        return 'float'
    # We assume everything else is categorical
    return 'category'

float_indices = fields[0].str.contains("interacting protein") & fields[0].str.contains("corr")
int_indices = fields[0].str.contains("num interacting")

float_feats = set(fields[[0]][float_indices].index) - {0, 2960}
int_feats = set(fields[[0]][int_indices].index) - {0, 2960}

dtypes = {col_num : feat_dtype(col_num, int_feats, float_feats) for col_num in range(1,2961)}

In [7]:
# Save the data type dictionary to pickle
with open(f'{out_fpath}data_types_dict.pkl', 'wb') as handle:
    pickle.dump(dtypes, handle, protocol=pickle.HIGHEST_PROTOCOL)

Saving the data type dictionary so we can load it later when loading the DataFrame.

## Load Dataframes

### Load Training DataFrame

In [8]:
df = pd.read_csv(f"{data_fpath}train.csv", header=None)
df = df.replace("?", np.nan)  # Replace ? mark with NaN
df = df.astype(dtypes)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [9]:
with open(f'{out_fpath}data_types_dict.pkl', 'rb') as handle:
    dtypes = pickle.load(handle)

## Protein Interactions File

In [10]:
interaction_colnames = ['protein1', 'protein2', 'type', 'strength']
df2 = pd.read_csv(f"{data_fpath}protein_interactions.csv", header=None, names=interaction_colnames,
    dtype={
        'type' : 'category',
    }
)
df2.shape

(910, 4)

In [11]:
# The strength values have a random period at
# the end of the values preventing it from being parsed as numeric
df2['strength'] = df2['strength'].str.rstrip('.').replace("?", np.nan)
df2['strength'] = pd.to_numeric(df2['strength'].str.rstrip('.'))

In [12]:
df2.head()

Unnamed: 0,protein1,protein2,type,strength
0,P238510,P239467,Genetic,0.252653
1,P235550,P239467,Physical,0.709248
2,P235621,P239467,Physical,-0.001239
3,P235265,P239467,Physical,0.482255
4,P234935,P234445,Physical,-0.460856


## Protein Interactions Features

In [13]:
# ppi := protein-protein interactions
qry = fields[0].str.contains("interacting protein")
ppi_features = set(fields[[0]][qry].index) - {0}
qry_corr = fields[0].str.contains("corr")
qry_type = fields[0].str.contains("type")

ppi_corr_features = set(fields[[0]][qry & qry_corr].index) - {0}
ppi_type_features = set(fields[[0]][qry & qry_type].index) - {0}
[len(x) for x in (ppi_features, ppi_corr_features, ppi_type_features)]

[2486, 1243, 1243]

In [14]:
def corr_feat(protein : str):
    '''E.g.: interacting protein p235094 corr'''
    return f"interacting protein {protein.lower()} corr"

def type_feat(protein : str):
    '''E.g.: interacting protein p235094 corr'''
    return f"interacting protein {protein.lower()} type"

In [15]:
data_tools.feature_name(fields, 465)  # Example interaction feature

'interacting protein p234430 type'

In [16]:
# Map from feature name to column
feat_to_col = data_tools.feat_to_col_map(data_tools.field_descriptions_fpath)

In [17]:
feat_to_col[corr_feat('p235094')]

2936

Need a way to fill the protein interaction cells with the type.

This protein is not present in the dataset at all! So how can we use the PPI features? Possible features to engineer:
* Sum/min/max/mean/#negof interactions corr
* mode of interactions
* percent of interactions that are genetic
* meta feature: mode of the CLASS of proteins that interact (data leakage)

In [20]:
# Derive Features from PPI Correlation
df['interaction_sum'] = df.loc[:, ppi_corr_features].sum(axis=1)
df['interaction_mean'] = df.loc[:, ppi_corr_features].mean(axis=1)
df['interaction_max'] = df.loc[:, ppi_corr_features].max(axis=1)
df['interaction_min'] = df.loc[:, ppi_corr_features].min(axis=1)
df['interaction_neg'] = df.loc[:, ppi_corr_features].lt(0).sum(axis=1)
df['interaction_zero'] = (df.loc[:, ppi_corr_features] == 0).sum(axis=1)
df['interaction_pos'] = df.loc[:, ppi_corr_features].gt(0).sum(axis=1)

In [22]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2958,2959,2960,interaction_sum,interaction_mean,interaction_max,interaction_min,interaction_neg,interaction_zero,interaction_pos
0,P239476,Non-Essential,No,No,No,No,No,No,No,No,...,No,nucleus,0,0.0,0.0,0.0,0.0,0,862,0
1,P234427,Non-Essential,No,No,No,No,No,No,No,No,...,No,nucleus,0,0.0,0.0,0.0,0.0,0,862,0
2,P234429,Non-Essential,No,No,No,Yes,No,No,No,No,...,No,nucleus,0,0.0,0.0,0.0,0.0,0,862,0
3,P234430,Non-Essential,No,No,No,No,No,No,No,No,...,No,nucleus,0,0.58263,0.000676,0.646193,-0.063563,1,860,1
4,P239467,Essential,No,No,No,No,No,No,No,No,...,No,nucleus,0,1.252315,0.001453,0.723051,-0.483302,3,855,4


### Pipeline Altogether

In [23]:
def data_pipeline(df, fields, dtypes, target_col=None, seed=None):
    '''Takes a DataFrame and returns features to pass into model.'''
    # Handle Missing Values
    # TODO: Try using zeros
    # LightGBM should handle NasNs though
    df = df.replace("?", np.nan)  # Replace ? mark with NaN
    
    # Convert to correct data types
    if target_col is None:
        dtypes.pop(2960)  # labels aren't in test data
    df = df.astype(dtypes)
    
    # Identify columns corresponding to PPI features
    # PPI := protein-protein interactions
    qry = fields[0].str.contains("interacting protein")
    ppi_features = set(fields[[0]][qry].index) - {0}
    qry_corr = fields[0].str.contains("corr")
    qry_type = fields[0].str.contains("type")

    ppi_corr_features = set(fields[[0]][qry & qry_corr].index) - {0}
    ppi_type_features = set(fields[[0]][qry & qry_type].index) - {0}
    
    # Derive Features from PPI Correlation
    df['interaction_sum'] = df.loc[:, ppi_corr_features].sum(axis=1)
    df['interaction_mean'] = df.loc[:, ppi_corr_features].mean(axis=1)
    df['interaction_max'] = df.loc[:, ppi_corr_features].max(axis=1)
    df['interaction_min'] = df.loc[:, ppi_corr_features].min(axis=1)
    df['interaction_neg'] = df.loc[:, ppi_corr_features].lt(0).sum(axis=1)
    df['interaction_zero'] = (df.loc[:, ppi_corr_features] == 0).sum(axis=1)
    df['interaction_pos'] = df.loc[:, ppi_corr_features].gt(0).sum(axis=1)
    
    # Derive Features from PPI Type
    
    # Use only selected features
    X = df[set(df.columns) - {target_col, 0} - ppi_features]
    if target_col is not None:
        y = df[target_col]
    
x     # Return Datasets
    if target_col is not None:
        return X, y
    return X

### Apply Pipeline to Training Data

In [24]:
df = pd.read_csv(f"{data_fpath}train.csv", header=None)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [25]:
with open(f'{out_fpath}data_types_dict.pkl', 'rb') as handle:
    dtypes = pickle.load(handle)

In [26]:
X, y = data_pipeline(df, fields, dtypes, target_col=2960, seed=42)

In [27]:
X.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,2953,2954,2955,2956,2957,2958,2959,interaction_mean,interaction_pos,interaction_sum
0,Non-Essential,No,No,No,No,No,No,No,No,No,...,Yes,No,No,Yes,No,No,nucleus,0.0,0,0.0
1,Non-Essential,No,No,No,No,No,No,No,No,No,...,No,No,No,Yes,No,No,nucleus,0.0,0,0.0
2,Non-Essential,No,No,No,Yes,No,No,No,No,No,...,No,No,No,No,No,No,nucleus,0.0,0,0.0
3,Non-Essential,No,No,No,No,No,No,No,No,No,...,No,No,No,Yes,No,No,nucleus,0.000676,1,0.58263
4,Essential,No,No,No,No,No,No,No,No,No,...,No,No,No,Yes,No,No,nucleus,0.001453,4,1.252315


### Apply Pipeline to Test Data

In [28]:
testdf = pd.read_csv(f"{data_fpath}test.csv", header=None)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [29]:
with open(f'{out_fpath}data_types_dict.pkl', 'rb') as handle:
    dtypes = pickle.load(handle)

In [30]:
X_kaggle = data_pipeline(testdf, fields, dtypes, target_col=None, seed=42)

In [31]:
X_kaggle.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381 entries, 0 to 380
Columns: 480 entries, 1 to interaction_sum
dtypes: category(459), float64(4), int64(17)
memory usage: 287.3 KB


# Saving output

In [32]:
X.to_pickle(f"{out_fpath}X.pkl")
y.to_pickle(f"{out_fpath}y.pkl")

In [33]:
X_kaggle.to_pickle(f"{out_fpath}X_kaggle.pkl")

In [34]:
testdf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2950,2951,2952,2953,2954,2955,2956,2957,2958,2959
0,P234273,Essential,No,No,No,Yes,No,No,No,No,...,?,?,?,?,?,?,?,?,?,?
1,P234490,Non-Essential,No,No,No,No,No,No,No,No,...,?,?,?,?,?,?,?,?,?,?
2,P236380,?,No,No,No,No,No,No,No,No,...,?,?,?,?,?,?,?,?,?,?
3,P234394,Essential,No,Yes,No,No,No,No,No,No,...,?,?,?,?,?,?,?,?,?,?
4,P234385,Non-Essential,No,No,No,No,No,No,No,No,...,?,?,?,?,?,?,?,?,?,?
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
376,P235517,Essential,No,No,No,No,No,No,No,No,...,?,?,?,?,?,?,?,?,?,?
377,P239167,Non-Essential,No,No,No,No,No,No,No,No,...,?,?,?,?,?,?,?,?,?,?
378,P240327,Non-Essential,No,No,No,No,No,No,No,No,...,?,?,?,?,?,?,?,?,?,?
379,P235928,Essential,No,No,No,No,No,No,No,No,...,?,?,?,?,?,?,?,?,?,?
