This notebook engineers features, fills missing values, and provides a way to impute missing values of features using intraclass modes.

In [1]:
LOCAL = True
data_fpath = '../data/raw/' if LOCAL else '/kaggle/input/protein-localization/'
out_fpath = '../data/intermediate/' if LOCAL else ''

In [2]:
%load_ext autoreload
%autoreload 2
import data_tools

In [3]:
import lightgbm as lgb
import pandas as pd
import numpy as np
import scipy as sp
import pickle
import seaborn as sns
from random import choice

In [4]:
from imblearn.over_sampling import RandomOverSampler

# Data Types
There are a few main “types” of features available, listed here:
* ESSENTIAL
* CLASS
* COMPLEX
* PHENOTYPE
* MOTIF
* Chromosome
* NUM INTERACTING WITH FUNCTION (int)
* INTERACTING PROTEIN type
* INTERACTING PROTEIN corr (float)
* Function
* Localization

Pretty much all are categorical except the last one.

In [5]:
field_descriptions_fpath = data_tools.field_descriptions_fpath
fields = data_tools.parse_field_descriptions(field_descriptions_fpath)
fields[[0]].head()

Unnamed: 0,0
0,protein
1,essential
2,class actin related proteins
3,class actins
4,class adaptins


In [6]:
# Datatype Specification
def feat_dtype(col_num, ints, floats):
    if col_num in ints:
        return 'int'
    if col_num in floats:
        return 'float'
    # We assume everything else is categorical
    return 'category'

float_indices = fields[0].str.contains("interacting protein") & fields[0].str.contains("corr")
int_indices = fields[0].str.contains("num interacting")

### 444 := chromosome #, coerce to float, so we fill missing, then turn into category later
float_feats = set(fields[[0]][float_indices].index) - {0, 2960} | {444}
int_feats = set(fields[[0]][int_indices].index) - {0, 2960}

dtypes = {col_num : feat_dtype(col_num, int_feats, float_feats) for col_num in range(1,2961)}

Saving the data type dictionary so we can load it later when loading the DataFrame.

In [7]:
# Save the data type dictionary to pickle
with open(f'{out_fpath}data_types_dict.pkl', 'wb') as handle:
    pickle.dump(dtypes, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [8]:
with open(f'{out_fpath}data_types_dict.pkl', 'rb') as handle:
    dtypes = pickle.load(handle)

## Load Dataframes

### Load Training DataFrame

In [9]:
df = pd.read_csv(f"{data_fpath}train.csv", header=None)
df = df.replace("?", np.nan)  # Replace ? mark with NaN
df = df.astype(dtypes)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [10]:
testdf = pd.read_csv(f"{data_fpath}test.csv", header=None)
testdf = testdf.replace("?", np.nan)  # Replace ? mark with NaN
dtypes.pop(2960, None)  # Pop target from data types
testdf = testdf.astype(dtypes)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


### Missing Value Analysis

In [11]:
# Missing Values
df[df.isna().any()[lambda x: x].index].isna().sum()

1        44
444       1
534       4
566       1
674       1
       ... 
2940    862
2941    862
2942    862
2943    862
2944    862
Length: 815, dtype: int64

In [12]:
# Missing Values
testdf[testdf.isna().any()[lambda x: x].index].isna().sum()

1        17
444      59
463       1
475       1
483       1
       ... 
2955    381
2956    381
2957    381
2958    381
2959    381
Length: 158, dtype: int64

It seems that for column 1 and 444 which are fairly important features, the number of missing values in training data is fairly small. It is worth imputing them with the most common value in their class so we can use SMOTE later.

In [13]:
def impute_by_class_mode(df, col, target_col=2960):
    '''Imputes a column with missing values by using
    the mode of that feature within the class.
    df : full dataframe with target_col
    col : column to impute
    '''
    # Create a map of class to mode of feature in col
    mode_map = df.loc[:, [col, target_col]].groupby(target_col)[col].agg(pd.Series.mode).to_dict()
    # Make all values lists in case ties
    mode_map = {k : np.asarray(v).tolist() for k, v in mode_map.items()}
    mode_map = {k : [v] if not isinstance(v, list) else v for k, v in mode_map.items()}
    # Make copy of column to impute
    col_to_impute = df[col].copy()
    # Identify rows with missing values
    missing_idxs = col_to_impute.isna()[lambda x: x].index
    col_to_impute.iloc[missing_idxs] = df.iloc[missing_idxs, target_col].apply(lambda x: choice(mode_map[x]))
    return col_to_impute


In [14]:
df[1] = impute_by_class_mode(df, 1)
df[444] = impute_by_class_mode(df, 444)

In [15]:
# Missing Values
df[df.isna().any()[lambda x: x].index].isna().sum()

534       4
566       1
674       1
720       9
732       1
       ... 
2940    862
2941    862
2942    862
2943    862
2944    862
Length: 813, dtype: int64

## Protein Interactions File

In [16]:
interaction_colnames = ['protein1', 'protein2', 'type', 'strength']
df2 = pd.read_csv(f"{data_fpath}protein_interactions.csv", header=None, names=interaction_colnames,
    dtype={
        'type' : 'category',
    }
)
df2.shape

(910, 4)

In [17]:
# The strength values have a random period at
# the end of the values preventing it from being parsed as numeric
df2['strength'] = df2['strength'].str.rstrip('.').replace("?", np.nan)
df2['strength'] = pd.to_numeric(df2['strength'].str.rstrip('.'))

In [18]:
df2.head()

Unnamed: 0,protein1,protein2,type,strength
0,P238510,P239467,Genetic,0.252653
1,P235550,P239467,Physical,0.709248
2,P235621,P239467,Physical,-0.001239
3,P235265,P239467,Physical,0.482255
4,P234935,P234445,Physical,-0.460856


## Protein Interactions Features

In [19]:
# ppi := protein-protein interactions
qry = fields[0].str.contains("interacting protein")
ppi_features = set(fields[[0]][qry].index) - {0}
qry_corr = fields[0].str.contains("corr")
qry_type = fields[0].str.contains("type")

ppi_corr_features = set(fields[[0]][qry & qry_corr].index) - {0}
ppi_type_features = set(fields[[0]][qry & qry_type].index) - {0}
[len(x) for x in (ppi_features, ppi_corr_features, ppi_type_features)]

[2486, 1243, 1243]

In [20]:
def corr_feat(protein : str):
    '''E.g.: interacting protein p235094 corr'''
    return f"interacting protein {protein.lower()} corr"

def type_feat(protein : str):
    '''E.g.: interacting protein p235094 corr'''
    return f"interacting protein {protein.lower()} type"

In [21]:
data_tools.feature_name(fields, 445)  # Example interaction feature

'num interacting with function "cell growth'

In [22]:
# Map from feature name to column
feat_to_col = data_tools.feat_to_col_map(data_tools.field_descriptions_fpath)

In [23]:
feat_to_col[corr_feat('P238510')]

830

In [24]:
data_tools.feature_name(fields, 2940)

'interacting protein p235082 corr'

Need a way to fill the protein interaction cells with the type.

This protein is not present in the dataset at all! So how can we use the PPI features? Possible features to engineer:
* Sum/min/max/mean/#negof interactions corr
* mode of interactions
* percent of interactions that are genetic
* meta feature: mode of the CLASS of proteins that interact (data leakage)

In [25]:
# Derive Features from PPI Correlation
df['interaction_sum'] = df.loc[:, ppi_corr_features].sum(axis=1)
df['interaction_mean'] = df.loc[:, ppi_corr_features].mean(axis=1)
df['interaction_max'] = df.loc[:, ppi_corr_features].max(axis=1)
df['interaction_max2'] = df.loc[:, ppi_corr_features].apply(lambda row: row.nlargest(2).values[-1], axis=1)
df['interaction_max3'] = df.loc[:, ppi_corr_features].apply(lambda row: row.nlargest(3).values[-1], axis=1)
df['interaction_min'] = df.loc[:, ppi_corr_features].min(axis=1)
df['interaction_neg'] = df.loc[:, ppi_corr_features].lt(0).sum(axis=1)
df['interaction_count'] = (df.loc[:, ppi_corr_features] != 0).sum(axis=1)
df['interaction_count'] = df['interaction_count'] - min(df['interaction_count'])
df['interaction_pos'] = df.loc[:, ppi_corr_features].gt(0).sum(axis=1)
df['interaction_std'] = df.loc[:, ppi_corr_features].std(axis=1)
df['interaction_skew'] = df.loc[:, ppi_corr_features].apply(
    lambda row: sp.stats.skew(row, nan_policy='omit'), axis=1).astype(float)


In [26]:
# Derive Features from PPI Type
df['ppi_genetic_count'] = df.loc[:, ppi_type_features].apply(
    lambda row: row.astype(str).str.count("Genetic").sum(), axis=1)
df['ppi_physical_count'] = df.loc[:, ppi_type_features].apply(
    lambda row: row.astype(str).str.count("Physical").sum(), axis=1)
df['ppi_gen_phys_count'] = df.loc[:, ppi_type_features].apply(
    lambda row: row.astype(str).str.count("Genetic-Physical").sum(), axis=1)
df['ppi_genetic_physical_ratio'] = df['ppi_genetic_count'] / (1 + df['ppi_physical_count'])
ppi_type_feats = ['ppi_genetic_count', 'ppi_physical_count', 'ppi_gen_phys_count']
df['ppi_dom_type'] = df[ppi_type_feats].idxmax(axis='columns').astype('category')

In [27]:
# Get proteins of highest correlation, since that is the point of this dataset
# Note that if all ties, we return the first index
df['max_corr_ppi'] = df[ppi_corr_features].idxmax(axis='columns').astype('category')
df['min_corr_ppi'] = df[ppi_corr_features].idxmax(axis='columns').astype('category')

### Function Features

In [28]:
function_feats_qry = fields[0].str.contains("function")
function_feats = fields[[0]][function_feats_qry].index
function_feats = function_feats[function_feats > 2900]  # Interested in the ones that are missing from test_data
function_feats

Int64Index([2945, 2946, 2947, 2948, 2949, 2950, 2951, 2952, 2953, 2954, 2955,
            2956, 2957, 2958],
           dtype='int64')

In [29]:
testdf[function_feats].replace('?', np.nan).sum().sum()

0.0

These features are all missing in test data! Best to drop them from training data unless we can fill it.

### Pipeline Altogether

In [30]:
def data_pipeline(df, fields, dtypes, impute=False, target_col=None, seed=None):
    '''Takes a DataFrame and returns features to pass into model.'''
    # Handle Missing Values
    # TODO: Try using zeros
    # LightGBM should handle NasNs though
    df = df.replace("?", np.nan)  # Replace ? mark with NaN
    
    # Convert to correct data types
    if target_col is None:
        dtypes.pop(2960)  # labels aren't in test data
    df = df.astype(dtypes)
    
    # Impute Missing Values
    # I've selected these columns very carefully
    impute_cols = [1, 444]
    for col in impute_cols:
        if impute:
            # Impute the column
            df[col] = impute_by_class_mode(df, col)
        # Convert back to categorical
        df[col] = df[col].astype('category')
    
    # Identify columns corresponding to PPI features
    # PPI := protein-protein interactions
    qry = fields[0].str.contains("interacting protein")
    ppi_features = set(fields[[0]][qry].index) - {0}
    qry_corr = fields[0].str.contains("corr")
    qry_type = fields[0].str.contains("type")

    ppi_corr_features = set(fields[[0]][qry & qry_corr].index) - {0}
    ppi_type_features = set(fields[[0]][qry & qry_type].index) - {0}
    
    # Derive Features from PPI Correlation
    df['interaction_sum'] = df.loc[:, ppi_corr_features].sum(axis=1)
    df['interaction_mean'] = df.loc[:, ppi_corr_features].mean(axis=1)
    df['interaction_max'] = df.loc[:, ppi_corr_features].max(axis=1)
    df['interaction_max2'] = df.loc[:, ppi_corr_features].apply(lambda row: row.nlargest(2).values[-1], axis=1)
    df['interaction_max3'] = df.loc[:, ppi_corr_features].apply(lambda row: row.nlargest(3).values[-1], axis=1)
    df['interaction_min'] = df.loc[:, ppi_corr_features].min(axis=1)
    df['interaction_neg'] = df.loc[:, ppi_corr_features].lt(0).sum(axis=1)
    df['interaction_count'] = (df.loc[:, ppi_corr_features] != 0).sum(axis=1)
    df['interaction_count'] = df['interaction_count'] - min(df['interaction_count'])
    df['interaction_pos'] = df.loc[:, ppi_corr_features].gt(0).sum(axis=1)
    df['interaction_std'] = df.loc[:, ppi_corr_features].std(axis=1)
    df['interaction_skew'] = df.loc[:, ppi_corr_features].apply(
        lambda row: sp.stats.skew(row, nan_policy='omit'), axis=1).astype(float)

    # Derive Features from PPI Type
    df['ppi_genetic_count'] = df.loc[:, ppi_type_features].apply(
        lambda row: row.astype(str).str.count("Genetic").sum(), axis=1)
    df['ppi_physical_count'] = df.loc[:, ppi_type_features].apply(
        lambda row: row.astype(str).str.count("Physical").sum(), axis=1)
    df['ppi_gen_phys_count'] = df.loc[:, ppi_type_features].apply(
        lambda row: row.astype(str).str.count("Genetic-Physical").sum(), axis=1)
    df['ppi_genetic_physical_ratio'] = df['ppi_genetic_count'] / (1 + df['ppi_physical_count'])
    ppi_type_feats = ['ppi_genetic_count', 'ppi_physical_count', 'ppi_gen_phys_count']
    df['ppi_dom_type'] = df[ppi_type_feats].idxmax(axis='columns').astype('category')
    
    # Get proteins of highest correlation, since that is the point of this dataset
    # Note that if all ties, we return the first index
    df['max_corr_ppi'] = df[ppi_corr_features].idxmax(axis='columns').astype('category')
    df['min_corr_ppi'] = df[ppi_corr_features].idxmax(axis='columns').astype('category')
    
    # Drop Function Features
    function_feats_qry = fields[0].str.contains("function")
    function_feats = fields[[0]][function_feats_qry].index
    function_feats = set(function_feats[function_feats > 2900])
    
    # Use only selected features
    X = df[set(df.columns) - {target_col, 0, 2959} - ppi_features - function_feats]
    if target_col is not None:
        y = df[target_col]
    
    # Return Datasets
    if target_col is not None:
        return X, y
    return X

### Apply Pipeline to Training Data

In [31]:
assert True  # Stop running notebook here

In [32]:
df = pd.read_csv(f"{data_fpath}train.csv", header=None)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [33]:
with open(f'{out_fpath}data_types_dict.pkl', 'rb') as handle:
    dtypes = pickle.load(handle)

In [34]:
X, y = data_pipeline(df, fields, dtypes, impute=True, target_col=2960, seed=42)

In [35]:
X.isna().any().any()

False

In [36]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 862 entries, 0 to 861
Columns: 476 entries, 1 to interaction_max3
dtypes: category(447), float64(9), int64(20)
memory usage: 649.0 KB


### Apply Pipeline to Test Data

In [37]:
testdf = pd.read_csv(f"{data_fpath}test.csv", header=None)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [38]:
with open(f'{out_fpath}data_types_dict.pkl', 'rb') as handle:
    dtypes = pickle.load(handle)

In [39]:
X_kaggle = data_pipeline(testdf, fields, dtypes, target_col=None, seed=42)

In [40]:
X_kaggle.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381 entries, 0 to 380
Columns: 476 entries, 1 to interaction_max3
dtypes: category(447), float64(9), int64(20)
memory usage: 325.9 KB


# Saving output

In [41]:
X.to_pickle(f"{out_fpath}X.pkl")
y.to_pickle(f"{out_fpath}y.pkl")

In [42]:
X_kaggle.to_pickle(f"{out_fpath}X_kaggle.pkl")