In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the train and test feature tables (train first, then test) from the
# current working directory.
train_features, test_features = (
    pd.read_csv(csv_name)
    for csv_name in ('train_features.csv', 'test_features.csv')
)

In [4]:
# Names of the raw gene ("g-<n>") and cell ("c-<n>") feature columns.
#
# The numeric-suffix check matters: fe_stats() writes columns such as
# "c-0_squared" and "g-574_squared", which also start with "g-"/"c-". A bare
# prefix match would absorb those engineered columns if this cell were re-run
# on a frame that already carries them, silently changing every statistic
# computed from these lists.
# NOTE(review): assumes every raw feature column is "<prefix><digits>", which
# holds for all column names referenced in this notebook -- confirm against
# the CSV header.
GENES = [g for g in train_features.columns if g.startswith("g-") and g[2:].isdigit()]
CELLS = [c for c in train_features.columns if c.startswith("c-") and c[2:].isdigit()]

In [3]:
# Feature engineering: gene columns singled out as important; fe_stats() adds
# a "<column>_squared" feature for each of them. Stored as numeric suffixes
# and expanded to the "g-<n>" column names, in the same order.
_GSQUARE_SUFFIXES = (
    574, 211, 216, 0, 255, 577, 153, 389, 60, 370,
    248, 167, 203, 177, 301, 332, 517, 6, 744, 224,
    162, 3, 736, 486, 283, 22, 359, 361, 440, 335,
    106, 307, 745, 146, 416, 298, 666, 91, 17, 549,
    145, 157, 768, 568, 396,
)
gsquarecols = [f'g-{suffix}' for suffix in _GSQUARE_SUFFIXES]

In [5]:
# (output name, first column, second column) for each averaged pair of
# strongly correlated cell columns, in output-column order.
_CELL_PAIR_FEATURES = (
    ('c62_c42', 'c-62', 'c-42'),
    ('c90_c55', 'c-90', 'c-55'),
    ('c26_c38', 'c-26', 'c-38'),
    ('c63_c42', 'c-63', 'c-42'),
    ('c26_c13', 'c-26', 'c-13'),
    ('c33_c6', 'c-33', 'c-6'),
    ('c11_c55', 'c-11', 'c-55'),
    ('c55_c4', 'c-55', 'c-4'),
    ('c4_c13', 'c-4', 'c-13'),
    ('c82_c42', 'c-82', 'c-42'),
    ('c66_c42', 'c-66', 'c-42'),
    ('c6_c38', 'c-6', 'c-38'),
    ('c2_c13', 'c-2', 'c-13'),
    ('c94_c11', 'c-94', 'c-11'),
    ('c94_c60', 'c-94', 'c-60'),
    ('c55_c42', 'c-55', 'c-42'),
    ('c52_c42', 'c-52', 'c-42'),
    ('c13_c73', 'c-13', 'c-73'),
    ('c38_c63', 'c-38', 'c-63'),
    ('c38_c94', 'c-38', 'c-94'),
    ('c13_c94', 'c-13', 'c-94'),
    ('c4_c52', 'c-4', 'c-52'),
    ('c4_c42', 'c-4', 'c-42'),
    ('c13_c38', 'c-13', 'c-38'),
    ('c55_c2', 'c-55', 'c-2'),
    ('c90_c13', 'c-90', 'c-13'),
    ('c85_c31', 'c-85', 'c-31'),
)

# Same layout for the averaged pairs of strongly correlated gene columns.
_GENE_PAIR_FEATURES = (
    ('g37_g50', 'g-37', 'g-50'),
    # NOTE(review): the output name has a stray hyphen ("g-569"), unlike every
    # other pair; kept as-is so the exported column names do not change.
    ('g369_g-569', 'g-369', 'g-569'),
    # NOTE(review): the output name says g460 but the inputs are g-469 and
    # g-349 -- either the name or the column is a typo; kept as-is, confirm.
    ('g349_g460', 'g-469', 'g-349'),
    ('g50_g489', 'g-50', 'g-489'),
    ('g37_g489', 'g-37', 'g-489'),
    ('g63_g195', 'g-63', 'g-195'),
    ('g121_g672', 'g-121', 'g-672'),
    ('g37_g672', 'g-37', 'g-672'),
)


def _row_stats(frame, prefix):
    """Return row-wise sum/mean/std/kurtosis/skew of `frame`, keyed '<prefix>_<stat>'."""
    return {
        f'{prefix}_sum': frame.sum(axis=1),
        f'{prefix}_mean': frame.mean(axis=1),
        f'{prefix}_std': frame.std(axis=1),
        f'{prefix}_kurt': frame.kurtosis(axis=1),
        f'{prefix}_skew': frame.skew(axis=1),
    }


def _engineer(df, features_g, features_c, square_genes):
    """Return a new frame: `df`'s columns followed by the engineered ones."""
    new_cols = {}

    # Row-wise statistics over the gene block, the cell block, and both.
    new_cols.update(_row_stats(df[features_g], 'g'))
    new_cols.update(_row_stats(df[features_c], 'c'))
    new_cols.update(_row_stats(df[features_g + features_c], 'gc'))

    # Mean of each hand-picked pair of correlated columns.
    for name, first, second in _CELL_PAIR_FEATURES + _GENE_PAIR_FEATURES:
        new_cols[name] = (df[first] + df[second]) / 2

    # Squares of every cell column and of the selected gene columns.
    for feature in features_c:
        new_cols[f'{feature}_squared'] = df[feature] ** 2
    for feature in square_genes:
        new_cols[f'{feature}_squared'] = df[feature] ** 2

    # One concat instead of ~200 single-column inserts: avoids pandas'
    # "DataFrame is highly fragmented" PerformanceWarning.
    engineered = pd.concat(new_cols, axis=1)

    # If the input already carries engineered columns (e.g. it was processed
    # before), replace them rather than producing duplicate column names.
    stale = [col for col in engineered.columns if col in df.columns]
    return pd.concat([df.drop(columns=stale), engineered], axis=1)


def fe_stats(train, test, features_g=None, features_c=None, square_genes=None):
    """Add statistical, pair-average and squared features to both splits.

    For each input frame the result contains, after the original columns:
      * row-wise sum/mean/std/kurtosis/skew over the gene columns (``g_*``),
        the cell columns (``c_*``) and both together (``gc_*``);
      * the mean of each column pair listed in ``_CELL_PAIR_FEATURES`` and
        ``_GENE_PAIR_FEATURES``;
      * ``<col>_squared`` for every cell column and every column in
        ``square_genes``.

    The inputs are NOT modified; new frames are returned. Mutating the
    caller's frames made the returned objects aliases of the inputs and let
    the ``c-*_squared`` / ``g-*_squared`` columns leak into any later
    prefix-based column selection on the raw frame.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing the gene/cell columns used below.
    features_g, features_c : sequence of str, optional
        Gene / cell column names. Default to the module-level ``GENES`` and
        ``CELLS`` lists.
    square_genes : sequence of str, optional
        Gene columns to square. Defaults to the module-level ``gsquarecols``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Engineered copies of ``train`` and ``test``, in that order.

    Raises
    ------
    KeyError
        Raised by pandas if a required column is missing from a frame.
    """
    # Resolve defaults at call time so the function still works when the
    # lists are (re)defined in another cell after this definition.
    features_g = list(GENES if features_g is None else features_g)
    features_c = list(CELLS if features_c is None else features_c)
    square_genes = list(gsquarecols if square_genes is None else square_genes)

    train_out = _engineer(train, features_g, features_c, square_genes)
    test_out = _engineer(test, features_g, features_c, square_genes)
    return train_out, test_out

# Apply fe_stats to both feature tables and keep the returned (train, test) pair.
train_feature_new, test_feature_new = fe_stats(train_features, test_features)

In [6]:
# Write both engineered tables to CSV (train first, then test). to_csv is
# called with its defaults, so the row index is written as the first column.
for engineered_frame, out_name in (
    (train_feature_new, 'train_feats_new.csv'),
    (test_feature_new, 'test_feats_new.csv'),
):
    engineered_frame.to_csv(out_name)