### Exploratory Data Analysis
---

#### Problem statement
---
##### > Finding protein subsets that distinguish between the classes is the goal.

In [1]:
INPUTDATA="/home/miki/Desktop/Docker/iNeuron/mice-protien-expression/data/raw/Data_Cortex_Nuclear.xlsx"

In [2]:
import os
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Show plots in jupyter notebook
%matplotlib inline
# Set plot style
sns.set(color_codes=True)

In [3]:
def import_data(path: str = None, columns: list = None):
    """
    Function to import a dataset into 
    the working environment
    --------------------------------
    Parameter
    ----------
    path : str
    Usually a string with the path of the data
    --------------------------------
    Returns
    -------
    data : dataframe
    Returns a pandas dataframe
     """
    data = pd.read_excel(path)
    if columns:
        data = data.drop(columns, axis = 1)
    return data

In [4]:
col = ['Genotype', 'Treatment', 'Behavior']
df = import_data(INPUTDATA, col)
df.head(3)

  warn(msg)


Unnamed: 0,MouseID,DYRK1A_N,ITSN1_N,BDNF_N,NR1_N,NR2A_N,pAKT_N,pBRAF_N,pCAMKII_N,pCREB_N,...,BAD_N,BCL2_N,pS6_N,pCFOS_N,SYP_N,H3AcK18_N,EGR1_N,H3MeK4_N,CaNA_N,class
0,309_1,0.503644,0.747193,0.430175,2.816329,5.990152,0.21883,0.177565,2.373744,0.232224,...,0.122652,,0.106305,0.108336,0.427099,0.114783,0.13179,0.128186,1.675652,c-CS-m
1,309_2,0.514617,0.689064,0.41177,2.789514,5.685038,0.211636,0.172817,2.29215,0.226972,...,0.116682,,0.106592,0.104315,0.441581,0.111974,0.135103,0.131119,1.74361,c-CS-m
2,309_3,0.509183,0.730247,0.418309,2.687201,5.622059,0.209011,0.175722,2.283337,0.230247,...,0.118508,,0.108303,0.106219,0.435777,0.111883,0.133362,0.127431,1.926427,c-CS-m


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1095 entries, 0 to 1094
Data columns (total 79 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   MouseID          1095 non-null   object 
 1   DYRK1A_N         1092 non-null   float64
 2   ITSN1_N          1092 non-null   float64
 3   BDNF_N           1092 non-null   float64
 4   NR1_N            1092 non-null   float64
 5   NR2A_N           1092 non-null   float64
 6   pAKT_N           1092 non-null   float64
 7   pBRAF_N          1092 non-null   float64
 8   pCAMKII_N        1092 non-null   float64
 9   pCREB_N          1092 non-null   float64
 10  pELK_N           1092 non-null   float64
 11  pERK_N           1092 non-null   float64
 12  pJNK_N           1092 non-null   float64
 13  PKCA_N           1092 non-null   float64
 14  pMEK_N           1092 non-null   float64
 15  pNR1_N           1092 non-null   float64
 16  pNR2A_N          1092 non-null   float64
 17  pNR2B_N       

In [6]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
DYRK1A_N,1092.0,0.429418,0.249792,0.145327,0.289128,0.368134,0.492861,2.516367
ITSN1_N,1092.0,0.619503,0.251025,0.245359,0.474159,0.568502,0.702111,2.602662
BDNF_N,1092.0,0.319422,0.049246,0.115181,0.288131,0.316923,0.348235,0.497160
NR1_N,1092.0,2.280357,0.373589,0.964543,2.050378,2.291166,2.526946,3.757641
NR2A_N,1092.0,3.822367,0.945597,1.711370,3.129370,3.722900,4.408332,8.482553
...,...,...,...,...,...,...,...,...
SYP_N,1080.0,0.446073,0.066432,0.258626,0.398082,0.448459,0.490773,0.759588
H3AcK18_N,900.0,0.169609,0.059402,0.079691,0.125848,0.158240,0.197876,0.479763
EGR1_N,870.0,0.183135,0.040406,0.105537,0.155121,0.174935,0.204542,0.360692
H3MeK4_N,810.0,0.205440,0.055514,0.101787,0.165143,0.193994,0.235215,0.413903


In [7]:
df.isnull().sum().T

MouseID        0
DYRK1A_N       3
ITSN1_N        3
BDNF_N         3
NR1_N          3
            ... 
H3AcK18_N    195
EGR1_N       225
H3MeK4_N     285
CaNA_N        15
class          0
Length: 79, dtype: int64

In [8]:

def null_test(data: pd.DataFrame=None):
    """ 
    A function to fill the Null values with
    the mean of our dataset
    ---------
    Parameters
    param: pandas dataframe
    ----------
    Returns
    a dataframe cleaned of missing values
    -----------
    """
    
    if data.isnull().any().any():
        data.fillna(df.median(), inplace=True)
    else:
        print("No missing values encountered")  
    return data

In [9]:
df = null_test(df)
df.info();

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1095 entries, 0 to 1094
Data columns (total 79 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   MouseID          1095 non-null   object 
 1   DYRK1A_N         1095 non-null   float64
 2   ITSN1_N          1095 non-null   float64
 3   BDNF_N           1095 non-null   float64
 4   NR1_N            1095 non-null   float64
 5   NR2A_N           1095 non-null   float64
 6   pAKT_N           1095 non-null   float64
 7   pBRAF_N          1095 non-null   float64
 8   pCAMKII_N        1095 non-null   float64
 9   pCREB_N          1095 non-null   float64
 10  pELK_N           1095 non-null   float64
 11  pERK_N           1095 non-null   float64
 12  pJNK_N           1095 non-null   float64
 13  PKCA_N           1095 non-null   float64
 14  pMEK_N           1095 non-null   float64
 15  pNR1_N           1095 non-null   float64
 16  pNR2A_N          1095 non-null   float64
 17  pNR2B_N       

  data.fillna(df.median(), inplace=True)


In [10]:
df.isnull().sum()

MouseID      0
DYRK1A_N     0
ITSN1_N      0
BDNF_N       0
NR1_N        0
            ..
H3AcK18_N    0
EGR1_N       0
H3MeK4_N     0
CaNA_N       0
class        0
Length: 79, dtype: int64

In [11]:
# Classes:
# c-CS-s: control mice, stimulated to learn, injected with saline (9 mice)
# c-CS-m: control mice, stimulated to learn, injected with memantine (10 mice)
# c-SC-s: control mice, not stimulated to learn, injected with saline (9 mice)
# c-SC-m: control mice, not stimulated to learn, injected with memantine (10 mice)

# t-CS-s: trisomy mice, stimulated to learn, injected with saline (7 mice)
# t-CS-m: trisomy mice, stimulated to learn, injected with memantine (9 mice)
# t-SC-s: trisomy mice, not stimulated to learn, injected with saline (9 mice)
# t-SC-m: trisomy mice, not stimulated to learn, injected with memantine (9 mice)

In [12]:
df.head(5)

Unnamed: 0,MouseID,DYRK1A_N,ITSN1_N,BDNF_N,NR1_N,NR2A_N,pAKT_N,pBRAF_N,pCAMKII_N,pCREB_N,...,BAD_N,BCL2_N,pS6_N,pCFOS_N,SYP_N,H3AcK18_N,EGR1_N,H3MeK4_N,CaNA_N,class
0,309_1,0.503644,0.747193,0.430175,2.816329,5.990152,0.21883,0.177565,2.373744,0.232224,...,0.122652,0.129468,0.106305,0.108336,0.427099,0.114783,0.13179,0.128186,1.675652,c-CS-m
1,309_2,0.514617,0.689064,0.41177,2.789514,5.685038,0.211636,0.172817,2.29215,0.226972,...,0.116682,0.129468,0.106592,0.104315,0.441581,0.111974,0.135103,0.131119,1.74361,c-CS-m
2,309_3,0.509183,0.730247,0.418309,2.687201,5.622059,0.209011,0.175722,2.283337,0.230247,...,0.118508,0.129468,0.108303,0.106219,0.435777,0.111883,0.133362,0.127431,1.926427,c-CS-m
3,309_4,0.442107,0.617076,0.358626,2.466947,4.979503,0.222886,0.176463,2.152301,0.207004,...,0.132781,0.129468,0.103184,0.111262,0.391691,0.130405,0.147444,0.146901,1.700563,c-CS-m
4,309_5,0.43494,0.61743,0.358802,2.365785,4.718679,0.213106,0.173627,2.134014,0.192158,...,0.129954,0.129468,0.104784,0.110694,0.434154,0.118481,0.140314,0.14838,1.83973,c-CS-m


In [13]:
da = df.drop(['MouseID'], axis=1)
da

Unnamed: 0,DYRK1A_N,ITSN1_N,BDNF_N,NR1_N,NR2A_N,pAKT_N,pBRAF_N,pCAMKII_N,pCREB_N,pELK_N,...,BAD_N,BCL2_N,pS6_N,pCFOS_N,SYP_N,H3AcK18_N,EGR1_N,H3MeK4_N,CaNA_N,class
0,0.503644,0.747193,0.430175,2.816329,5.990152,0.218830,0.177565,2.373744,0.232224,1.750936,...,0.122652,0.129468,0.106305,0.108336,0.427099,0.114783,0.131790,0.128186,1.675652,c-CS-m
1,0.514617,0.689064,0.411770,2.789514,5.685038,0.211636,0.172817,2.292150,0.226972,1.596377,...,0.116682,0.129468,0.106592,0.104315,0.441581,0.111974,0.135103,0.131119,1.743610,c-CS-m
2,0.509183,0.730247,0.418309,2.687201,5.622059,0.209011,0.175722,2.283337,0.230247,1.561316,...,0.118508,0.129468,0.108303,0.106219,0.435777,0.111883,0.133362,0.127431,1.926427,c-CS-m
3,0.442107,0.617076,0.358626,2.466947,4.979503,0.222886,0.176463,2.152301,0.207004,1.595086,...,0.132781,0.129468,0.103184,0.111262,0.391691,0.130405,0.147444,0.146901,1.700563,c-CS-m
4,0.434940,0.617430,0.358802,2.365785,4.718679,0.213106,0.173627,2.134014,0.192158,1.504230,...,0.129954,0.129468,0.104784,0.110694,0.434154,0.118481,0.140314,0.148380,1.839730,c-CS-m
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1090,0.254860,0.463591,0.254860,2.092082,2.600035,0.211736,0.171262,2.483740,0.207317,1.057971,...,0.190483,0.129468,0.115806,0.183324,0.374088,0.318782,0.204660,0.328327,1.364823,t-SC-s
1091,0.272198,0.474163,0.251638,2.161390,2.801492,0.251274,0.182496,2.512737,0.216339,1.081150,...,0.190463,0.129468,0.113614,0.175674,0.375259,0.325639,0.200415,0.293435,1.364478,t-SC-s
1092,0.228700,0.395179,0.234118,1.733184,2.220852,0.220665,0.161435,1.989723,0.185164,0.884342,...,0.216682,0.129468,0.118948,0.158296,0.422121,0.321306,0.229193,0.355213,1.430825,t-SC-s
1093,0.221242,0.412894,0.243974,1.876347,2.384088,0.208897,0.173623,2.086028,0.192044,0.922595,...,0.222263,0.129468,0.125295,0.196296,0.397676,0.335936,0.251317,0.365353,1.404031,t-SC-s


In [14]:
list(da.columns)

['DYRK1A_N',
 'ITSN1_N',
 'BDNF_N',
 'NR1_N',
 'NR2A_N',
 'pAKT_N',
 'pBRAF_N',
 'pCAMKII_N',
 'pCREB_N',
 'pELK_N',
 'pERK_N',
 'pJNK_N',
 'PKCA_N',
 'pMEK_N',
 'pNR1_N',
 'pNR2A_N',
 'pNR2B_N',
 'pPKCAB_N',
 'pRSK_N',
 'AKT_N',
 'BRAF_N',
 'CAMKII_N',
 'CREB_N',
 'ELK_N',
 'ERK_N',
 'GSK3B_N',
 'JNK_N',
 'MEK_N',
 'TRKA_N',
 'RSK_N',
 'APP_N',
 'Bcatenin_N',
 'SOD1_N',
 'MTOR_N',
 'P38_N',
 'pMTOR_N',
 'DSCR1_N',
 'AMPKA_N',
 'NR2B_N',
 'pNUMB_N',
 'RAPTOR_N',
 'TIAM1_N',
 'pP70S6_N',
 'NUMB_N',
 'P70S6_N',
 'pGSK3B_N',
 'pPKCG_N',
 'CDK5_N',
 'S6_N',
 'ADARB1_N',
 'AcetylH3K9_N',
 'RRP1_N',
 'BAX_N',
 'ARC_N',
 'ERBB4_N',
 'nNOS_N',
 'Tau_N',
 'GFAP_N',
 'GluR3_N',
 'GluR4_N',
 'IL1B_N',
 'P3525_N',
 'pCASP9_N',
 'PSD95_N',
 'SNCA_N',
 'Ubiquitin_N',
 'pGSK3B_Tyr216_N',
 'SHH_N',
 'BAD_N',
 'BCL2_N',
 'pS6_N',
 'pCFOS_N',
 'SYP_N',
 'H3AcK18_N',
 'EGR1_N',
 'H3MeK4_N',
 'CaNA_N',
 'class']

In [15]:
# import scipy.stats as stats
# #find Q1, Q3, and interquartile range for each column
# def outlier_remove(x = None, columns = None):
#     """"""
#     Q1 = x[columns].quantile(q=.25)
#     Q3 = x[columns].quantile(q=.75)
#     IQR = Q3 - Q1
#     #only keep rows in dataframe that have values within 1.5*IQR of Q1 and Q3
#     outlier_data = ~((x[columns] < (Q1-1.5*IQR)) | (x[columns] > (Q3+1.5*IQR))).any(axis=1)
#     #find how many rows are left in the dataframe 
#     filter_data = x[outlier_data]
#     return filter_data

In [16]:
columns = ['DYRK1A_N','ITSN1_N','BDNF_N','NR1_N','NR2A_N','pAKT_N','pBRAF_N','pCAMKII_N','pCREB_N', 'pELK_N',
 'pERK_N', 'pJNK_N', 'PKCA_N','pMEK_N','pNR1_N','pNR2A_N','pNR2B_N','pPKCAB_N','pRSK_N','AKT_N','BRAF_N','CAMKII_N',
 'CREB_N','ELK_N','ERK_N','GSK3B_N','JNK_N','MEK_N','TRKA_N','RSK_N','APP_N','Bcatenin_N','SOD1_N','MTOR_N',
 'P38_N','pMTOR_N','DSCR1_N','AMPKA_N','NR2B_N','pNUMB_N','RAPTOR_N','TIAM1_N','pP70S6_N','NUMB_N','P70S6_N','pGSK3B_N',
 'pPKCG_N','CDK5_N', 'S6_N', 'ADARB1_N', 'AcetylH3K9_N', 'RRP1_N', 'BAX_N', 'ARC_N', 'ERBB4_N', 'nNOS_N', 'Tau_N',
 'GFAP_N','GluR3_N', 'GluR4_N', 'IL1B_N', 'P3525_N', 'pCASP9_N', 'PSD95_N', 'SNCA_N', 'Ubiquitin_N', 'pGSK3B_Tyr216_N', 'SHH_N',
 'BAD_N', 'BCL2_N','pS6_N', 'pCFOS_N', 'SYP_N', 'H3AcK18_N', 'EGR1_N', 'H3MeK4_N','CaNA_N']
# out_da = outlier_remove(da, columns)

In [17]:
# out_da

In [18]:
def balanced_data(data: pd.DataFrame ):
    """
    function to split our datasets and 
    correct the imbalances of classes o
    bserved in the datasets"""
    from collections import Counter
    from imblearn.over_sampling import SMOTE
    from sklearn.preprocessing import LabelEncoder, MinMaxScaler
    df = data.values
    
    X, y = df[:,:-1], df[:,-1]
    # y = LabelEncoder().fit_transform(y)
    # oversample = SMOTE()
    # X, y = oversample.fit_resample(X, y)
    counter = Counter(y)
    for k,v in counter.items():
        per = v / len(y) * 100
        print('Class=%s, n=%s (%.3f%%)' % (k, v, per))
    return X, y
    # plot the distribution
    # plt.bar(counter.keys(), counter.values())
    # plt.show()

    
     

In [19]:
X, y = balanced_data(da)

Class=c-CS-m, n=150 (13.699%)
Class=c-SC-m, n=150 (13.699%)
Class=c-CS-s, n=135 (12.329%)
Class=c-SC-s, n=135 (12.329%)
Class=t-CS-m, n=150 (13.699%)
Class=t-SC-m, n=135 (12.329%)
Class=t-CS-s, n=105 (9.589%)
Class=t-SC-s, n=135 (12.329%)


In [20]:
y

array(['c-CS-m', 'c-CS-m', 'c-CS-m', ..., 't-SC-s', 't-SC-s', 't-SC-s'],
      dtype=object)

In [21]:
df = pd.DataFrame(X, columns = ['DYRK1A_N','ITSN1_N','BDNF_N','NR1_N',
 'NR2A_N',
 'pAKT_N',
 'pBRAF_N',
 'pCAMKII_N',
 'pCREB_N',
 'pELK_N',
 'pERK_N',
 'pJNK_N',
 'PKCA_N',
 'pMEK_N',
 'pNR1_N',
 'pNR2A_N',
 'pNR2B_N',
 'pPKCAB_N',
 'pRSK_N',
 'AKT_N',
 'BRAF_N',
 'CAMKII_N',
 'CREB_N',
 'ELK_N',
 'ERK_N',
 'GSK3B_N',
 'JNK_N',
 'MEK_N',
 'TRKA_N',
 'RSK_N',
 'APP_N',
 'Bcatenin_N',
 'SOD1_N',
 'MTOR_N',
 'P38_N',
 'pMTOR_N',
 'DSCR1_N',
 'AMPKA_N',
 'NR2B_N',
 'pNUMB_N',
 'RAPTOR_N',
 'TIAM1_N',
 'pP70S6_N',
 'NUMB_N',
 'P70S6_N',
 'pGSK3B_N',
 'pPKCG_N',
 'CDK5_N',
 'S6_N',
 'ADARB1_N',
 'AcetylH3K9_N',
 'RRP1_N',
 'BAX_N',
 'ARC_N',
 'ERBB4_N',
 'nNOS_N',
 'Tau_N',
 'GFAP_N',
 'GluR3_N',
 'GluR4_N',
 'IL1B_N',
 'P3525_N',
 'pCASP9_N',
 'PSD95_N',
 'SNCA_N',
 'Ubiquitin_N',
 'pGSK3B_Tyr216_N',
 'SHH_N',
 'BAD_N',
 'BCL2_N',
 'pS6_N',
 'pCFOS_N',
 'SYP_N',
 'H3AcK18_N',
 'EGR1_N',
 'H3MeK4_N',
 'CaNA_N'])

In [22]:
df 

Unnamed: 0,DYRK1A_N,ITSN1_N,BDNF_N,NR1_N,NR2A_N,pAKT_N,pBRAF_N,pCAMKII_N,pCREB_N,pELK_N,...,SHH_N,BAD_N,BCL2_N,pS6_N,pCFOS_N,SYP_N,H3AcK18_N,EGR1_N,H3MeK4_N,CaNA_N
0,0.503644,0.747193,0.430175,2.816329,5.990152,0.21883,0.177565,2.373744,0.232224,1.750936,...,0.188852,0.122652,0.129468,0.106305,0.108336,0.427099,0.114783,0.13179,0.128186,1.675652
1,0.514617,0.689064,0.41177,2.789514,5.685038,0.211636,0.172817,2.29215,0.226972,1.596377,...,0.200404,0.116682,0.129468,0.106592,0.104315,0.441581,0.111974,0.135103,0.131119,1.74361
2,0.509183,0.730247,0.418309,2.687201,5.622059,0.209011,0.175722,2.283337,0.230247,1.561316,...,0.193685,0.118508,0.129468,0.108303,0.106219,0.435777,0.111883,0.133362,0.127431,1.926427
3,0.442107,0.617076,0.358626,2.466947,4.979503,0.222886,0.176463,2.152301,0.207004,1.595086,...,0.192112,0.132781,0.129468,0.103184,0.111262,0.391691,0.130405,0.147444,0.146901,1.700563
4,0.43494,0.61743,0.358802,2.365785,4.718679,0.213106,0.173627,2.134014,0.192158,1.50423,...,0.205604,0.129954,0.129468,0.104784,0.110694,0.434154,0.118481,0.140314,0.14838,1.83973
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1090,0.25486,0.463591,0.25486,2.092082,2.600035,0.211736,0.171262,2.48374,0.207317,1.057971,...,0.275547,0.190483,0.129468,0.115806,0.183324,0.374088,0.318782,0.20466,0.328327,1.364823
1091,0.272198,0.474163,0.251638,2.16139,2.801492,0.251274,0.182496,2.512737,0.216339,1.08115,...,0.283207,0.190463,0.129468,0.113614,0.175674,0.375259,0.325639,0.200415,0.293435,1.364478
1092,0.2287,0.395179,0.234118,1.733184,2.220852,0.220665,0.161435,1.989723,0.185164,0.884342,...,0.290843,0.216682,0.129468,0.118948,0.158296,0.422121,0.321306,0.229193,0.355213,1.430825
1093,0.221242,0.412894,0.243974,1.876347,2.384088,0.208897,0.173623,2.086028,0.192044,0.922595,...,0.306701,0.222263,0.129468,0.125295,0.196296,0.397676,0.335936,0.251317,0.365353,1.404031


In [23]:
Y = pd.DataFrame(y, columns = ['Class'])

In [24]:
Y 

Unnamed: 0,Class
0,c-CS-m
1,c-CS-m
2,c-CS-m
3,c-CS-m
4,c-CS-m
...,...
1090,t-SC-s
1091,t-SC-s
1092,t-SC-s
1093,t-SC-s


In [25]:
result = pd.concat([df, Y], axis=1, join='inner')

In [26]:
result

Unnamed: 0,DYRK1A_N,ITSN1_N,BDNF_N,NR1_N,NR2A_N,pAKT_N,pBRAF_N,pCAMKII_N,pCREB_N,pELK_N,...,BAD_N,BCL2_N,pS6_N,pCFOS_N,SYP_N,H3AcK18_N,EGR1_N,H3MeK4_N,CaNA_N,Class
0,0.503644,0.747193,0.430175,2.816329,5.990152,0.21883,0.177565,2.373744,0.232224,1.750936,...,0.122652,0.129468,0.106305,0.108336,0.427099,0.114783,0.13179,0.128186,1.675652,c-CS-m
1,0.514617,0.689064,0.41177,2.789514,5.685038,0.211636,0.172817,2.29215,0.226972,1.596377,...,0.116682,0.129468,0.106592,0.104315,0.441581,0.111974,0.135103,0.131119,1.74361,c-CS-m
2,0.509183,0.730247,0.418309,2.687201,5.622059,0.209011,0.175722,2.283337,0.230247,1.561316,...,0.118508,0.129468,0.108303,0.106219,0.435777,0.111883,0.133362,0.127431,1.926427,c-CS-m
3,0.442107,0.617076,0.358626,2.466947,4.979503,0.222886,0.176463,2.152301,0.207004,1.595086,...,0.132781,0.129468,0.103184,0.111262,0.391691,0.130405,0.147444,0.146901,1.700563,c-CS-m
4,0.43494,0.61743,0.358802,2.365785,4.718679,0.213106,0.173627,2.134014,0.192158,1.50423,...,0.129954,0.129468,0.104784,0.110694,0.434154,0.118481,0.140314,0.14838,1.83973,c-CS-m
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1090,0.25486,0.463591,0.25486,2.092082,2.600035,0.211736,0.171262,2.48374,0.207317,1.057971,...,0.190483,0.129468,0.115806,0.183324,0.374088,0.318782,0.20466,0.328327,1.364823,t-SC-s
1091,0.272198,0.474163,0.251638,2.16139,2.801492,0.251274,0.182496,2.512737,0.216339,1.08115,...,0.190463,0.129468,0.113614,0.175674,0.375259,0.325639,0.200415,0.293435,1.364478,t-SC-s
1092,0.2287,0.395179,0.234118,1.733184,2.220852,0.220665,0.161435,1.989723,0.185164,0.884342,...,0.216682,0.129468,0.118948,0.158296,0.422121,0.321306,0.229193,0.355213,1.430825,t-SC-s
1093,0.221242,0.412894,0.243974,1.876347,2.384088,0.208897,0.173623,2.086028,0.192044,0.922595,...,0.222263,0.129468,0.125295,0.196296,0.397676,0.335936,0.251317,0.365353,1.404031,t-SC-s


In [27]:
result.to_csv("/home/miki/Desktop/Docker/iNeuron/mice-protien-expression/data/final/data.csv")

In [28]:
df_trisomy = result[result['Class'].str.startswith('t')]

In [29]:
df_control = result[result['Class'].str.startswith('c')]

In [30]:
df_trisomy.head()

Unnamed: 0,DYRK1A_N,ITSN1_N,BDNF_N,NR1_N,NR2A_N,pAKT_N,pBRAF_N,pCAMKII_N,pCREB_N,pELK_N,...,BAD_N,BCL2_N,pS6_N,pCFOS_N,SYP_N,H3AcK18_N,EGR1_N,H3MeK4_N,CaNA_N,Class
570,0.844219,0.94929,0.39273,1.170886,2.840236,0.25139,0.169143,2.08316,0.428417,1.614906,...,0.152313,0.129468,0.121626,0.126523,0.448459,0.15824,0.174935,0.193994,1.317441,t-CS-m
571,0.861588,0.965523,0.398587,1.181551,2.879162,0.251514,0.167592,2.15481,0.418937,1.631769,...,0.152313,0.129468,0.121626,0.126523,0.448459,0.15824,0.174935,0.193994,1.317441,t-CS-m
572,0.844082,0.967457,0.401478,1.194239,2.984791,0.254822,0.173592,2.153454,0.41108,1.626476,...,0.152313,0.129468,0.121626,0.126523,0.448459,0.15824,0.174935,0.193994,1.317441,t-CS-m
573,0.706132,0.821736,0.342455,1.065341,2.290964,0.265944,0.161957,2.205071,0.382107,1.51089,...,0.152313,0.129468,0.121626,0.126523,0.448459,0.15824,0.174935,0.193994,1.317441,t-CS-m
574,0.695193,0.842543,0.348495,1.08558,2.472372,0.268531,0.162624,2.237084,0.381065,1.493374,...,0.152313,0.129468,0.121626,0.126523,0.448459,0.15824,0.174935,0.193994,1.317441,t-CS-m


In [31]:
df_trisomy.to_csv('/home/miki/Desktop/Docker/iNeuron/mice-protien-expression/data/intermediate/trisomy.csv')

In [32]:
df_control.to_csv('/home/miki/Desktop/Docker/iNeuron/mice-protien-expression/data/intermediate/control.csv')

In [33]:
df_control.info

<bound method DataFrame.info of      DYRK1A_N   ITSN1_N    BDNF_N     NR1_N    NR2A_N    pAKT_N   pBRAF_N  \
0    0.503644  0.747193  0.430175  2.816329  5.990152   0.21883  0.177565   
1    0.514617  0.689064   0.41177  2.789514  5.685038  0.211636  0.172817   
2    0.509183  0.730247  0.418309  2.687201  5.622059  0.209011  0.175722   
3    0.442107  0.617076  0.358626  2.466947  4.979503  0.222886  0.176463   
4     0.43494   0.61743  0.358802  2.365785  4.718679  0.213106  0.173627   
..        ...       ...       ...       ...       ...       ...       ...   
565  0.291498  0.385965  0.350877  2.312126  4.232697  0.249084  0.215539   
566  0.271357  0.396405  0.356204  2.352725  4.211828  0.254928  0.186316   
567  0.253623  0.368713  0.386189  2.157502  3.840793  0.272805  0.199062   
568  0.237821  0.371795  0.342094  2.206197  3.952778  0.253205  0.200427   
569  0.236523    0.3715  0.347054  2.179691   3.91496  0.255119  0.188466   

    pCAMKII_N   pCREB_N    pELK_N  ...     

In [34]:
df_trisomy.info

<bound method DataFrame.info of       DYRK1A_N   ITSN1_N    BDNF_N     NR1_N    NR2A_N    pAKT_N   pBRAF_N  \
570   0.844219   0.94929   0.39273  1.170886  2.840236   0.25139  0.169143   
571   0.861588  0.965523  0.398587  1.181551  2.879162  0.251514  0.167592   
572   0.844082  0.967457  0.401478  1.194239  2.984791  0.254822  0.173592   
573   0.706132  0.821736  0.342455  1.065341  2.290964  0.265944  0.161957   
574   0.695193  0.842543  0.348495   1.08558  2.472372  0.268531  0.162624   
...        ...       ...       ...       ...       ...       ...       ...   
1090   0.25486  0.463591   0.25486  2.092082  2.600035  0.211736  0.171262   
1091  0.272198  0.474163  0.251638   2.16139  2.801492  0.251274  0.182496   
1092    0.2287  0.395179  0.234118  1.733184  2.220852  0.220665  0.161435   
1093  0.221242  0.412894  0.243974  1.876347  2.384088  0.208897  0.173623   
1094  0.302626  0.461059  0.256564   2.09279  2.594348  0.251001  0.191811   

     pCAMKII_N   pCREB_N    pEL

In [35]:
def data_split(data: pd.DataFrame, frac = 0.2, trainpath= str, testpath=str):
    """
    """
    test = data.sample(frac = frac, axis = 0)
    train = data.drop(index=test.index)
    train.to_csv(trainpath)
    test.to_csv(testpath)

In [36]:
train_path = "/home/miki/Desktop/Docker/iNeuron/mice-protien-expression/data/final/train.csv"
test_path = "/home/miki/Desktop/Docker/iNeuron/mice-protien-expression/data/final/test.csv"
data_split(result, trainpath=train_path, testpath=test_path)