### Exploratory Data Analysis
---

#### Problem statement
---
##### > The goal is to find subsets of proteins that discriminate between the classes.

In [62]:
import os  # imported here because this cell runs before the notebook's import cell

# Location of the raw Excel workbook. Set the MICE_PROTEIN_RAW_DATA environment
# variable to point at a different copy; otherwise the original local path is used.
INPUTDATA = os.environ.get(
    "MICE_PROTEIN_RAW_DATA",
    "/home/miki/Desktop/Docker/iNeuron/mice-protien-expression/data/raw/Data_Cortex_Nuclear.xlsx",
)

In [63]:
import os
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Show plots in jupyter notebook
%matplotlib inline
# Set plot style
sns.set(color_codes=True)

In [64]:
def import_data(path: str = None, columns: list = None) -> pd.DataFrame:
    """
    Load an Excel workbook into a dataframe, optionally
    dropping some of its columns
    --------------------------------
    Parameters
    ----------
    path : str
        Location of the Excel file to read. Required, despite
        the ``None`` default kept for backward compatibility.
    columns : list, optional
        Column labels to drop after loading. ``None`` or an
        empty list keeps every column.
    --------------------------------
    Returns
    -------
    data : dataframe
        The loaded pandas dataframe, minus the dropped columns
    --------------------------------
    Raises
    ------
    ValueError
        If ``path`` is not provided.
    """
    # Fail with a clear message instead of letting read_excel choke on None.
    if path is None:
        raise ValueError("import_data requires a path to the Excel file")
    data = pd.read_excel(path)
    if columns:
        data = data.drop(columns=columns)
    return data

In [65]:
# Load the workbook and drop Genotype/Treatment/Behavior on the way in.
# NOTE(review): presumably these three are already encoded in the combined
# `class` label — confirm before relying on it.
col = ['Genotype', 'Treatment', 'Behavior']
df = import_data(INPUTDATA, col)
df.head(3)

  warn(msg)


Unnamed: 0,MouseID,DYRK1A_N,ITSN1_N,BDNF_N,NR1_N,NR2A_N,pAKT_N,pBRAF_N,pCAMKII_N,pCREB_N,...,BAD_N,BCL2_N,pS6_N,pCFOS_N,SYP_N,H3AcK18_N,EGR1_N,H3MeK4_N,CaNA_N,class
0,309_1,0.503644,0.747193,0.430175,2.816329,5.990152,0.21883,0.177565,2.373744,0.232224,...,0.122652,,0.106305,0.108336,0.427099,0.114783,0.13179,0.128186,1.675652,c-CS-m
1,309_2,0.514617,0.689064,0.41177,2.789514,5.685038,0.211636,0.172817,2.29215,0.226972,...,0.116682,,0.106592,0.104315,0.441581,0.111974,0.135103,0.131119,1.74361,c-CS-m
2,309_3,0.509183,0.730247,0.418309,2.687201,5.622059,0.209011,0.175722,2.283337,0.230247,...,0.118508,,0.108303,0.106219,0.435777,0.111883,0.133362,0.127431,1.926427,c-CS-m


In [66]:
# Dtypes and non-null counts per column, before any imputation.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1095 entries, 0 to 1094
Data columns (total 79 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   MouseID          1095 non-null   object 
 1   DYRK1A_N         1092 non-null   float64
 2   ITSN1_N          1092 non-null   float64
 3   BDNF_N           1092 non-null   float64
 4   NR1_N            1092 non-null   float64
 5   NR2A_N           1092 non-null   float64
 6   pAKT_N           1092 non-null   float64
 7   pBRAF_N          1092 non-null   float64
 8   pCAMKII_N        1092 non-null   float64
 9   pCREB_N          1092 non-null   float64
 10  pELK_N           1092 non-null   float64
 11  pERK_N           1092 non-null   float64
 12  pJNK_N           1092 non-null   float64
 13  PKCA_N           1092 non-null   float64
 14  pMEK_N           1092 non-null   float64
 15  pNR1_N           1092 non-null   float64
 16  pNR2A_N          1092 non-null   float64
 17  pNR2B_N       

In [67]:
# Summary statistics, transposed so that each column becomes a row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
DYRK1A_N,1092.0,0.429418,0.249792,0.145327,0.289128,0.368134,0.492861,2.516367
ITSN1_N,1092.0,0.619503,0.251025,0.245359,0.474159,0.568502,0.702111,2.602662
BDNF_N,1092.0,0.319422,0.049246,0.115181,0.288131,0.316923,0.348235,0.497160
NR1_N,1092.0,2.280357,0.373589,0.964543,2.050378,2.291166,2.526946,3.757641
NR2A_N,1092.0,3.822367,0.945597,1.711370,3.129370,3.722900,4.408332,8.482553
...,...,...,...,...,...,...,...,...
SYP_N,1080.0,0.446073,0.066432,0.258626,0.398082,0.448459,0.490773,0.759588
H3AcK18_N,900.0,0.169609,0.059402,0.079691,0.125848,0.158240,0.197876,0.479763
EGR1_N,870.0,0.183135,0.040406,0.105537,0.155121,0.174935,0.204542,0.360692
H3MeK4_N,810.0,0.205440,0.055514,0.101787,0.165143,0.193994,0.235215,0.413903


In [68]:
# Number of missing values in each column.
df.isnull().sum()

MouseID        0
DYRK1A_N       3
ITSN1_N        3
BDNF_N         3
NR1_N          3
            ... 
H3AcK18_N    195
EGR1_N       225
H3MeK4_N     285
CaNA_N        15
class          0
Length: 79, dtype: int64

In [69]:

def null_test(data: pd.DataFrame=None):
    """ 
    Fill missing values with the median of the
    column they belong to
    ---------
    Parameters
    data : pandas dataframe
        Frame to clean. It is modified in place and
        also returned.
    ----------
    Returns
    The same dataframe, with NaNs in numeric columns
    replaced by that column's median. Prints a message
    and returns the frame untouched when nothing is missing.
    -----------
    """

    if data.isnull().any().any():
        # Medians are taken from the frame being cleaned (not a notebook-level
        # global) and only over numeric columns, so text columns such as IDs
        # or labels neither warn nor raise.
        data.fillna(data.median(numeric_only=True), inplace=True)
    else:
        print("No missing values encountered")
    return data

In [70]:
# Impute missing values, then re-inspect the non-null counts.
df = null_test(df)
df.info();

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1095 entries, 0 to 1094
Data columns (total 79 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   MouseID          1095 non-null   object 
 1   DYRK1A_N         1095 non-null   float64
 2   ITSN1_N          1095 non-null   float64
 3   BDNF_N           1095 non-null   float64
 4   NR1_N            1095 non-null   float64
 5   NR2A_N           1095 non-null   float64
 6   pAKT_N           1095 non-null   float64
 7   pBRAF_N          1095 non-null   float64
 8   pCAMKII_N        1095 non-null   float64
 9   pCREB_N          1095 non-null   float64
 10  pELK_N           1095 non-null   float64
 11  pERK_N           1095 non-null   float64
 12  pJNK_N           1095 non-null   float64
 13  PKCA_N           1095 non-null   float64
 14  pMEK_N           1095 non-null   float64
 15  pNR1_N           1095 non-null   float64
 16  pNR2A_N          1095 non-null   float64
 17  pNR2B_N       

  data.fillna(df.median(), inplace=True)


In [71]:
# Confirm that no missing values remain after imputation.
df.isnull().sum()

MouseID      0
DYRK1A_N     0
ITSN1_N      0
BDNF_N       0
NR1_N        0
            ..
H3AcK18_N    0
EGR1_N       0
H3MeK4_N     0
CaNA_N       0
class        0
Length: 79, dtype: int64

In [72]:
# Classes:
# c-CS-s: control mice, stimulated to learn, injected with saline (9 mice)
# c-CS-m: control mice, stimulated to learn, injected with memantine (10 mice)
# c-SC-s: control mice, not stimulated to learn, injected with saline (9 mice)
# c-SC-m: control mice, not stimulated to learn, injected with memantine (10 mice)

# t-CS-s: trisomy mice, stimulated to learn, injected with saline (7 mice)
# t-CS-m: trisomy mice, stimulated to learn, injected with memantine (9 mice)
# t-SC-s: trisomy mice, not stimulated to learn, injected with saline (9 mice)
# t-SC-m: trisomy mice, not stimulated to learn, injected with memantine (9 mice)

In [73]:
# Peek at the first rows of the imputed frame.
df.head(5)

Unnamed: 0,MouseID,DYRK1A_N,ITSN1_N,BDNF_N,NR1_N,NR2A_N,pAKT_N,pBRAF_N,pCAMKII_N,pCREB_N,...,BAD_N,BCL2_N,pS6_N,pCFOS_N,SYP_N,H3AcK18_N,EGR1_N,H3MeK4_N,CaNA_N,class
0,309_1,0.503644,0.747193,0.430175,2.816329,5.990152,0.21883,0.177565,2.373744,0.232224,...,0.122652,0.129468,0.106305,0.108336,0.427099,0.114783,0.13179,0.128186,1.675652,c-CS-m
1,309_2,0.514617,0.689064,0.41177,2.789514,5.685038,0.211636,0.172817,2.29215,0.226972,...,0.116682,0.129468,0.106592,0.104315,0.441581,0.111974,0.135103,0.131119,1.74361,c-CS-m
2,309_3,0.509183,0.730247,0.418309,2.687201,5.622059,0.209011,0.175722,2.283337,0.230247,...,0.118508,0.129468,0.108303,0.106219,0.435777,0.111883,0.133362,0.127431,1.926427,c-CS-m
3,309_4,0.442107,0.617076,0.358626,2.466947,4.979503,0.222886,0.176463,2.152301,0.207004,...,0.132781,0.129468,0.103184,0.111262,0.391691,0.130405,0.147444,0.146901,1.700563,c-CS-m
4,309_5,0.43494,0.61743,0.358802,2.365785,4.718679,0.213106,0.173627,2.134014,0.192158,...,0.129954,0.129468,0.104784,0.110694,0.434154,0.118481,0.140314,0.14838,1.83973,c-CS-m


In [74]:
# Remove the per-sample identifier; `da` is displayed as the cell output.
da = df.drop(columns=['MouseID'])
da

Unnamed: 0,DYRK1A_N,ITSN1_N,BDNF_N,NR1_N,NR2A_N,pAKT_N,pBRAF_N,pCAMKII_N,pCREB_N,pELK_N,...,BAD_N,BCL2_N,pS6_N,pCFOS_N,SYP_N,H3AcK18_N,EGR1_N,H3MeK4_N,CaNA_N,class
0,0.503644,0.747193,0.430175,2.816329,5.990152,0.218830,0.177565,2.373744,0.232224,1.750936,...,0.122652,0.129468,0.106305,0.108336,0.427099,0.114783,0.131790,0.128186,1.675652,c-CS-m
1,0.514617,0.689064,0.411770,2.789514,5.685038,0.211636,0.172817,2.292150,0.226972,1.596377,...,0.116682,0.129468,0.106592,0.104315,0.441581,0.111974,0.135103,0.131119,1.743610,c-CS-m
2,0.509183,0.730247,0.418309,2.687201,5.622059,0.209011,0.175722,2.283337,0.230247,1.561316,...,0.118508,0.129468,0.108303,0.106219,0.435777,0.111883,0.133362,0.127431,1.926427,c-CS-m
3,0.442107,0.617076,0.358626,2.466947,4.979503,0.222886,0.176463,2.152301,0.207004,1.595086,...,0.132781,0.129468,0.103184,0.111262,0.391691,0.130405,0.147444,0.146901,1.700563,c-CS-m
4,0.434940,0.617430,0.358802,2.365785,4.718679,0.213106,0.173627,2.134014,0.192158,1.504230,...,0.129954,0.129468,0.104784,0.110694,0.434154,0.118481,0.140314,0.148380,1.839730,c-CS-m
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1090,0.254860,0.463591,0.254860,2.092082,2.600035,0.211736,0.171262,2.483740,0.207317,1.057971,...,0.190483,0.129468,0.115806,0.183324,0.374088,0.318782,0.204660,0.328327,1.364823,t-SC-s
1091,0.272198,0.474163,0.251638,2.161390,2.801492,0.251274,0.182496,2.512737,0.216339,1.081150,...,0.190463,0.129468,0.113614,0.175674,0.375259,0.325639,0.200415,0.293435,1.364478,t-SC-s
1092,0.228700,0.395179,0.234118,1.733184,2.220852,0.220665,0.161435,1.989723,0.185164,0.884342,...,0.216682,0.129468,0.118948,0.158296,0.422121,0.321306,0.229193,0.355213,1.430825,t-SC-s
1093,0.221242,0.412894,0.243974,1.876347,2.384088,0.208897,0.173623,2.086028,0.192044,0.922595,...,0.222263,0.129468,0.125295,0.196296,0.397676,0.335936,0.251317,0.365353,1.404031,t-SC-s


In [75]:
# Column names remaining after MouseID was dropped.
list(da.columns)

['DYRK1A_N',
 'ITSN1_N',
 'BDNF_N',
 'NR1_N',
 'NR2A_N',
 'pAKT_N',
 'pBRAF_N',
 'pCAMKII_N',
 'pCREB_N',
 'pELK_N',
 'pERK_N',
 'pJNK_N',
 'PKCA_N',
 'pMEK_N',
 'pNR1_N',
 'pNR2A_N',
 'pNR2B_N',
 'pPKCAB_N',
 'pRSK_N',
 'AKT_N',
 'BRAF_N',
 'CAMKII_N',
 'CREB_N',
 'ELK_N',
 'ERK_N',
 'GSK3B_N',
 'JNK_N',
 'MEK_N',
 'TRKA_N',
 'RSK_N',
 'APP_N',
 'Bcatenin_N',
 'SOD1_N',
 'MTOR_N',
 'P38_N',
 'pMTOR_N',
 'DSCR1_N',
 'AMPKA_N',
 'NR2B_N',
 'pNUMB_N',
 'RAPTOR_N',
 'TIAM1_N',
 'pP70S6_N',
 'NUMB_N',
 'P70S6_N',
 'pGSK3B_N',
 'pPKCG_N',
 'CDK5_N',
 'S6_N',
 'ADARB1_N',
 'AcetylH3K9_N',
 'RRP1_N',
 'BAX_N',
 'ARC_N',
 'ERBB4_N',
 'nNOS_N',
 'Tau_N',
 'GFAP_N',
 'GluR3_N',
 'GluR4_N',
 'IL1B_N',
 'P3525_N',
 'pCASP9_N',
 'PSD95_N',
 'SNCA_N',
 'Ubiquitin_N',
 'pGSK3B_Tyr216_N',
 'SHH_N',
 'BAD_N',
 'BCL2_N',
 'pS6_N',
 'pCFOS_N',
 'SYP_N',
 'H3AcK18_N',
 'EGR1_N',
 'H3MeK4_N',
 'CaNA_N',
 'class']

In [76]:
import scipy.stats as stats
#find Q1, Q3, and interquartile range for each column
def outlier_remove(x = None, columns = None, factor = 1.5):
    """
    Drop every row that holds an IQR outlier in any of the
    selected columns
    ---------
    Parameters
    x : pandas dataframe
        Frame to filter. It is not modified.
    columns : list or str, optional
        Column(s) to screen. Defaults to all numeric columns of ``x``.
    factor : float, optional
        Multiple of the IQR added beyond Q1/Q3 to form the fences.
        The default 1.5 matches the previous hard-coded value.
    ----------
    Returns
    The rows of ``x`` (all columns, original index) whose screened
    values all lie within [Q1 - factor*IQR, Q3 + factor*IQR].
    ----------
    Raises
    TypeError if ``x`` is not provided.
    -----------
    """
    if x is None:
        raise TypeError("outlier_remove requires a dataframe")
    if columns is None:
        columns = x.select_dtypes(include="number").columns
    elif isinstance(columns, str):
        # A single name would select a Series; keep it two-dimensional.
        columns = [columns]

    screened = x[columns]
    # Per-column quartiles and fences.
    q1 = screened.quantile(q=.25)
    q3 = screened.quantile(q=.75)
    iqr = q3 - q1
    lower = q1 - factor * iqr
    upper = q3 + factor * iqr

    # A row is kept only if none of its screened values falls outside the
    # fences. Comparisons with NaN are False, so NaN never marks an outlier.
    keep = ~((screened < lower) | (screened > upper)).any(axis=1)
    return x[keep]

In [77]:
# Screen every feature column of `da` (everything except the `class` label).
# The names are derived from the frame itself rather than a hand-copied list,
# so they cannot drift out of sync with the data.
columns = [name for name in da.columns if name != 'class']
out_da = outlier_remove(da, columns)

In [78]:
# Rows that survived the outlier filter.
out_da

Unnamed: 0,DYRK1A_N,ITSN1_N,BDNF_N,NR1_N,NR2A_N,pAKT_N,pBRAF_N,pCAMKII_N,pCREB_N,pELK_N,...,BAD_N,BCL2_N,pS6_N,pCFOS_N,SYP_N,H3AcK18_N,EGR1_N,H3MeK4_N,CaNA_N,class
0,0.503644,0.747193,0.430175,2.816329,5.990152,0.218830,0.177565,2.373744,0.232224,1.750936,...,0.122652,0.129468,0.106305,0.108336,0.427099,0.114783,0.131790,0.128186,1.675652,c-CS-m
1,0.514617,0.689064,0.411770,2.789514,5.685038,0.211636,0.172817,2.292150,0.226972,1.596377,...,0.116682,0.129468,0.106592,0.104315,0.441581,0.111974,0.135103,0.131119,1.743610,c-CS-m
2,0.509183,0.730247,0.418309,2.687201,5.622059,0.209011,0.175722,2.283337,0.230247,1.561316,...,0.118508,0.129468,0.108303,0.106219,0.435777,0.111883,0.133362,0.127431,1.926427,c-CS-m
3,0.442107,0.617076,0.358626,2.466947,4.979503,0.222886,0.176463,2.152301,0.207004,1.595086,...,0.132781,0.129468,0.103184,0.111262,0.391691,0.130405,0.147444,0.146901,1.700563,c-CS-m
4,0.434940,0.617430,0.358802,2.365785,4.718679,0.213106,0.173627,2.134014,0.192158,1.504230,...,0.129954,0.129468,0.104784,0.110694,0.434154,0.118481,0.140314,0.148380,1.839730,c-CS-m
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1054,0.430795,0.667131,0.425734,2.637525,5.378416,0.294028,0.216346,6.951164,0.264423,1.534666,...,0.158648,0.131295,0.139711,0.113200,0.471244,0.167415,0.172605,0.185299,1.171693,t-SC-s
1056,0.382195,0.603457,0.383499,2.470895,4.552258,0.289092,0.201859,6.381706,0.244742,1.401761,...,0.170716,0.111824,0.128082,0.121218,0.474573,0.147954,0.160148,0.191672,1.175504,t-SC-s
1057,0.399864,0.637814,0.405295,2.652580,5.181602,0.304650,0.208079,6.666327,0.253904,1.527665,...,0.172736,0.121915,0.133566,0.116320,0.481607,0.157967,0.182736,0.206862,1.196037,t-SC-s
1060,0.382149,0.595792,0.399101,2.527574,4.629493,0.317810,0.213235,6.219567,0.232639,1.495507,...,0.184612,0.133317,0.132591,0.122308,0.507017,0.172393,0.200097,0.221994,1.141423,t-SC-s


In [79]:
def balanced_data(data: pd.DataFrame, random_state=None):
    """
    Split a dataframe into features and labels and
    oversample it with SMOTE, printing the resulting
    class distribution
    ---------
    Parameters
    data : pandas dataframe
        Feature columns followed by the class label in
        the LAST column.
    random_state : int, optional
        Seed forwarded to SMOTE for reproducible resampling.
        The default ``None`` keeps the previous unseeded behaviour.
    ----------
    Returns
    X, y : the resampled features and labels, exactly as
    returned by ``SMOTE.fit_resample``
    -----------
    """
    from collections import Counter
    from imblearn.over_sampling import SMOTE

    values = data.values
    # Everything but the last column is a feature; the last column is the label.
    features, labels = values[:, :-1], values[:, -1]
    features, labels = SMOTE(random_state=random_state).fit_resample(features, labels)

    total = len(labels)
    for label, count in Counter(labels).items():
        print('Class=%s, n=%s (%.3f%%)' % (label, count, count / total * 100))
    return features, labels

    
     

In [80]:
# Resample the filtered data; balanced_data prints the per-class counts.
X, y = balanced_data(out_da)

Class=c-CS-m, n=100 (12.500%)
Class=c-SC-m, n=100 (12.500%)
Class=c-CS-s, n=100 (12.500%)
Class=c-SC-s, n=100 (12.500%)
Class=t-CS-m, n=100 (12.500%)
Class=t-SC-m, n=100 (12.500%)
Class=t-CS-s, n=100 (12.500%)
Class=t-SC-s, n=100 (12.500%)


In [81]:
# Labels returned by the resampling step.
y

array(['c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m',
       'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m',
       'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m',
       'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m',
       'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m',
       'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m',
       'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m',
       'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m',
       'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m',
       'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m',
       'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m',
       'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m',
       'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m',
       'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m',
       'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS-m', 'c-CS

In [82]:
# Wrap the resampled feature array in a labelled frame. The column names come
# from the frame that was passed to balanced_data (all columns except the
# trailing label) instead of a second hand-copied list.
# NOTE: this rebinds `df`, which until now held the imputed raw data.
df = pd.DataFrame(X, columns=list(out_da.columns[:-1]))

In [83]:
# Resampled features as a dataframe.
df 

Unnamed: 0,DYRK1A_N,ITSN1_N,BDNF_N,NR1_N,NR2A_N,pAKT_N,pBRAF_N,pCAMKII_N,pCREB_N,pELK_N,...,SHH_N,BAD_N,BCL2_N,pS6_N,pCFOS_N,SYP_N,H3AcK18_N,EGR1_N,H3MeK4_N,CaNA_N
0,0.503644,0.747193,0.430175,2.816329,5.990152,0.218830,0.177565,2.373744,0.232224,1.750936,...,0.188852,0.122652,0.129468,0.106305,0.108336,0.427099,0.114783,0.131790,0.128186,1.675652
1,0.514617,0.689064,0.411770,2.789514,5.685038,0.211636,0.172817,2.292150,0.226972,1.596377,...,0.200404,0.116682,0.129468,0.106592,0.104315,0.441581,0.111974,0.135103,0.131119,1.743610
2,0.509183,0.730247,0.418309,2.687201,5.622059,0.209011,0.175722,2.283337,0.230247,1.561316,...,0.193685,0.118508,0.129468,0.108303,0.106219,0.435777,0.111883,0.133362,0.127431,1.926427
3,0.442107,0.617076,0.358626,2.466947,4.979503,0.222886,0.176463,2.152301,0.207004,1.595086,...,0.192112,0.132781,0.129468,0.103184,0.111262,0.391691,0.130405,0.147444,0.146901,1.700563
4,0.434940,0.617430,0.358802,2.365785,4.718679,0.213106,0.173627,2.134014,0.192158,1.504230,...,0.205604,0.129954,0.129468,0.104784,0.110694,0.434154,0.118481,0.140314,0.148380,1.839730
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,0.391371,0.621326,0.315093,2.100006,3.279238,0.235904,0.200477,3.195154,0.222292,1.219100,...,0.217060,0.157950,0.111824,0.119654,0.115069,0.439544,0.150943,0.148158,0.192194,1.478935
796,0.239972,0.419969,0.208695,1.515668,2.003866,0.167836,0.137255,1.774287,0.158244,0.790943,...,0.225064,0.192606,0.132316,0.109677,0.126086,0.327594,0.196506,0.196010,0.211881,1.019969
797,0.365591,0.572986,0.297292,2.005589,2.791422,0.244081,0.189986,2.949005,0.217188,1.172087,...,0.198535,0.158657,0.136461,0.120638,0.121115,0.458330,0.147485,0.141477,0.212982,1.535949
798,0.444061,0.567609,0.371142,2.231136,3.347213,0.264013,0.212638,5.416038,0.234061,1.292121,...,0.244222,0.176097,0.147626,0.115801,0.149439,0.470907,0.170620,0.180922,0.220740,1.266460


In [84]:
# Wrap the resampled labels in a single-column frame named 'Class'.
Y = pd.DataFrame(y, columns = ['Class'])

In [85]:
# Resampled labels as a dataframe.
Y 

Unnamed: 0,Class
0,c-CS-m
1,c-CS-m
2,c-CS-m
3,c-CS-m
4,c-CS-m
...,...
795,t-SC-s
796,t-SC-s
797,t-SC-s
798,t-SC-s


In [86]:
# Join features and labels side by side on their shared index; both frames
# were built from the arrays returned by balanced_data.
result = pd.concat([df, Y], axis=1, join='inner')

In [87]:
# Final feature + label table.
result

Unnamed: 0,DYRK1A_N,ITSN1_N,BDNF_N,NR1_N,NR2A_N,pAKT_N,pBRAF_N,pCAMKII_N,pCREB_N,pELK_N,...,BAD_N,BCL2_N,pS6_N,pCFOS_N,SYP_N,H3AcK18_N,EGR1_N,H3MeK4_N,CaNA_N,Class
0,0.503644,0.747193,0.430175,2.816329,5.990152,0.218830,0.177565,2.373744,0.232224,1.750936,...,0.122652,0.129468,0.106305,0.108336,0.427099,0.114783,0.131790,0.128186,1.675652,c-CS-m
1,0.514617,0.689064,0.411770,2.789514,5.685038,0.211636,0.172817,2.292150,0.226972,1.596377,...,0.116682,0.129468,0.106592,0.104315,0.441581,0.111974,0.135103,0.131119,1.743610,c-CS-m
2,0.509183,0.730247,0.418309,2.687201,5.622059,0.209011,0.175722,2.283337,0.230247,1.561316,...,0.118508,0.129468,0.108303,0.106219,0.435777,0.111883,0.133362,0.127431,1.926427,c-CS-m
3,0.442107,0.617076,0.358626,2.466947,4.979503,0.222886,0.176463,2.152301,0.207004,1.595086,...,0.132781,0.129468,0.103184,0.111262,0.391691,0.130405,0.147444,0.146901,1.700563,c-CS-m
4,0.434940,0.617430,0.358802,2.365785,4.718679,0.213106,0.173627,2.134014,0.192158,1.504230,...,0.129954,0.129468,0.104784,0.110694,0.434154,0.118481,0.140314,0.148380,1.839730,c-CS-m
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,0.391371,0.621326,0.315093,2.100006,3.279238,0.235904,0.200477,3.195154,0.222292,1.219100,...,0.157950,0.111824,0.119654,0.115069,0.439544,0.150943,0.148158,0.192194,1.478935,t-SC-s
796,0.239972,0.419969,0.208695,1.515668,2.003866,0.167836,0.137255,1.774287,0.158244,0.790943,...,0.192606,0.132316,0.109677,0.126086,0.327594,0.196506,0.196010,0.211881,1.019969,t-SC-s
797,0.365591,0.572986,0.297292,2.005589,2.791422,0.244081,0.189986,2.949005,0.217188,1.172087,...,0.158657,0.136461,0.120638,0.121115,0.458330,0.147485,0.141477,0.212982,1.535949,t-SC-s
798,0.444061,0.567609,0.371142,2.231136,3.347213,0.264013,0.212638,5.416038,0.234061,1.292121,...,0.176097,0.147626,0.115801,0.149439,0.470907,0.170620,0.180922,0.220740,1.266460,t-SC-s


In [88]:
# Persist the final table.
# NOTE(review): absolute local path; index=False is not passed, so the row
# index is written as an extra unnamed first column.
result.to_csv("/home/miki/Desktop/Docker/iNeuron/mice-protien-expression/data/final/data.csv")

In [89]:
# Rows whose class label starts with 't' (the trisomy classes, t-*).
df_trisomy = result[result['Class'].str.startswith('t')]

In [90]:
# Rows whose class label starts with 'c' (the control classes, c-*).
df_control = result[result['Class'].str.startswith('c')]

In [91]:
# First rows of the trisomy subset.
df_trisomy.head()

Unnamed: 0,DYRK1A_N,ITSN1_N,BDNF_N,NR1_N,NR2A_N,pAKT_N,pBRAF_N,pCAMKII_N,pCREB_N,pELK_N,...,BAD_N,BCL2_N,pS6_N,pCFOS_N,SYP_N,H3AcK18_N,EGR1_N,H3MeK4_N,CaNA_N,Class
310,0.731738,0.944241,0.324365,2.320243,3.62125,0.198649,0.16224,2.257041,0.203458,1.758415,...,0.115779,0.129468,0.104487,0.126523,0.3884,0.117319,0.115926,0.133304,1.795498,t-CS-m
311,0.668978,0.903673,0.312435,2.291341,3.594778,0.20711,0.169937,2.179377,0.198259,1.666027,...,0.115131,0.129468,0.101062,0.126523,0.370562,0.113982,0.106135,0.136185,1.811944,t-CS-m
312,0.776663,0.916962,0.327057,2.407933,3.627103,0.209446,0.173021,2.271338,0.210681,1.785769,...,0.112108,0.129468,0.097347,0.126523,0.360733,0.123203,0.118572,0.142981,1.750796,t-CS-m
313,0.646086,0.942658,0.313107,2.259709,3.69463,0.217536,0.170813,2.274727,0.211013,1.636984,...,0.118344,0.129468,0.099226,0.126523,0.34729,0.106586,0.11041,0.136889,1.739987,t-CS-m
314,0.60282,0.835561,0.301044,2.133126,3.064457,0.215345,0.184765,1.985534,0.204908,1.703168,...,0.138234,0.129468,0.103774,0.126523,0.367925,0.123559,0.141378,0.154219,1.664832,t-CS-m


In [92]:
# Persist the trisomy subset.
# NOTE(review): absolute local path; the row index is written as well.
df_trisomy.to_csv('/home/miki/Desktop/Docker/iNeuron/mice-protien-expression/data/final/trisomy.csv')

In [93]:
# Persist the control subset.
# NOTE(review): absolute local path; the row index is written as well.
df_control.to_csv('/home/miki/Desktop/Docker/iNeuron/mice-protien-expression/data/final/control.csv')