## Dataset initialization

In [1]:
import pandas as pd

#import data sets in meth data
df0=pd.read_csv('KIRC_HumanMethylation450.data',sep='\t').set_index('sample').T
df1=pd.read_csv('KIRP_HumanMethylation450.data',sep='\t').set_index('sample').T
df2=pd.read_csv('KICH_HumanMethylation450.data',sep='\t').set_index('sample').T

In [2]:
df0.shape, df1.shape, df2.shape

((480, 485578), (321, 485578), (66, 485578))

## Preprocessing

### 1. Data Integration

In [3]:
df=pd.concat([df0, df1, df2])

In [4]:
df.shape

(867, 485578)


### 2. Data Cleaning

In [5]:
# number of duplicates
# df.duplicated().sum()

In [6]:
df.isnull().sum()

sample
cg13332474    0
cg00651829    0
cg17027195    0
cg09868354    0
cg03050183    2
             ..
cg10230711    0
cg16651827    0
cg18138552    0
cg07883722    0
Subtype       0
Length: 485578, dtype: int64

In [7]:
# Count the null values containing features
n=(df.isnull().sum()>0).sum()
n

110192

In [8]:
# Check the null values
df.isnull().sum().sort_values(ascending = False).head(n)

sample
cg20383654    867
cg01757760    867
cg23688350    867
cg24536120    867
cg01763947    867
             ... 
cg14467794      1
cg17784749      1
cg21821726      1
cg14872074      1
cg12753851      1
Length: 110192, dtype: int64

In [9]:
# drop the null values
df=df.dropna(axis=1)

In [10]:
df.shape

(867, 375386)

In [11]:
print('Count of zeros')
for column_name in df.columns:
    column = df[column_name]
    # Get the count of Zeros in column 
    count = (column == 0).sum()
    if(count>0):
        print('\t', column_name, ' : ', count)

Count of zeros
	 Subtype  :  480


### 3. Feature Selection - Mutual Information

In [23]:
X=df.drop(['Subtype'], axis=1)

In [24]:
y=df['Subtype']

In [15]:
# import necessary libraries
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest

fs = SelectKBest(score_func=mutual_info_classif, k=200)

X_selected = fs.fit_transform(X, y)

In [16]:
X_selected

array([[0.7114, 0.8402, 0.4888, ..., 0.7253, 0.7067, 0.6826],
       [0.6274, 0.4223, 0.6813, ..., 0.4296, 0.4939, 0.5001],
       [0.7269, 0.1851, 0.8087, ..., 0.5529, 0.6474, 0.5274],
       ...,
       [0.2831, 0.099 , 0.1407, ..., 0.2474, 0.0536, 0.2142],
       [0.3853, 0.1738, 0.2989, ..., 0.4532, 0.1122, 0.3444],
       [0.2216, 0.0568, 0.2301, ..., 0.2281, 0.0539, 0.2011]])

In [75]:
# extract columns
cols = [words[:-1] for words in fs.get_feature_names_out()]
# print(cols)

# add columns
X_selected = pd.DataFrame(X_selected, columns=cols)

fs.fit(X_selected,y)

SelectKBest(k=200, score_func=<function mutual_info_classif at 0x7f999254ed30>)

In [76]:
# view columns
X_selected.columns

Index(['cg133332', 'cg244856', 'cg014432', 'cg262806', 'cg145572', 'cg238561',
       'cg009859', 'cg162095', 'cg003188', 'cg134586',
       ...
       'cg218153', 'cg276524', 'cg182419', 'cg162484', 'cg243051', 'cg113405',
       'cg262427', 'cg080173', 'cg026701', 'cg189711'],
      dtype='object', length=200)

In [17]:
# view initial data records
X_selected.sample(6)

Unnamed: 0,cg1333326,cg2448569,cg0144328,cg2628069,cg1455721,cg2385613,cg0098598,cg1620951,cg0031889,cg1345860,...,cg2181533,cg2765249,cg1824196,cg1624843,cg2430515,cg1134053,cg2624275,cg0801732,cg0267012,cg1897117
387,0.7353,0.257,0.2541,0.5935,0.7758,0.1279,0.5843,0.8192,0.5449,0.7335,...,0.547,0.7547,0.0202,0.6369,0.3306,0.6317,0.7567,0.4299,0.8031,0.7806
787,0.8498,0.9071,0.9292,0.912,0.8821,0.6992,0.7176,0.9481,0.829,0.9128,...,0.8993,0.916,0.8797,0.8585,0.8213,0.9515,0.9053,0.9256,0.9678,0.9806
137,0.7221,0.123,0.779,0.8009,0.7303,0.0598,0.6685,0.3119,0.455,0.5265,...,0.3898,0.7034,0.4351,0.7109,0.5597,0.3748,0.7266,0.5778,0.6013,0.4699
791,0.7337,0.8348,0.8148,0.7263,0.756,0.5129,0.7959,0.9036,0.542,0.8112,...,0.892,0.7866,0.3966,0.6931,0.6274,0.8839,0.816,0.8684,0.8899,0.8166
186,0.7157,0.7525,0.4968,0.7795,0.49,0.2988,0.6773,0.8308,0.5448,0.7664,...,0.832,0.8099,0.049,0.6858,0.4686,0.8403,0.5328,0.6482,0.8385,0.661
753,0.8662,0.9269,0.9327,0.9105,0.8838,0.7752,0.8746,0.9508,0.8429,0.9558,...,0.9025,0.9421,0.859,0.8675,0.8216,0.958,0.9211,0.9446,0.9621,0.948


In [21]:
X_selected_backup=X_selected

In [26]:
X_selected['Subtype']=y.values

In [31]:
X_selected_backup=X_selected

In [38]:
X_selected.index=df.index

In [40]:
# store best 100 features in a seperate file
X_selected.to_csv("./HumanMethylation450_selected_200.csv", encoding='utf-8', index=True)