In [147]:
# Import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [148]:
# Read in data
train = pd.read_csv("kidney_train.csv")
test = pd.read_csv("kidney_test.csv")

In [149]:
# Drop the first column from dataframes which are arbitrary
train.drop(['Unnamed: 0'], axis = 1, inplace = True)
test.drop(['Unnamed: 0'], axis = 1, inplace = True)
test.head()

Unnamed: 0,X226496_at,X219563_at,X208383_s_at,X220082_at,X225809_at,X206339_at,X227889_at,X210754_s_at,X232964_at,X223609_at,...,X209987_s_at,X225866_at,X219017_at,X225011_at,X214612_x_at,X213699_s_at,X225155_at,X1560500_at,X212337_at,X223206_s_at
0,663.3,517.1,11077.7,429.9,342.0,386.1,391.4,588.9,1346.1,82.6,...,24.3,1521.9,581.3,1713.9,137.5,17556.8,9184.4,648.6,2215.1,593.8
1,1747.1,343.6,14.8,181.6,432.2,232.0,1347.7,931.0,1693.5,173.9,...,88.1,1793.9,661.1,2142.9,46.3,19301.6,15380.4,1009.2,2670.4,969.4
2,1738.1,5589.2,4701.2,129.3,1105.4,59.8,1007.4,1165.6,1174.5,7.4,...,12.5,1206.1,705.9,4558.1,49.6,16533.5,6656.3,486.2,1156.2,589.3
3,915.7,1272.1,1068.8,276.6,223.2,161.7,469.3,1213.2,1056.1,55.8,...,45.3,1225.4,884.2,3009.8,2480.2,25535.8,5543.2,256.1,2727.3,2123.7
4,1784.6,465.0,356.4,200.8,118.2,136.2,659.0,995.4,1828.2,174.0,...,4.9,1407.0,1617.2,2829.1,19.6,13540.0,19490.8,301.1,2104.6,1338.4


In [150]:
# Convert Tissue column to binary
train["Tissue"] = train["Tissue"].apply(lambda x: 1 if x == "Kidney" else 0)

# Scaling

In [151]:
# Standard scaling using sklearn (In order to scale down the features to a certain range)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(train.drop('Tissue',axis=1))
scaled_features = scaler.fit_transform(train.drop('Tissue',axis=1))

train_scaled = pd.DataFrame(scaled_features,columns=train.columns[:-1])
train_scaled.head()

Unnamed: 0,X226496_at,X219563_at,X208383_s_at,X220082_at,X225809_at,X206339_at,X227889_at,X210754_s_at,X232964_at,X223609_at,...,X209987_s_at,X225866_at,X219017_at,X225011_at,X214612_x_at,X213699_s_at,X225155_at,X1560500_at,X212337_at,X223206_s_at
0,-2.311955,0.570401,-0.360641,-0.477985,-0.586033,-0.162167,1.900927,-0.22482,-1.456369,-0.252936,...,-0.37892,-0.890772,0.81242,-0.187542,-0.207168,-1.228387,-0.312708,-1.095564,-0.071645,0.057597
1,-1.589268,-0.71609,-0.263418,-0.429257,-0.20764,-0.121665,-0.200583,1.823334,-0.429771,-0.017315,...,0.270445,-0.411486,-0.26442,-0.932667,-0.228798,0.941127,-0.364276,-0.151492,0.603238,0.41325
2,0.856343,-0.910566,-0.271124,4.837861,1.682188,-0.140809,-0.408987,0.971239,-0.701478,-0.130276,...,-0.407519,1.062259,0.329711,2.921263,-0.250086,0.591574,-0.931114,-0.98894,-0.828754,0.266841
3,1.799742,-0.884409,-0.011917,-0.312511,-0.831881,-0.117053,-0.377324,-0.441309,-0.089803,-0.113745,...,0.91981,0.574965,-0.445127,-1.012806,-0.202578,1.98029,3.498522,0.496382,0.552897,-0.746525
4,0.118812,-0.281125,6.43719,-0.412551,0.698794,8.981795,-0.818888,-0.880131,0.073489,-0.181302,...,-0.498362,-1.018453,-0.26583,1.558394,-0.249891,0.096951,-0.24383,2.911861,-0.503211,-0.571984


I commented out the below section as it was hard to pull the relevant features from a PCA result.

In [152]:
# Implement Principal component analysis to drop down the component size to 10 from 50
# from sklearn.decomposition import PCA
# pca = PCA(n_components=10)
# pca.fit(train_scaled)
# x_pca = pca.transform(train_scaled)
# Check the original shape of the data
# train_scaled.shape

In [153]:
# Print the shape after PCA
# x_pca.shape

In [154]:
# Split the data into training and test sets for evaluation purposes
from sklearn.model_selection import train_test_split
X = train_scaled
y = train["Tissue"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

# Model Selection
* Decision Tree

In [155]:
# Train a decision tree classification model
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier()
dtree.fit(X_train,y_train)
predictions = dtree.predict(X_test)

In [156]:
# Import evaluation metrics
from sklearn.metrics import classification_report,confusion_matrix

In [157]:
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

          0       0.88      0.92      0.90        25
          1       0.50      0.40      0.44         5

avg / total       0.82      0.83      0.83        30



In [158]:
print(confusion_matrix(y_test,predictions))

[[23  2]
 [ 3  2]]


* Random Forest Classifier

In [159]:
# Train a random forest classification model
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=600)
rfc.fit(X_train,y_train)
predictions = rfc.predict(X_test)

In [160]:
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

          0       0.89      1.00      0.94        25
          1       1.00      0.40      0.57         5

avg / total       0.91      0.90      0.88        30



In [161]:
print(confusion_matrix(y_test,predictions))

[[25  0]
 [ 3  2]]


* Logistic Regression Model

In [162]:
# Train a logistic regression model and 
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)
predictions = logmodel.predict(X_test)

In [163]:
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

          0       0.93      1.00      0.96        25
          1       1.00      0.60      0.75         5

avg / total       0.94      0.93      0.93        30



In [164]:
print(confusion_matrix(y_test,predictions))

[[25  0]
 [ 2  3]]


# Feature Selection
* Recursive Feature Elimination

In [165]:
# Recursive Feature Elimination
from sklearn.feature_selection import RFE
rfe = RFE(logmodel, 20)
fit = rfe.fit(X, Y)
selected_features = fit.support_
print(selected_features)

[False  True  True False  True  True False  True  True  True False  True
 False False False False False False False  True  True False False  True
 False False False False False  True  True  True False  True  True False
  True False  True False False False False  True False  True False False
 False False]


In [166]:
# Convert the selected features to the index locations
def convert(ftr):
    list = []
    for i in range(len(ftr)):
        if (ftr[i] == True):
            list.append(i)
    print(list)
print(convert(selected_features))

[1, 2, 4, 5, 7, 8, 9, 11, 19, 20, 23, 29, 30, 31, 33, 34, 36, 38, 43, 45]
None


In [167]:
# Univariate Selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
X = train.drop('Tissue',axis=1)
Y = train["Tissue"]
# feature extraction
test = SelectKBest(score_func=chi2, k=20)
fit = test.fit(X, Y)
# summarize scores
np.set_printoptions(precision=3)
print(fit.scores_)
features = fit.transform(X)
# summarize selected features
print(features[0:5,:])

[3.797e+02 7.856e+01 1.396e+05 1.037e+02 1.702e+03 1.217e+04 5.066e+01
 1.343e+03 7.378e+03 2.079e+03 2.913e+03 4.169e+04 6.017e+02 1.025e+01
 2.941e+03 3.893e+02 2.016e+03 1.973e+01 8.249e+01 6.907e+02 2.121e+03
 3.262e+03 4.230e+01 5.707e+04 1.429e+03 3.586e+03 9.420e+02 3.919e+04
 7.132e+02 1.073e+02 2.172e+04 1.008e+04 5.026e+03 4.058e-02 9.906e+03
 1.539e+03 2.060e+04 2.477e+01 1.886e+04 8.996e+02 2.100e+02 1.965e+02
 2.514e+03 2.913e+03 6.827e+03 2.338e+04 5.478e+00 7.350e+02 4.395e+02
 2.025e+03]
[[2.780e+01 7.100e+00 2.100e+01 1.397e+03 1.851e+03 1.744e+03 2.362e+02
  9.127e+02 9.309e+02 4.217e+03 1.683e+04 3.878e+03 7.290e+03 3.898e+03
  6.190e+01 6.662e+03 1.574e+03 2.541e+03 1.079e+02 1.125e+04]
 [4.492e+02 2.284e+02 7.880e+02 7.704e+02 5.478e+02 1.644e+03 1.400e+03
  5.245e+02 1.034e+03 1.761e+04 1.140e+04 2.053e+03 4.252e+03 2.918e+03
  3.581e+02 7.282e+03 9.633e+02 1.814e+03 6.360e+01 2.399e+04]
 [4.158e+02 1.238e+02 5.850e+02 8.872e+03 2.027e+02 2.527e+03 2.049e+02
  8.1

# Conclusion
 * Here I decided to go with Random Forest Classifier model because it provides the highest accuracy for Kidney tissue which is the critical part of this experiment.
 * Also I decided to keep all the features for this experiment as the sample size is very low.



In [168]:
# Train a random forest classification model
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=600)
rfc.fit(X_train,y_train)
predictions = rfc.predict(X_test)
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

          0       0.89      1.00      0.94        25
          1       1.00      0.40      0.57         5

avg / total       0.91      0.90      0.88        30



In [169]:
print(confusion_matrix(y_test,predictions))

[[25  0]
 [ 3  2]]


In [170]:
predictions

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0])