In [55]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import LeaveOneOut
from sklearn.feature_selection import RFE
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from numpy import mean
from numpy import std
cv = LeaveOneOut()

### Reading in the data

In [2]:
dfTrain = pd.read_csv("PanCancer_Train.csv")

In [3]:
dfTrain.head()

Unnamed: 0,Type,ENSG00000242268.2,ENSG00000270112.3,ENSG00000167578.15,ENSG00000273842.1,ENSG00000078237.5,ENSG00000146083.10,ENSG00000225275.4,ENSG00000158486.12,ENSG00000198242.12,...,ENSG00000238244.3,ENSG00000186115.11,ENSG00000216352.1,ENSG00000267117.1,ENSG00000273233.1,ENSG00000105063.17,ENSG00000231119.2,ENSG00000280861.1,ENSG00000123685.7,ENSG00000181518.3
0,BRCA,0.0,0.003233,2.697767,0.0,4.903484,9.750896,0.0,0.014376,72.009991,...,0,0.063184,0.0,0.0,0.0,16.606613,0.082307,0.0,0.170743,0.0
1,BRCA,0.0,0.003404,3.739011,0.0,2.789155,15.336983,0.0,0.050073,143.640545,...,0,0.027722,0.0,0.276519,0.040894,12.879618,0.12519,0.0,0.488011,0.0
2,BRCA,0.027791,0.0,2.936954,0.0,5.03247,16.698753,0.0,0.015828,191.154543,...,0,0.326559,0.0,0.044743,0.0,14.225543,0.207243,0.0,1.629158,0.0
3,BRCA,0.0,0.003093,4.916624,0.0,2.371734,15.765749,0.0,0.038087,161.635422,...,0,0.055411,0.0,0.107668,0.0,12.966195,0.297472,0.0,0.717565,0.011891
4,BRCA,0.068965,0.007177,5.276458,0.0,3.397425,16.484607,0.0,0.009819,101.657934,...,0,0.0,0.0,0.055516,0.143677,20.231642,0.169172,0.0,0.2617,0.0


### EDA

In [4]:
dfTrain.columns

Index(['Type', 'ENSG00000242268.2', 'ENSG00000270112.3', 'ENSG00000167578.15',
       'ENSG00000273842.1', 'ENSG00000078237.5', 'ENSG00000146083.10',
       'ENSG00000225275.4', 'ENSG00000158486.12', 'ENSG00000198242.12',
       ...
       'ENSG00000238244.3', 'ENSG00000186115.11', 'ENSG00000216352.1',
       'ENSG00000267117.1', 'ENSG00000273233.1', 'ENSG00000105063.17',
       'ENSG00000231119.2', 'ENSG00000280861.1', 'ENSG00000123685.7',
       'ENSG00000181518.3'],
      dtype='object', length=60484)

In [5]:
dfTrain.shape

(540, 60484)

In [6]:
dfTrain.dtypes

Type                   object
ENSG00000242268.2     float64
ENSG00000270112.3     float64
ENSG00000167578.15    float64
ENSG00000273842.1     float64
                       ...   
ENSG00000105063.17    float64
ENSG00000231119.2     float64
ENSG00000280861.1     float64
ENSG00000123685.7     float64
ENSG00000181518.3     float64
Length: 60484, dtype: object

In [7]:
dfTrain['Type'].unique()

array(['BRCA', 'CPTAC', 'KIRC', 'LUAD', 'THCA', 'UCEC', 'LUSC', 'HNSC',
       'PRAD'], dtype=object)

In [8]:
pd.get_dummies(dfTrain['Type'])

Unnamed: 0,BRCA,CPTAC,HNSC,KIRC,LUAD,LUSC,PRAD,THCA,UCEC
0,1,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
535,0,0,0,0,0,0,1,0,0
536,0,0,0,0,0,0,1,0,0
537,0,0,0,0,0,0,1,0,0
538,0,0,0,0,0,0,1,0,0


### Editing Target Column

In [9]:
dfTrain['Type'] = dfTrain['Type'].replace('BRCA', 1)
dfTrain['Type'] = dfTrain['Type'].replace('CPTAC',2)
dfTrain['Type'] = dfTrain['Type'].replace('HNSC',3)
dfTrain['Type'] = dfTrain['Type'].replace('KIRC',4)
dfTrain['Type'] = dfTrain['Type'].replace('LUAD',5)
dfTrain['Type'] = dfTrain['Type'].replace('LUSC',6)
dfTrain['Type'] = dfTrain['Type'].replace('PRAD',7)
dfTrain['Type'] = dfTrain['Type'].replace('THCA',8)
dfTrain['Type'] = dfTrain['Type'].replace('UCEC',9)
dfTrain['Type'].unique()

array([1, 2, 4, 5, 8, 9, 6, 3, 7])

### Dropping Columns

In [35]:
### drop columns
# Set the threshold for the number of zeros
threshold = 50
# Count the number of zeros in each column
zeros_count = (dfTrain == 0).sum()

# Filter out columns with zeros greater than or equal to the threshold
columns_to_keep = zeros_count[zeros_count < threshold].index
dfTrain = pd.DataFrame(dfTrain[columns_to_keep])
dfTrain

Unnamed: 0,Type,ENSG00000167578.15,ENSG00000078237.5,ENSG00000146083.10,ENSG00000158486.12,ENSG00000198242.12,ENSG00000259883.1,ENSG00000134108.11,ENSG00000263089.1,ENSG00000172137.17,...,ENSG00000146587.16,ENSG00000173930.8,ENSG00000107863.15,ENSG00000213782.6,ENSG00000146707.13,ENSG00000135094.9,ENSG00000009694.12,ENSG00000105063.17,ENSG00000231119.2,ENSG00000123685.7
0,1,2.697767,4.903484,9.750896,0.014376,72.009991,0.085830,52.731004,0.046374,2.059473,...,4.033736,0.065133,7.380174,0.236340,6.542841,4.490928,0.030789,16.606613,0.082307,0.170743
1,1,3.739011,2.789155,15.336983,0.050073,143.640545,0.158165,37.671767,0.065109,1.329824,...,3.394032,0.124107,5.635569,0.632537,6.964534,3.164238,0.040526,12.879618,0.125190,0.488011
2,1,2.936954,5.032470,16.698753,0.015828,191.154543,0.665402,29.928651,0.027655,5.480390,...,2.701772,0.107278,4.485938,0.606829,5.195348,11.959812,0.088744,14.225543,0.207243,1.629158
3,1,4.916624,2.371734,15.765749,0.038087,161.635422,0.225811,32.331711,0.059154,0.604094,...,3.465626,0.106821,5.251576,0.480471,4.873200,1.595363,0.057683,12.966195,0.297472,0.717565
4,1,5.276458,3.397425,16.484607,0.009819,101.657934,0.063508,40.698078,0.020016,1.919211,...,3.554538,0.029834,6.273163,0.264742,1.611557,3.575458,0.018035,20.231642,0.169172,0.261700
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
535,7,2.128338,1.565206,13.308721,0.010351,176.646677,0.089265,25.361359,0.064306,0.050516,...,3.577143,0.000000,3.604479,0.525735,2.810634,0.726929,1.272830,11.928856,0.323383,0.329785
536,7,1.103009,2.462574,8.837613,0.054563,131.227182,0.000000,22.853671,0.022881,0.843589,...,4.147016,0.000000,4.268006,0.181393,13.755055,1.661863,0.792468,7.350502,0.198539,0.373082
537,7,5.936175,1.359393,13.251654,0.016818,175.340011,0.090647,22.627778,0.065301,0.418592,...,2.807591,0.002620,3.495427,0.346669,4.048352,1.208983,2.355269,10.126033,0.811310,0.324584
538,7,2.028798,2.238101,11.950354,0.015075,206.885227,0.048752,21.495184,0.079021,0.364179,...,3.248417,0.010570,5.067677,0.331875,2.484300,0.718992,0.236089,10.748713,0.280505,0.443349


### Splitting the Data and Normlization

In [36]:
y= dfTrain["Type"]
# y
X= dfTrain.iloc[:,1:]
# X

In [37]:
# Z-score normalization (standardization)
normalized_X = (X - X.mean()) / X.std()
normalized_X.head()

Unnamed: 0,ENSG00000167578.15,ENSG00000078237.5,ENSG00000146083.10,ENSG00000158486.12,ENSG00000198242.12,ENSG00000259883.1,ENSG00000134108.11,ENSG00000263089.1,ENSG00000172137.17,ENSG00000167700.7,...,ENSG00000146587.16,ENSG00000173930.8,ENSG00000107863.15,ENSG00000213782.6,ENSG00000146707.13,ENSG00000135094.9,ENSG00000009694.12,ENSG00000105063.17,ENSG00000231119.2,ENSG00000123685.7
0,-0.401494,0.476532,-0.163051,-0.273824,-0.979357,-0.343433,1.964887,-0.333111,-0.0769,0.004062,...,0.080942,-0.45856,0.024489,-0.606802,0.73597,0.464803,-0.43399,0.425802,-0.59027,-0.737756
1,0.101306,-0.533404,0.918369,-0.250538,-0.126508,-0.263174,0.691719,-0.248605,-0.131373,-0.207575,...,-0.217004,-0.452364,-0.250659,0.024499,0.890448,0.084412,-0.431922,-0.219074,-0.559501,-0.459638
2,-0.285995,0.538144,1.181996,-0.272877,0.439204,0.299635,0.037084,-0.417543,0.178495,-0.166694,...,-0.539428,-0.454132,-0.431972,-0.016464,0.242344,2.606298,-0.42168,0.013809,-0.500626,0.540699
3,0.669955,-0.73279,1.001374,-0.258356,0.087743,-0.188117,0.240249,-0.275466,-0.185554,0.690521,...,-0.183659,-0.45418,-0.31122,-0.217803,0.124332,-0.365419,-0.428278,-0.204094,-0.435884,-0.258409
4,0.843713,-0.242856,1.140539,-0.276796,-0.626362,-0.3682,0.947575,-0.451999,-0.087371,-0.285482,...,-0.142247,-0.462268,-0.150102,-0.561546,-1.070503,0.202318,-0.436699,1.053035,-0.527943,-0.658023


In [38]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=13)

In [39]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train,y_train)
# ?scaler.fit_transform

In [40]:
# X_test = scaler.fit(X_test)
X_train = pd.DataFrame(X_train)

### Logistic Models

In [57]:
model = LogisticRegression(penalty='l2', C=1, solver='liblinear', max_iter=100)
model.fit(X_train,y_train)

In [58]:
y_pred = model.predict(X_test)
# X_test
# # Define the pipeline steps
# steps = [
#     ('scaler', StandardScaler()),  # Scaling step
#     ('classifier', LogisticRegression())  # Classifier step
# ]

# # Create the pipeline
# pipeline = Pipeline(steps)

# # Fit the pipeline on your training data
# pipeline.fit(X_train, y_train)

# # Make predictions on the test data using the pipeline
# y_pred = pipeline.predict(X_test)



In [59]:
accuracy_score(y_test,y_pred)

0.75

In [60]:
r2_score(y_test,y_pred)

0.6278238672622845

In [49]:
# # evaluate model
# scores = cross_val_score(model,X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
# # report performance
# print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))
# print(scores)

With Logistic regression, I was able to get an accuracy of 80.56% with an r2 score of 72.3%.
This shows that we have some promising model.

In [None]:
# scores = cross_val_score(model,X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
# # report performance
# print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))
# print(scores)

### LDA

In [None]:
lda = LinearDiscriminantAnalysis()
X_train = lda.fit_transform(X_train,y_train)
# X_test = lda.fit(X_test, y_test)

In [None]:
X_test = lda.transform(X_test)

In [None]:
# y_pred = lda.predict(X_test)

### Vector

In [50]:
svm = SVC(C=1, kernel='linear')

In [51]:
svm.fit(X_train,y_train)

In [52]:
y_pred = svm.predict(X_test)




In [53]:
accuracy_score(y_test,y_pred)

0.7129629629629629

In [54]:
confusion_matrix(y_test,y_pred)

array([[ 6,  0,  0,  4,  0,  0,  0,  0,  0],
       [ 0, 14,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0, 11,  1,  0,  2,  0,  0,  0],
       [ 0,  0,  0, 10,  0,  1,  0,  0,  0],
       [ 0,  0,  0,  5,  8,  1,  0,  0,  0],
       [ 0,  0,  1,  4,  0,  5,  0,  0,  2],
       [ 0,  0,  0,  4,  0,  0, 10,  0,  0],
       [ 0,  0,  0,  1,  0,  0,  0,  9,  0],
       [ 0,  0,  0,  5,  0,  0,  0,  0,  4]])

In [1]:
# # Calculate permutation importance
# perm_importance = permutation_importance(svm, X_test, y_test, n_repeats=30, random_state=0)

# # Get feature importances and their indices
# feature_importances = perm_importance.importances_mean
# feature_indices = np.argsort(feature_importances)[::-1]

# # Print the top important features
# for idx in feature_indices:
#     print(f"Feature {X.columns[idx]}: Importance {feature_importances[idx]}")

### Random Forest Classifier

In [None]:
# ?RandomForestClassifier

In [None]:
rfc = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=13)
rfc.fit(X_train,y_train)

In [None]:
# Define the pipeline steps
steps = [
    ('scaler', StandardScaler()),  # Scaling step
    ('classifier', RandomForestClassifier())  # Classifier step
]

# Create the pipeline
pipeline = Pipeline(steps)

# Fit the pipeline on your training data
pipeline.fit(X_train, y_train)

# Make predictions on the test data using the pipeline
y_pred = pipeline.predict(X_test)

In [None]:
accuracy_score(y_test,y_pred)

In [None]:
# Access the trained classifier (RandomForestClassifier) from the pipeline
classifier = pipeline.named_steps['classifier']

# Get the feature importances from the classifier
feature_importances = classifier.feature_importances_

# Assuming you have a DataFrame 'X_train_df' with column names
# Extract column names and their corresponding importances
feature_importance_dict = dict(zip(X_train.columns, feature_importances))

# Sort the features by importance
sorted_features = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

# Print or manipulate the sorted feature importances as needed
for feature, importance in sorted_features:
    if (importance > 0.1):
        print(f"Feature: {feature}, Importance: {importance:.4f}")

### Gradient Boosting Classifier

In [None]:
gbc = GradientBoostingClassifier(n_estimators=100)
gbc.fit(X_train,y_train)

In [None]:
y_pred = gbc.predict(X_test)

In [None]:
accuracy_score(y_test,y_pred)

### Decision Tree model?