# Prediction on Drop-seq Dataset

## Imports

In [1]:
#Importing libraries
from Utils.data_analysis import *
from Utils.processing import *
from Utils.dim_reduction import *
from Utils.modeling import *

from sklearn.model_selection import train_test_split
import warnings, csv
warnings.simplefilter("ignore")
%matplotlib inline     
sns.set_theme(color_codes=True)

processing.py loaded
data_analysis.py loaded
dim_reduction.py loaded
modeling.py loaded successfully


In [2]:
# Load the filtered, normalised training matrices for both cell lines.
# The separator is written as a raw string: "\ " in a normal string literal is
# an invalid escape sequence (SyntaxWarning on Python 3.12+). The runtime value
# (backslash + space) is unchanged; pandas treats a multi-character separator
# as a regular expression, which is why engine='python' is requested.
df_HCC_model = pd.read_csv(
    "Data/dropseq_data/HCC1806_Filtered_Normalised_3000_Data_train.txt",
    delimiter=r"\ ",
    engine='python',
    index_col=0,
    quoting=csv.QUOTE_NONE,
)
df_MCF_model = pd.read_csv(
    "Data/dropseq_data/MCF7_Filtered_Normalised_3000_Data_train.txt",
    delimiter=r"\ ",
    engine='python',
    index_col=0,
    quoting=csv.QUOTE_NONE,
)

# The frames are transposed before use as the feature matrices.
# NOTE(review): presumably rows become cells and columns genes after .T, and
# get_oxia derives the per-cell condition label from the row names - confirm
# against Utils.
y_HCC = get_oxia(df_HCC_model.T)
X_HCC = df_HCC_model.T

y_MCF = get_oxia(df_MCF_model.T)
X_MCF = df_MCF_model.T

# Hold out 20% of each data set for evaluation; the seed is fixed so the split
# is reproducible.
X_HCC_tr, X_HCC_ts, y_HCC_tr, y_HCC_ts = train_test_split(X_HCC, y_HCC, test_size=0.2, random_state=42)
X_MCF_tr, X_MCF_ts, y_MCF_tr, y_MCF_ts = train_test_split(X_MCF, y_MCF, test_size=0.2, random_state=42)

## Supervised Dimensionality Reduction

In [None]:
# Supervised PCA on the training split of each cell line, followed by a 2-D
# plot, prediction on the held-out split and an accuracy report.
# NOTE(review): supervised_pca, plot_2d, dim_red_predictor and accuracy_score
# are presumably provided by the star imports from Utils - their exact
# behaviour is not visible in this notebook.
X_HCC_pca_supervised, HCC_pca = supervised_pca(X_HCC_tr, y_HCC_tr)
plot_2d(X_HCC_pca_supervised, y_HCC_tr, 'HCC1806', 'Supervised PCA')

X_MCF_pca_supervised, MCF_pca = supervised_pca(X_MCF_tr, y_MCF_tr)
plot_2d(X_MCF_pca_supervised, y_MCF_tr, 'MCF7', 'Supervised PCA')

# Only the 'X1' and 'X2' columns of the embedding are handed to the predictor,
# together with the training labels, the raw held-out features and the second
# value returned by supervised_pca (presumably the fitted reducer).
y_HCC_pca = dim_red_predictor(X_HCC_pca_supervised[['X1', 'X2']], y_HCC_tr, X_HCC_ts, HCC_pca)
y_MCF_pca = dim_red_predictor(X_MCF_pca_supervised[['X1', 'X2']], y_MCF_tr, X_MCF_ts, MCF_pca)

# Compare the predictions with the held-out labels.
print("Accuracy of HCC1806 PCA: ", accuracy_score(y_HCC_ts, y_HCC_pca))
print("Accuracy of MCF7 PCA: ", accuracy_score(y_MCF_ts, y_MCF_pca))

In [None]:
# Supervised t-SNE on the training split of each cell line, followed by a 2-D
# plot, prediction on the held-out split and an accuracy report.
# NOTE(review): supervised_tsne, plot_2d and dim_red_predictor presumably come
# from the Utils star imports; their behaviour is not visible here.
X_HCC_tsne_supervised, HCC_tsne = supervised_tsne(X_HCC_tr, y_HCC_tr)
plot_2d(X_HCC_tsne_supervised, y_HCC_tr, 'HCC1806', 'Supervised t-SNE')

X_MCF_tsne_supervised, MCF_tsne = supervised_tsne(X_MCF_tr, y_MCF_tr)
plot_2d(X_MCF_tsne_supervised, y_MCF_tr, 'MCF7', 'Supervised t-SNE')

# Only the 'X1' and 'X2' columns of the embedding are handed to the predictor.
# NOTE(review): if HCC_tsne / MCF_tsne are scikit-learn TSNE objects they have
# no transform() for unseen samples - confirm how dim_red_predictor projects
# the held-out features.
y_HCC_tsne = dim_red_predictor(X_HCC_tsne_supervised[['X1', 'X2']], y_HCC_tr, X_HCC_ts, HCC_tsne)
y_MCF_tsne = dim_red_predictor(X_MCF_tsne_supervised[['X1', 'X2']], y_MCF_tr, X_MCF_ts, MCF_tsne)

# Compare the predictions with the held-out labels.
print("Accuracy of HCC1806 t-SNE: ", accuracy_score(y_HCC_ts, y_HCC_tsne))
print("Accuracy of MCF7 t-SNE: ", accuracy_score(y_MCF_ts, y_MCF_tsne))

In [None]:
# Supervised UMAP on the training split of each cell line, followed by a 2-D
# plot, prediction on the held-out split and an accuracy report.
# NOTE(review): supervised_umap, plot_2d and dim_red_predictor presumably come
# from the Utils star imports; their behaviour is not visible here.
X_HCC_umap_supervised, HCC_umap = supervised_umap(X_HCC_tr, y_HCC_tr)
plot_2d(X_HCC_umap_supervised, y_HCC_tr, 'HCC1806', 'Supervised UMAP')

X_MCF_umap_supervised, MCF_umap = supervised_umap(X_MCF_tr, y_MCF_tr)
plot_2d(X_MCF_umap_supervised, y_MCF_tr, 'MCF7', 'Supervised UMAP')

# Only the 'X1' and 'X2' columns of the embedding are handed to the predictor,
# together with the training labels, the raw held-out features and the second
# value returned by supervised_umap (presumably the fitted reducer).
y_HCC_umap = dim_red_predictor(X_HCC_umap_supervised[['X1', 'X2']], y_HCC_tr, X_HCC_ts, HCC_umap)
y_MCF_umap = dim_red_predictor(X_MCF_umap_supervised[['X1', 'X2']], y_MCF_tr, X_MCF_ts, MCF_umap)

# Compare the predictions with the held-out labels.
print("Accuracy of HCC1806 UMAP: ", accuracy_score(y_HCC_ts, y_HCC_umap))
print("Accuracy of MCF7 UMAP: ", accuracy_score(y_MCF_ts, y_MCF_umap))

## Clustering

In [None]:
# Two-cluster K-means per cell line, fitted on the full (unsplit) feature
# matrix; the cluster labels are passed to clustering_plot alongside the true
# labels.
# NOTE(review): clustering_plot presumably draws the clusters and returns an
# agreement score - confirm against Utils.
# NOTE(review): the two models use different seeds (1 for HCC1806, 42 for
# MCF7) - confirm this is intentional.
kmeans_HCC = KMeans(n_clusters=2, random_state=1).fit(X_HCC)
kmeans_HCC_score = clustering_plot(X_HCC, y_HCC, kmeans_HCC.labels_, 'HCC1806: K-means clustering')

kmeans_MCF = KMeans(n_clusters=2, random_state=42).fit(X_MCF)
kmeans_MCF_score = clustering_plot(X_MCF, y_MCF, kmeans_MCF.labels_, 'MCF7: K-means clustering')

In [None]:
# Two-component Gaussian mixture per cell line, fitted on the full (unsplit)
# feature matrix; the predicted component of each sample is passed to
# clustering_plot alongside the true labels.
# NOTE(review): clustering_plot presumably draws the clusters and returns an
# agreement score - confirm against Utils.
gmm_HCC = GaussianMixture(n_components=2, random_state=1).fit(X_HCC)
gmm_HCC_score = clustering_plot(X_HCC, y_HCC, gmm_HCC.predict(X_HCC), 'HCC1806: Gaussian Mixture Model clustering')

# The seed is fixed so the MCF7 mixture (and its score) is reproducible across
# runs; 42 matches the seed used for the MCF7 K-means model in this notebook.
gmm_MCF = GaussianMixture(n_components=2, random_state=42).fit(X_MCF)
gmm_MCF_score = clustering_plot(X_MCF, y_MCF, gmm_MCF.predict(X_MCF), 'MCF7: Gaussian Mixture Model clustering')

## Logistic Regression