In [None]:
#installing MOFA and calling libraries
!pip install mofapy2
from mofapy2.run.entry_point import entry_point
import pandas as pd
import io

In [33]:
# Read the RNA and protein profiles and order each table by its sample-identifier
# column ("depmap_id" for RNA, "Unnamed: 0" for Protein) so that both follow the
# same row ordering.
RNA = pd.read_csv("/content/pancreas_CCLE_RNA_profile.csv").sort_values(by="depmap_id")
Protein = pd.read_csv("/content/pancreas_CCLE_protein_profile.csv").sort_values(by="Unnamed: 0")

In [16]:
# Pre-processing of the RNA table: discard genes whose variance is exactly zero.
# The two annotation columns are set aside first so that the variance is computed
# on the remaining columns only, and are appended again at the end.
columns_info_samples = RNA[["Unnamed: 0","depmap_id"]]
RNA = RNA.drop(columns=["Unnamed: 0","depmap_id"])
# Per-column variance across samples
variances = RNA.var()
# Names of the columns that never vary
zero_variance_cols = variances.loc[variances.eq(0)].index.tolist()
# Remove them and put the annotation columns back as the last two columns
RNA = pd.concat([RNA.drop(columns=zero_variance_cols), columns_info_samples], axis=1)

In [17]:
# Pre-processing of the Protein table: discard proteins that have no measured
# value in any sample.
# The two annotation columns are set aside first and appended again at the end.
Protein_info_samples = Protein[["Unnamed: 0.1","Unnamed: 0"]]

Protein = pd.concat(
    [
        # drop the annotation columns, then every column that is entirely NaN
        Protein.drop(columns=["Unnamed: 0.1","Unnamed: 0"]).dropna(axis=1, how='all'),
        Protein_info_samples,
    ],
    axis=1,
)

In [None]:
# Select RNA groups based on the expression of one gene.
# `gene` and `TPM_cutoff` are placeholders to edit before running this cell:
# `gene` must be the name of a column of RNA and `TPM_cutoff` must be replaced
# by a number, since it is compared against the values of that column.
gene="Gene of Interest"
TPM_cutoff= "Cutoff Value"

#Select high expressers: samples whose value for `gene` is above the cutoff
high_expression = RNA[RNA[gene]>TPM_cutoff]
high_sample_name = high_expression["depmap_id"]
# Reassign instead of dropping in place: high_expression is a filtered selection
# of RNA, and the annotation columns must stay available in RNA itself.
high_expression = high_expression.drop(columns=["Unnamed: 0","depmap_id"])
RNA_features = high_expression.columns
#
#Select low expressers: the complementary samples, at or below the cutoff
# (the comparison has to differ from the one above, otherwise both groups
# contain exactly the same samples)
low_expression = RNA[RNA[gene]<=TPM_cutoff]
low_sample_name = low_expression["depmap_id"]
low_expression = low_expression.drop(columns=["Unnamed: 0","depmap_id"])


In [20]:
# Selection of protein groups based on the RNA expression groups.
# Protein rows are picked by matching the sample identifiers in "Unnamed: 0"
# against the sample names of each RNA group.
# NOTE(review): presumably every RNA sample also has a protein row; if not, the
# protein groups will have fewer rows than the RNA groups — verify.
#
#Select high expressers
# The annotation columns are dropped with a reassignment rather than in place,
# because the selected rows are derived from Protein.
Pro_high_expression = Protein.loc[Protein["Unnamed: 0"].isin(high_sample_name)]
Pro_high_expression = Pro_high_expression.drop(columns=["Unnamed: 0.1","Unnamed: 0"])
Pro_features = Pro_high_expression.columns
#
#Select low expressers
Pro_low_expression = Protein.loc[Protein["Unnamed: 0"].isin(low_sample_name)]
Pro_low_expression = Pro_low_expression.drop(columns=["Unnamed: 0.1","Unnamed: 0"])


In [21]:
# Convert each per-group table into a NumPy array (rows and columns keep the
# order of the source DataFrame).
RNA_Matrix_high, RNA_Matrix_low = (
    frame.to_numpy() for frame in (high_expression, low_expression)
)
Protein_Matrix_high, Protein_Matrix_low = (
    frame.to_numpy() for frame in (Pro_high_expression, Pro_low_expression)
)

In [22]:
# Assemble the nested lists handed to ent.set_data_matrix for the grouped run.
views = ["RNA","Protein"]
groups=["high_expression","low_expression"]

# data[view][group]: outer list follows `views`, inner list follows `groups`
data = [
    [RNA_Matrix_high, RNA_Matrix_low],
    [Protein_Matrix_high, Protein_Matrix_low],
]

# One entry of sample names per group, in the order of `groups`
samples_names = [high_sample_name, low_sample_name]

# One entry of feature names per view, in the order of `views`
features = [RNA_features, Pro_features]

In [None]:
# Create the mofapy2 entry_point object for the grouped run; the data, model and
# training options in the following cells are all set through `ent`.
ent = entry_point()

In [None]:
# Data options: scale_views is switched off for this run.
ent.set_data_options(scale_views=False)

In [None]:
# Hand the nested data list to MOFA together with the names of its views,
# groups, samples and features, declaring one "gaussian" likelihood per view.
ent.set_data_matrix(
    data,
    likelihoods = ["gaussian","gaussian"],
    views_names = views,
    groups_names = groups,
    samples_names = samples_names,
    features_names = features,
)

In [None]:
# Model options: 4 factors, with the spikeslab_weights and ard_weights options
# both switched on.
ent.set_model_options(factors=4, spikeslab_weights=True, ard_weights=True)

In [None]:
# Training options: "medium" convergence mode, dropR2 of 0.001, no GPU, and a
# fixed seed.
ent.set_train_options(
    seed = 1,
    gpu_mode = False,
    convergence_mode = "medium",
    dropR2 = 0.001,
)

In [None]:
# Run the grouped model: build it from the options set through `ent`, run it,
# and save the result to an HDF5 file.
# save_data=True is passed to save(); presumably this stores the input data with
# the model — confirm in the mofapy2 documentation.
ent.build()

ent.run()

ent.save("/content/TEST.hdf5", save_data=True)

In [23]:
#************************************************************
#               alternative run= no groups
#************************************************************


#NOTE: if the pre-processing has not been run on the input data, run the "Pre-process of data" cells of this script first
# Remove the annotation columns so that only features remain.
# errors="ignore" keeps this cell re-runnable: on a second execution the columns
# are already gone and a plain drop would raise KeyError.
RNA = RNA.drop(columns=["Unnamed: 0","depmap_id"], errors="ignore")
Protein = Protein.drop(columns=["Unnamed: 0.1","Unnamed: 0"], errors="ignore")


#create matrix
RNA_Matrix = RNA.to_numpy()
Protein_Matrix = Protein.to_numpy()


#Create nested list of data
views = ["RNA","Protein"]
data = [None]*len(views)

#Nested list of matrix: data[view][group], with a single group per view
data[0]=[None]* 1
data[1]=[None]* 1
# Use the NumPy matrices built above, as the grouped run does with its matrices
data[0][0] = RNA_Matrix
data[1][0] = Protein_Matrix

#Nested list of features: one entry per view, in the order of `views`.
# Defined here so the no-groups run does not depend on the grouped-run cells
# having been executed first.
features = [RNA.columns, Protein.columns]

In [None]:
# Create a fresh mofapy2 entry_point object for the no-groups run. This rebinds
# `ent`, so the object used for the grouped run is no longer reachable by that name.
ent = entry_point()

In [25]:
# Data options: scale_views is switched off for this run.
ent.set_data_options(scale_views=False)

In [26]:
# Sample names for the no-groups run: a one-element list (single group) holding
# the "depmap_id" column that was set aside during the RNA pre-processing.
samples_names_no_groups = [columns_info_samples['depmap_id']]

In [None]:
# Hand the data to MOFA for the no-groups run; no groups_names are passed here.
# `features` must already exist in the kernel when this cell runs (it is built in
# the grouped-run section as [RNA feature names, Protein feature names]).
ent.set_data_matrix(
    data,
    likelihoods = ["gaussian","gaussian"],
    views_names = views,
    samples_names = samples_names_no_groups,
    features_names = features,
)

In [None]:
# Model options: 4 factors, with the spikeslab_weights and ard_weights options
# both switched on.
ent.set_model_options(factors=4, spikeslab_weights=True, ard_weights=True)

# Training options: "medium" convergence mode, dropR2 of 0.001, no GPU, and a
# fixed seed.
ent.set_train_options(
    seed = 1,
    gpu_mode = False,
    convergence_mode = "medium",
    dropR2 = 0.001,
)

In [None]:
# Run the no-groups model: build it from the options set through `ent`, run it,
# and save the result to an HDF5 file.
# save_data=True is passed to save(); presumably this stores the input data with
# the model — confirm in the mofapy2 documentation.
ent.build()

ent.run()

ent.save("/content/CLLE_no_groups.hdf5", save_data=True)