# Multi-Omic Factor Analysis (MOFA) of Pancreatic Cancer Cell Lines

## Overview
This notebook performs Multi-Omic Factor Analysis (MOFA) on pancreatic cancer cell lines from the Cancer Cell Line Encyclopedia (CCLE). The analysis integrates transcriptomic (RNA) and proteomic data to identify latent factors that explain variance across different molecular layers.

## Objectives
- Integrate RNA expression and protein abundance data
- Identify latent factors driving multi-omic variation
- Perform group-based analysis (high vs. low expression) and ungrouped analysis
- Generate MOFA models for downstream interpretation


## Installation and Library Loading

Installing the MOFA Python package and importing required libraries for data manipulation and analysis.

In [None]:
!pip install mofapy2
from mofapy2.run.entry_point import entry_point
import pandas as pd
import io

# Load RNA expression and protein abundance data and sort dataframes

In [None]:
# Read the two CCLE pancreas tables and order the rows of each one by the
# column that holds its sample identifier.
RNA = pd.read_csv("/content/pancreas_CCLE_RNA_profile.csv")
RNA = RNA.sort_values(by="depmap_id")

Protein = pd.read_csv("/content/pancreas_CCLE_protein_profile.csv")
Protein = Protein.sort_values(by="Unnamed: 0")

## Data preprocessing : removing non-informative data

In [None]:
# Set the two identifier columns aside so that only expression values remain
columns_info_samples = RNA[["Unnamed: 0", "depmap_id"]]
RNA = RNA.drop(columns=["Unnamed: 0", "depmap_id"])

# Variance of every gene across the cell lines
variances = RNA.var()

# Genes whose variance is exactly zero are constant across samples
zero_variance_cols = [
    gene_name for gene_name, value in variances.items() if value == 0
]

# Drop the constant genes, then attach the identifier columns back on the right
RNA = pd.concat(
    [RNA.drop(columns=zero_variance_cols), columns_info_samples],
    axis=1,
)

## Data preprocessing: removing non-informative protein data

In [None]:
# Keep a copy of the two identifier columns before stripping them
Protein_info_samples = Protein[["Unnamed: 0.1", "Unnamed: 0"]]
Protein = Protein.drop(columns=["Unnamed: 0.1", "Unnamed: 0"])

# Discard proteins that have no measurement in any sample, then put the
# identifier columns back on the right-hand side
Protein = pd.concat(
    [Protein.dropna(axis=1, how='all'), Protein_info_samples],
    axis=1,
)

## Group stratification

In [None]:
# Gene used to stratify the cell lines and the expression cutoff that
# separates the two groups.
# NOTE: both values are placeholders. Replace `gene` with a column name that
# exists in RNA and `TPM_cutoff` with a number before running this cell.
gene="Gene of Interest"
TPM_cutoff= "Cutoff Value"

# High group: cell lines whose expression of `gene` is strictly above the cutoff
high_expression = RNA[RNA[gene] > TPM_cutoff]
high_sample_name = high_expression["depmap_id"]
# drop() returns a new frame, so the filtered slice of RNA is never mutated in place
high_expression = high_expression.drop(columns=["Unnamed: 0","depmap_id"])
RNA_features = high_expression.columns


# Low group: cell lines at or below the cutoff, i.e. the complement of the
# high group. (Rows where `gene` is missing match neither comparison and are
# left out of both groups.)
low_expression = RNA[RNA[gene] <= TPM_cutoff]
low_sample_name = low_expression["depmap_id"]
low_expression = low_expression.drop(columns=["Unnamed: 0","depmap_id"])


In [None]:
# Protein rows are matched to the RNA-defined groups through the sample
# identifier stored in the "Unnamed: 0" column.

# Protein data for the high expression group
Pro_high_expression = Protein.loc[Protein["Unnamed: 0"].isin(high_sample_name)]
# drop() returns a new frame instead of mutating the filtered slice in place
Pro_high_expression = Pro_high_expression.drop(columns=["Unnamed: 0.1","Unnamed: 0"])
Pro_features = Pro_high_expression.columns


# Protein data for the low expression group
Pro_low_expression = Protein.loc[Protein["Unnamed: 0"].isin(low_sample_name)]
Pro_low_expression = Pro_low_expression.drop(columns=["Unnamed: 0.1","Unnamed: 0"])


# Convert RNA and protein dataframes to numpy matrices

In [21]:
#create matrices
RNA_Matrix_high = high_expression.to_numpy()
RNA_Matrix_low = low_expression.to_numpy()
Protein_Matrix_high = Pro_high_expression.to_numpy()
Protein_Matrix_low = Pro_low_expression.to_numpy()

# Define structure for MOFA

In [None]:
# Names of the omic layers (views) and of the sample groups
views = ["RNA", "Protein"]
groups = ["high_expression", "low_expression"]

# Data matrices as a nested list indexed as data[view][group]
data = [
    [RNA_Matrix_high, RNA_Matrix_low],
    [Protein_Matrix_high, Protein_Matrix_low],
]

# Sample names, one entry per group (same order as `groups`)
samples_names = [high_sample_name, low_sample_name]

# Feature names, one entry per view (same order as `views`)
features = [RNA_features, Pro_features]

# Initialize MOFA entry point

In [None]:
# Create a fresh MOFA entry point and disable view scaling
ent = entry_point()
ent.set_data_options(scale_views=False)

# Load data matrices into MOFA with metadata

In [None]:
# Hand the data[view][group] matrices to MOFA together with their labels;
# a "gaussian" likelihood is declared for each of the two views.
ent.set_data_matrix(
    data,
    likelihoods=["gaussian", "gaussian"],
    views_names=views,
    groups_names=groups,
    samples_names=samples_names,
    features_names=features,
)

# Configure model options

In [None]:
# Model settings: 4 factors, with the spikeslab_weights and ard_weights
# options both switched on.
model_options = {
    "factors": 4,
    "spikeslab_weights": True,
    "ard_weights": True,
}
ent.set_model_options(**model_options)

# Configure training options

In [None]:
# Training settings: "medium" convergence mode, GPU disabled, fixed seed.
# NOTE(review): dropR2 is presumably the variance-explained threshold below
# which a factor is dropped -- confirm against the mofapy2 documentation.
train_options = {
    "convergence_mode": "medium",
    "dropR2": 0.001,
    "gpu_mode": False,
    "seed": 1,
}
ent.set_train_options(**train_options)

# Build and run the MOFA model

In [None]:
# Build the model, train it, then write it to disk with the input data included
grouped_model_path = "/content/TEST.hdf5"

ent.build()
ent.run()
ent.save(grouped_model_path, save_data=True)

#  Alternative no groups approach 

In [None]:
# ------------------------------------------------------------------------------
# NOTE: if the input data have not been preprocessed yet, run the data
# preprocessing section of this notebook before continuing.
# ------------------------------------------------------------------------------

# Define structure for MOFA

In [None]:
# Remove sample identifier columns so that only feature values remain
RNA.drop(columns=["Unnamed: 0","depmap_id"], inplace=True)
Protein.drop(columns=["Unnamed: 0.1","Unnamed: 0"], inplace=True)


# Convert dataframes to numpy matrices
RNA_Matrix = RNA.to_numpy()
Protein_Matrix = Protein.to_numpy()


# Names of the omic layers (views)
views = ["RNA","Protein"]

# Feature names per view, taken from the remaining columns so that this
# workflow does not depend on the grouped-analysis cells having been run
features = [RNA.columns, Protein.columns]

# Single-group layout: data[view][0] holds the numpy matrix of that view
# (the matrices computed above are used rather than the dataframes)
data = [[RNA_Matrix], [Protein_Matrix]]

# Initialize MOFA entry point

In [None]:
# New MOFA entry point for the single-group run, with view scaling disabled
ent = entry_point()
ent.set_data_options(scale_views=False)

# Create sample names list for single-group analysis

In [26]:
# One-element list (single group) holding the depmap_id of every sample
samples_names_no_groups = [columns_info_samples['depmap_id']]

# Load data matrices into MOFA (single group, multiple views)

In [None]:
# Register the single-group matrices with MOFA; no group names are passed,
# and a "gaussian" likelihood is declared for each of the two views.
ent.set_data_matrix(
    data,
    likelihoods=["gaussian", "gaussian"],
    views_names=views,
    samples_names=samples_names_no_groups,
    features_names=features,
)

# Configure model options

In [None]:
# Same model settings as the grouped run: 4 factors, with spikeslab_weights
# and ard_weights both enabled.
ent.set_model_options(factors=4, spikeslab_weights=True, ard_weights=True)

# Configure training options

In [None]:
# Training settings: "medium" convergence mode, GPU disabled, fixed seed.
# NOTE(review): dropR2 is presumably the variance-explained threshold below
# which a factor is dropped -- confirm against the mofapy2 documentation.
ent.set_train_options(
    seed=1,
    gpu_mode=False,
    dropR2=0.001,
    convergence_mode="medium",
)

# Build and run the MOFA model

In [None]:
# Build the single-group model, train it, then write it to disk with the
# input data included
ungrouped_model_path = "/content/CLLE_no_groups.hdf5"

ent.build()
ent.run()
ent.save(ungrouped_model_path, save_data=True)