# Example pipeline for easier usage


### SETUP

In [None]:
# Clone the author's GitHub repositories that contain the PySPRESSO and pdf_reporter modules
# (a leading `!` runs the rest of the line as a shell command in IPython/Colab)
!git clone https://github.com/Najlaron/PySPRESSO
!git clone https://github.com/Najlaron/pdf_reporter

# Install the other necessary packages (`%pip` is the IPython magic for pip)
%pip install reportlab
%pip install PyPDF2

In [1]:
# Import the workflow module (aliased `pmf`) and the PDF reporter module (aliased `pdf_rptr`)
import PySPRESSO.pmf_01 as pmf
import pdf_reporter.pdf_reporter as pdf_rptr

# Create the Workflow object; the following cells call its methods through `wf`
wf = pmf.Workflow()

### Upload your files into Files (on the left)

## Change things in the following cell according to your data and settings

In [None]:
# Demo files, used by default. Point the two *_input_file_name variables
# below at your own files to analyse your own data.
demo_data = 'https://raw.githubusercontent.com/Najlaron/PySPRESSO/main/demo_data.csv'
demo_batch_info = 'https://raw.githubusercontent.com/Najlaron/PySPRESSO/main/demo_data_batch_info.csv'

# Paths to the data files: UPLOAD YOUR FILES INTO COLAB (ON THE LEFT) OR TRY THE DEMO ON OUR FILES
data_input_file_name = demo_data              # Path to the data file
batch_info_input_file_name = demo_batch_info  # Path to the batch info file

# Names
name = "Demo_data-POS"                     # Name of the workflow
main_folder = "Demo_data-POS_analysis"     # Name of the main folder
report_file_name = "Demo_data-POS-report"  # Name of the report file
output_file_prefix = "Demo_data-POS"       # Prefix for the output files

# Report flavour: "processing" or "statistics"
# (for both use either one - the difference is only visual)
report_type = "processing"

# Hand the settings over to the workflow (calls kept in their original order)
wf.set_name(name)
wf.set_main_folder(main_folder)
wf.set_report_file_name(report_file_name)
wf.initializer_report(report_type=report_type)
wf.initializer_folders()
wf.set_output_file_prefix(output_file_prefix)

## Loading the data

#### Change the parameters if an error occurs or if you want different settings

In [None]:
# Read the data file (adjust separator / encoding to match your file)
wf.loader_data(
    data_input_file_name,
    separator=';',
    encoding='UTF-8',
)

# Add cpdID, built from the m/z and retention-time columns named here
wf.add_cpdID(mz_col='m/z', rt_col='RT [min]')

# Variable metadata: taken from the data by choosing specific column index ranges
wf.extracter_variable_metadata(
    column_index_ranges=[(10, 15), (18, 23)],
)

# Numerical data: taken from the columns whose names start with this prefix
wf.extracter_data(prefix='Area:')

# Read the batch info file
wf.loader_batch_info(batch_info_input_file_name)

# Reorder the data based on the creation date in the batch info file.
# `distinguisher` is something in the file name after which a name or number
# of the batch follows; `format` is the date format string.
wf.batch_by_name_reorder(
    distinguisher=None,
    format='%d.%m.%Y %H:%M',
)

# Metadata: choose which grouping columns of the batch info to keep
# (columns such as Study File ID, File Name, Creation Date, Sample Type, ... are always kept)
group_columns_to_keep = ['Type', 'Type 2']
wf.extracter_metadata(group_columns_to_keep, prefix='Area:')

# Print an empty line as the cell's final statement
print()

## Showing all the data matrices

In [None]:
# Show the data matrix (a notebook displays the value of a cell's last expression)
wf.data

In [None]:
# Show the sample metadata (a notebook displays the value of a cell's last expression)
wf.metadata

In [None]:
# Show the variable metadata (a notebook displays the value of a cell's last expression)
wf.variable_metadata

In [None]:
# Show the batch info (a notebook displays the value of a cell's last expression)
wf.batch_info

## Setup QC samples and blank samples

### Change the distinguisher (label) or the column names, if necessary

In [None]:
# Label that marks QC samples in the 'Sample Type' column of the metadata
qc_samples_distinguisher = 'Quality Control'

# Select the metadata rows carrying that label and collect their sample file names
qc_rows = wf.metadata[wf.metadata['Sample Type'] == qc_samples_distinguisher]
QC_samples = qc_rows['Sample File'].tolist()
# Store the QC sample list in the workflow
wf.set_QC_samples(QC_samples)

# ---------------------------------------------
# Same procedure for the blank samples
blank_samples_distinguisher = 'Blank'

# Select the metadata rows carrying the blank label and collect their sample file names
blank_rows = wf.metadata[wf.metadata['Sample Type'] == blank_samples_distinguisher]
blank_samples = blank_rows['Sample File'].tolist()
# Store the blank sample list in the workflow
wf.set_blank_samples(blank_samples)



### Setup Batch labels, if you have more than one batch

In [None]:
# IF YOUR DATA INCLUDES BATCHES, RUN THIS CELL WITH PROPER CHANGES
# (if you don't have batches, you can skip this cell, or set it to None by running this line of code: wf.set_batch(None))

# Name of the column in the batch info that holds the batch labels
column_name_in_batch_info = 'Batch'

# Get the list of batch labels without missing entries.
# `dropna()` removes NaN as well as None; an `is not None` test alone keeps NaN,
# which is how pandas usually represents empty cells read from a CSV file
# (assumes wf.batch_info is a pandas DataFrame - confirm in PySPRESSO).
batch = wf.batch_info[column_name_in_batch_info].dropna().tolist()

# Add the batch info to the metadata
# NOTE(review): presumably this needs one label per metadata row; if entries
# were dropped above, the lengths may no longer match - verify with your data.
wf.metadata['Batch'] = batch

# Save the list of batches
wf.set_batch(batch)

# PROCESSING & ANALYSIS STEPS

#### The following cells represent the order of processing and analysis steps that we use

#### Feel free to adjust the parameters or delete/drag cells to skip/change order of the steps 

## PROCESSING

In [None]:
# Filter out features with missing values above the chosen thresholds
wf.filter_missing_values(
    qc_threshold=0.8,
    sample_threshold=0.5,
)
# Display the data after this step
wf.data


In [None]:
# Visualize the whole dataset as a boxplot
wf.visualizer_boxplot()

In [None]:
# Plot the samples by batch (if there are no batches, everything is drawn in one color)
wf.visualizer_samples_by_batch(
    show='default',
    cmap='viridis',
)

### Systematic error and Batch effect correction

In [None]:
# Remove low-intensity features, judged against the blank samples
wf.filter_blank_intensity_ratio(
    ratio=20,
    setting='first',
)
# Display the data after this step
wf.data

In [None]:
# Filter out (delete) the blank samples; they are not needed in the later steps
wf.filter_out_blanks()
# Show the data after the blanks were removed
wf.data

In [None]:
# Correct the data using an interpolation over the QC samples
wf.correcter_qc_interpolation(
    show='default',
    p_values='default',
    use_log=True,
    use_norm=True,
    use_zeros=False,
    cmap='viridis',
)

### Filter by number of corrected batches

In [None]:
# Filter features with less than 80% correctable batches
# (a batch is not correctable if it has too many QC samples being zeros)
wf.filter_number_of_corrected_batches(0.8)

### Filter RSD%

In [None]:
# Filter the features by their RSD% (relative standard deviation), without plotting
wf.filter_relative_standard_deviation(
    rsd_threshold=20,
    to_plot=False,
)

### Saving the progress after processing

In [None]:
# Save all the datasets (progress after the processing steps)
wf.saver_all_datasets()

## STATISTICAL ANALYSIS

### Correlations

In [None]:
# Correlation step using the 'Type' column, with an explicit min_max range
wf.statistics_correlation_means(
    column_name='Type',
    cmap='coolwarm',
    min_max=[0.8, 1],
)

In [None]:
# Correlation step using the 'Type 2' column (no min_max passed here)
wf.statistics_correlation_means(
    column_name='Type 2',
    cmap='coolwarm',
)

### (Cohen's) d-statistics

In [21]:
# TODO: add the (Cohen's) d-statistics step

### PCA

In [None]:
# Perform PCA (run this before the PCA visualization cells that follow)
wf.statistics_PCA()

#### Scree plot

In [None]:
# Draw the PCA scree plot
wf.visualizer_PCA_scree_plot()


#### Loadings

In [None]:
# Plot the PCA loadings
# (`components=2` presumably sets how many components are shown - confirm in PySPRESSO)
wf.visualizer_PCA_loadings(components=2)

#### Run order plot

In [None]:
# Draw the PCA run order plot
# (`connected = True` presumably joins consecutive samples with a line - confirm in PySPRESSO)
wf.visualizer_PCA_run_order(connected = True)

#### Grouped PCA visualization

In [None]:
# Grouped PCA plot: 'Type 2' passed as the color column, 'Type' as the marker column
wf.visualizer_PCA_grouped(
    color_column='Type 2',
    marker_column='Type',
    cmap='viridis',
)

In [None]:
# Grouped PCA plot: 'Type 2' passed as the color column, no marker column
wf.visualizer_PCA_grouped(
    color_column='Type 2',
    marker_column=None,
    cmap='viridis',
)

In [None]:
# Grouped PCA plot: 'Type' passed as the color column, no marker column,
# with the crossout_outliers option switched on
wf.visualizer_PCA_grouped(
    color_column='Type',
    marker_column=None,
    cmap='nipy_spectral',
    crossout_outliers=True,
)

### PLS-DA

#### Perform PLS-DA by grouping by different attributes (response variable)

In [None]:
# Perform PLS-DA with 'Type 2' as the response column
wf.statistics_PLSDA(response_column_names='Type 2')

In [None]:
# Plot the PLS-DA using the 'viridis' colormap
wf.visualizer_PLSDA(cmap='viridis')

In [None]:
# Perform PLS-DA with a different response column ('Type')
wf.statistics_PLSDA(response_column_names='Type')

In [None]:
# Visualize the PLS-DA using the 'viridis_r' colormap
wf.visualizer_PLSDA(cmap='viridis_r')

In [None]:
# Perform PLS-DA with multiple response columns (their combination)
wf.statistics_PLSDA(response_column_names=['Type 2', 'Type'])

In [None]:
# Visualize the PLS-DA using the 'magma' colormap
wf.visualizer_PLSDA(cmap='magma')

### Violin plots

In [None]:
# Violin plots of the features, with 'Type 2' as the grouping column
wf.visualizer_violin_plots(
    column_names='Type 2',
    indexes='all',
    save_into_pdf=False,
    save_first=True,
    cmap='viridis',
)

In [None]:
# Violin plots of the features, with a different grouping column ('Type')
wf.visualizer_violin_plots(
    column_names='Type',
    indexes='all',
    save_into_pdf=False,
    save_first=True,
    cmap='viridis',
)

In [None]:
# Violin plots with a combination of grouping columns, for the listed indexes only
wf.visualizer_violin_plots(
    column_names=['Type 2', 'Type'],
    indexes=[0, 1, 2, 3, 4],
    save_into_pdf=False,
    save_first=True,
    cmap='magma',
)

### Printing out the candidate variables (features/compounds)

In [None]:
# Print the workflow's `candidate_variables` attribute
print(wf.candidate_variables)

## Final touches

In [None]:
# Finalize the report
wf.finalizer_report()