In [4]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm
import pickle
import umap
import scipy.sparse as sparse
from scipy.io import mmwrite
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cross_decomposition import PLSRegression

In [110]:
def pickle_reader(path):
    """Load and return the object stored in the pickle file at `path`.

    Thin wrapper around ``pd.read_pickle``: whatever object was pickled is
    handed back unchanged.
    """
    return pd.read_pickle(path)

def reformat_pickle(emb_input):
    """Return a new dict whose keys have every dash (-) replaced by an underscore (_).

    Values are carried over untouched and `emb_input` itself is not modified.
    If two keys become identical after the replacement, only one entry is
    kept and it holds the value of the later key.
    """
    return {key.replace('-', '_'): value for key, value in emb_input.items()}

def reformat_df(ORF2_counts):
    """Clean the IDs of the counts table so it can be merged with the embeddings.

    Every dash (-) and dot (.) in the 'ID' column is replaced by an
    underscore (_).

    Parameters
    ----------
    ORF2_counts : pandas.DataFrame
        Table with an 'ID' column of strings and a 'Non_redundant' column.
        Any row index is accepted; it is ignored.

    Returns
    -------
    pandas.DataFrame
        Columns 'ID' (cleaned) and 'Non_redundant' on a fresh 0..n-1 index.
        If two IDs become identical after cleaning, a single row is kept, at
        the position of the first occurrence and holding the value of the
        last one. Empty input gives an empty frame with the same two columns.
    """
    cleaned_counts = {}
    # Walk the column values positionally. `series[i]` is a label lookup, so
    # it raises KeyError (or reads the wrong row) as soon as the frame does
    # not carry a default 0..n-1 index, e.g. after filtering or sorting.
    for raw_id, count in zip(ORF2_counts['ID'], ORF2_counts['Non_redundant']):
        cleaned_counts[raw_id.replace('-', '_').replace('.', '_')] = count

    # Building from two explicit columns keeps the column names for empty input too
    return pd.DataFrame({
        'ID': list(cleaned_counts.keys()),
        'Non_redundant': list(cleaned_counts.values()),
    })
    
def pickle_to_matrix(reformatted_emb):
    """Turn the embeddings dict into a DataFrame with one row per dict key.

    ``pd.DataFrame(dict)`` places every key in its own column, so the frame
    is transposed to get the keys on the index and the positions of each
    value along the columns.
    """
    keys_as_columns = pd.DataFrame(reformatted_emb)
    return keys_as_columns.T

def match_ID(matrix, reformatted_counts):
    """Keep only the entries whose ID appears in both the embeddings and the counts.

    Returns a tuple ``(matched_emb, matched_counts)``:
    rows of `matrix` whose index value occurs in ``reformatted_counts['ID']``,
    and rows of `reformatted_counts` whose 'ID' occurs in ``matrix.index``.
    Each output keeps the row order of its own input; the two are not
    re-ordered to line up with each other.
    """
    ids_with_counts = reformatted_counts['ID'].values
    emb_has_count = matrix.index.isin(ids_with_counts)
    count_has_emb = reformatted_counts['ID'].isin(matrix.index)

    return matrix.loc[emb_has_count], reformatted_counts.loc[count_has_emb]

def extract_weights(matched_emb):
    """Rank embedding dimensions by their mean PCA loading (sklearn).

    A PCA with default settings is fitted on `matched_emb` exactly as given.
    No scaler is applied here; sklearn's PCA centres the input but does not
    scale it.

    Parameters
    ----------
    matched_emb : array-like accepted by ``PCA.fit``
        One row per observation, one column per embedding dimension.

    Returns
    -------
    pandas.DataFrame
        Column 'Dimension' is the column position of the dimension in the
        fitted data; column 'Weight' is the mean of that dimension's loading
        over all fitted components. Rows are sorted by 'Weight', highest first.
    """
    pca = PCA()
    # Only the fitted components are used below; the projected observations
    # were previously computed and then discarded.
    pca.fit(matched_emb)

    # components_: one row per principal component, one column per input dimension
    loadings = pd.DataFrame(pca.components_)

    # Column-wise mean of the loadings (same quantity as the 'mean' row of describe()).
    # NOTE(review): loadings are signed, so positive and negative contributions
    # cancel in this average - confirm this is the intended importance measure.
    mean_loading = loadings.mean().reset_index()
    mean_loading.columns = ['Dimension', 'Weight']

    return mean_loading.sort_values(by=['Weight'], ascending=False)


In [6]:
# -- Read pickle file containing embeddings and reformat keys to remove the dashes that block the merge
# NOTE(review): absolute, machine-specific path - this cell only runs where that file exists.
path = '/Users/leandro/Desktop/ai_data/data/embeddings_v2.pickle'
emb_input = pickle_reader(path)
reformatted_emb = reformat_pickle(emb_input)  # same values, keys with '-' replaced by '_'

In [7]:
# -- Read ORF2_counts file and replace dashes and dots in the IDs by underscores
# NOTE(review): absolute, machine-specific path - this cell only runs where that file exists.
path_counts = '/Users/leandro/Desktop/ai_data/data/ORF2p_counts.csv'
ORF2_counts = pd.read_csv(path_counts)
reformatted_counts = reformat_df(ORF2_counts)  # two columns: 'ID' (cleaned) and 'Non_redundant'

In [8]:
# -- Build the embeddings matrix from reformatted_emb and keep only the IDs present in both datasets
matrix = pickle_to_matrix(reformatted_emb)
# match_ID returns (embeddings, counts): unpack one call instead of running the filter twice
matched_emb, matched_counts = match_ID(matrix, reformatted_counts)

In [9]:
# Row counts, in order: all embeddings, embeddings that have a count, counts that have an embedding
print(len(matrix.index), len(matched_emb.index), len(matched_counts.index))  # Check the number of matches between datasets

721 720 720


**Extracting PC weights with sklearn (from embeddings file)**

In [111]:
# Mean PCA loading per embedding dimension, sorted from highest to lowest
eigenvectors = extract_weights(matched_emb)

In [112]:
# Display the ranked 'Dimension' / 'Weight' table
eigenvectors

Unnamed: 0,Dimension,Weight
369,369,0.003191
944,944,0.003189
198,198,0.003141
211,211,0.003136
377,377,0.002939
...,...,...
628,628,-0.003039
855,855,-0.003161
19,19,-0.003285
62,62,-0.003570


In [48]:
# Percentage of variance explained by each PC (correlates to PCA made with scanpy)
# The `pca` object inside extract_weights is local to that function, so the
# PCA is fitted here to keep this cell runnable after Restart & Run All.
pca = PCA().fit(matched_emb)
print(pca.explained_variance_ratio_.round(2)[:10])

[0.46 0.11 0.09 0.05 0.04 0.04 0.03 0.02 0.02 0.02]


In [49]:
# Principal Components Weights (Eigenvectors): number of rows in the loadings table
# NOTE(review): in the cells above, `df_pca_loadings` is only assigned as a local
# inside extract_weights - confirm it is defined at notebook scope before this
# cell runs, otherwise it fails on a fresh kernel.
len(df_pca_loadings.index)

720