In [None]:
import pandas as pd

Load `matrix.csv`, which contains the sample data processed by `vcf_to_matrix.py`.

In [None]:
# Location of the matrix CSV; kept in a variable so the path is easy to spot and change.
matrix_csv_path = "/content/drive/MyDrive/colab data/matrix.csv"
df = pd.read_csv(matrix_csv_path)
df

The sample column has no name in the CSV, so rename `Unnamed: 0` to `Sample`.

In [None]:
# Rebind the name instead of using inplace=True: the frame object displayed by
# the loading cell is left untouched, and this cell can be re-run safely.
df = df.rename(columns={'Unnamed: 0': 'Sample'})
df

In [None]:
# Metadata columns that must not be fed to the dimensionality reduction.
non_snp_columns = ['Sample', 'Population code']

# Everything that is left is treated as SNP data (assumed to be numeric-only columns).
df_snps = df.drop(columns=non_snp_columns)

# Plain NumPy array of the SNP columns, so it can be passed to PCA directly.
matrix = df_snps.to_numpy()

In [None]:
from sklearn import decomposition

# Reduce the SNP matrix to its first two principal components.
# random_state is pinned because the default svd_solver="auto" may select the
# randomized solver on large inputs, which would otherwise make the result
# vary from run to run.
pca = decomposition.PCA(n_components=2, random_state=0)
pca.fit(matrix)

# Fit diagnostics, if needed, are available as pca.explained_variance_ratio_
# and pca.singular_values_.

# Project every row of the matrix onto the two fitted components.
to_plot = pca.transform(matrix)
to_plot.shape

(1092, 2)

Now that we have the data produced by PCA, we can plot it.

First things first: we should reduce the dataframe to only the columns we use in the plot, since Altair encodes every column of the dataframe whether or not the plot uses it.

In [None]:
import altair as alt

# Plotting frame: the metadata columns plus the two principal components.
# assign() returns a new frame, so `df` itself is not modified.
df_plot = df[non_snp_columns].assign(PC1=to_plot[:, 0], PC2=to_plot[:, 1])

population = pd.read_csv("/content/drive/MyDrive/colab data/igsr_populations.tsv", sep='\t')

# Left join with many-to-one validation keeps exactly one output row per sample,
# in the original sample order. An inner join would silently drop samples whose
# population code is missing from the table (or duplicate them if a code is
# listed twice), which breaks later cells that attach per-sample arrays to
# df_plot by position.
df_plot = df_plot.merge(
    population,
    on="Population code",
    how="left",
    validate="many_to_one",
)

alt.Chart(df_plot).mark_point().encode(
    x='PC1',
    y='PC2',
    color=alt.Color('Population code', scale=alt.Scale(scheme="category20")),
)


In [None]:
# Same PCA scatter as above: colour encodes the superpopulation, fill encodes
# the individual population code.
superpopulation_color = alt.Color(
    'Superpopulation name',
    scale=alt.Scale(scheme="category20"),
)
pca_points = alt.Chart(df_plot).mark_point()
pca_points.encode(
    x='PC1',
    y='PC2',
    color=superpopulation_color,
    fill='Population code',
)


t-SNE


In [None]:
from sklearn.manifold import TSNE

# t-SNE with init='random' is stochastic; pinning random_state makes the
# embedding (and therefore the plot below) reproducible across kernel restarts.
tsne = TSNE(
    n_components=2,
    learning_rate='auto',
    init='random',
    random_state=0,
)
X_embedded = tsne.fit_transform(matrix)
X_embedded.shape

(1092, 2)

In [None]:
# Attach the two t-SNE coordinates to the plotting frame. The arrays are matched
# to rows by position.
# NOTE(review): assumes df_plot still has one row per matrix row, in the same
# order, after the population merge - confirm.
df_plot = df_plot.assign(tsne1=X_embedded[:, 0], tsne2=X_embedded[:, 1])

# Scatter of the t-SNE embedding, coloured by superpopulation.
tsne_color = alt.Color(
    'Superpopulation name',
    scale=alt.Scale(scheme="category20"),
)
alt.Chart(df_plot).mark_point().encode(x='tsne1', y='tsne2', color=tsne_color)
