# Classification into groups

In the study [Haplotype analyses reveal novel insights into tomato history and domestication including long-distance migrations and latitudinal adaptations](https://www.biorxiv.org/content/10.1101/2021.06.18.448912v2) tomato accessions were genotyped by Whole Genome Sequencing.

We have the genotypes in the file "tomato_genotypes.csv.gz" and we want to classify some of the accessions into groups.
The accessions to classify, and their classifications, are provided in the Python dictionary named "classification".
Our task is to replace the 'none' classification set for every accession with a suitable classification.

We will classify the accessions by doing a Principal Component Analysis with the genotypes. We also have passport data in the file "suppl_table_2_passport_and_morphological_data.csv.gz"; for example, it includes the "Taxon" and "Country" fields. So, we want to classify the accessions into groups by taking into account both the Principal Component Analysis result and this passport data.


In [None]:
import pandas
import plotly.express as px
import pca


# Accessions to classify, keyed by lower-case accession name.
# Every value starts as the placeholder 'none'; the exercise consists of
# replacing each placeholder with the group chosen for that accession.
# The keys also select which genotype columns are used for the PCA below.
classification = {
    'bgv005895': 'none', 'bgv005912': 'none', 'bgv006234': 'none',
    'bgv006235': 'none', 'bgv006327': 'none', 'bgv006336': 'none',
    'bgv006347': 'none', 'bgv006353': 'none', 'bgv006370': 'none',
    'bgv006454': 'none', 'bgv006457': 'none', 'bgv006767': 'none',
    'bgv006775': 'none', 'bgv006825': 'none', 'bgv006852': 'none',
    'bgv006859': 'none', 'bgv006867': 'none', 'bgv006904': 'none',
    'bgv006910': 'none', 'bgv006931': 'none', 'bgv006934': 'none',
    'bgv007023': 'none', 'bgv007111': 'none', 'bgv007149': 'none',
    'bgv007151': 'none', 'bgv007152': 'none', 'bgv007158': 'none',
    'bgv007161': 'none', 'bgv007169': 'none', 'bgv007181': 'none',
    'bgv007194': 'none', 'bgv007198': 'none', 'bgv007854': 'none',
    'bgv007857': 'none', 'bgv007860': 'none', 'bgv007862': 'none',
    'bgv007863': 'none', 'bgv007864': 'none', 'bgv007865': 'none',
    'bgv007867': 'none', 'bgv007870': 'none', 'bgv007871': 'none',
    'bgv007872': 'none', 'bgv007875': 'none', 'bgv007876': 'none',
    'bgv007878': 'none', 'bgv007901': 'none', 'bgv007902': 'none',
    'bgv007908': 'none', 'bgv007909': 'none', 'bgv007910': 'none',
    'bgv007911': 'none', 'bgv007918': 'none', 'bgv007921': 'none',
    'bgv007927': 'none', 'bgv007931': 'none', 'bgv007933': 'none',
    'bgv007934': 'none', 'bgv007936': 'none', 'bgv007981': 'none',
    'bgv007989': 'none', 'bgv008036': 'none', 'bgv008037': 'none',
    'bgv008041': 'none', 'bgv008051': 'none', 'bgv008058': 'none',
    'bgv008061': 'none', 'bgv008065': 'none', 'bgv008067': 'none',
    'bgv008070': 'none', 'bgv008077': 'none', 'bgv008095': 'none',
    'bgv008098': 'none', 'bgv008100': 'none', 'bgv008106': 'none',
    'bgv008108': 'none', 'bgv008218': 'none', 'bgv008219': 'none',
    'bgv008223': 'none', 'bgv008224': 'none', 'bgv008225': 'none',
    'bgv008345': 'none', 'bgv008348': 'none', 'bgv012615': 'none',
    'bgv012625': 'none', 'bgv012626': 'none', 'bgv012627': 'none',
    'bgv012640': 'none', 'bgv013134': 'none', 'bgv013161': 'none',
    'bgv013175': 'none', 'bgv013945': 'none', 'bgv015726': 'none',
    'bgv015734': 'none', 'la1712': 'none'}


# Names of the accession-data columns used in this notebook.
taxon_column_name = 'Taxon'
country_column_name = 'Country'
# NOTE(review): the identifier is misspelled ("clasification"); it is kept
# as-is because the code that adds the column refers to it by this name.
clasification_column_name = 'Classification'

def plot_projections_by_taxon_and_country(pca_result, passport_data, color_column=None):
    """Show a scatter plot of the first two principal components.

    The PCA projections are joined with the passport data on their index, so
    every point can be coloured by a passport column and lists its index
    label, taxon, country and classification when hovered.

    Args:
        pca_result: Mapping whose 'projections' entry holds the projections,
            with 'princomp_1' and 'princomp_2' columns (presumably a
            DataFrame indexed by accession; confirm against the pca module).
        passport_data: DataFrame joined to the projections; it must provide
            the taxon, country and classification columns.
        color_column: Optional name of the column used to colour the points.
            With None, no colour mapping is requested.
    """
    projections = pca_result['projections']
    dframe_to_plot = projections.join(passport_data)

    # Use the module-level column names so the hover labels stay in sync with
    # the column that is added to the passport data elsewhere in the file.
    hover_columns = [dframe_to_plot.index, taxon_column_name,
                     country_column_name, clasification_column_name]

    fig = px.scatter(dframe_to_plot, x='princomp_1', y='princomp_2',
                     color=color_column, hover_data=hover_columns)
    fig.show()


# Load the genotypes and the passport table; the first 22 lines of the
# passport file are skipped and its first column becomes the index.
genotypes = pandas.read_csv('tomato_genotypes.csv.gz')
accession_passport_data = pandas.read_csv(
    'tomato_passport_and_morphological_data.csv.gz',
    index_col=0,
    skiprows=22,
)

# Lower-case the accession names so they can be looked up in `classification`.
accession_passport_data.index = [
    name.lower() for name in accession_passport_data.index
]

# Accessions without an entry in `classification` are labelled 'nd'.
classifications = [
    classification.get(name, 'nd') for name in accession_passport_data.index
]
accession_data = accession_passport_data  # same object, not a copy
accession_data[clasification_column_name] = classifications

# Only the accessions that have to be classified go into the PCA.
accessions_to_use = sorted(classification)
genotypes = genotypes.loc[:, accessions_to_use]

pca_result = pca.do_pca_from_012_genotypes(genotypes)

plot_projections_by_taxon_and_country(pca_result, accession_data)