In [None]:
import pandas as pd
import duckdb 
import pickle
from tqdm import tqdm
import shutil

In [None]:
# Notebook-wide pandas display settings: do not truncate cell text, do not
# wrap wide frames across multiple lines, and show every column.
pd.options.display.max_colwidth = None
pd.options.display.expand_frame_repr = False
pd.options.display.max_columns = None

In [None]:
# Directory holding the preprocessed TSV tables (a Kaggle input dataset path).
base_path = "/kaggle/input/preprocessed-capstone-data-3/data"


def _read_tsv(file_name):
    """Read one tab-separated file located under ``base_path`` into a DataFrame."""
    return pd.read_csv(f"{base_path}/{file_name}", sep="\t")


gene_df = _read_tsv("gene_df.tsv")
allele_df = _read_tsv("allele_df.tsv")
hgvs4variation_df = _read_tsv("hgvs4variation_df.tsv")
cross_references_df = _read_tsv("cross_references_df.tsv")
# Note: this file name carries a "_1" suffix, unlike the other tables.
organization_summary_df = _read_tsv("organization_summary_df_1.tsv")
submission_summary_df = _read_tsv("submission_summary_df.tsv")
summary_of_conflicting_interpretations_df = _read_tsv("summary_of_conflicting_interpretations_df.tsv")
var_citations_df = _read_tsv("var_citations_df.tsv")

In [None]:
def _gene_disease_sentence(gene_disease):
    """Return the disease sentence (without its trailing period) for one gene.

    Parameters
    ----------
    gene_disease : str or missing value
        Content of the ``GeneLevelDisease`` column for a single row.

    Returns
    -------
    str
        The fixed "no association" sentence when the value is missing
        (NaN/None), blank, or already that placeholder (case-insensitive);
        otherwise a sentence that embeds the value exactly as given.
    """
    no_association = "No gene-disease association has been submitted to ClinVar for this gene"

    # pandas.read_csv parses empty fields as NaN (a float) by default, so
    # calling .strip() directly on the raw value would raise AttributeError.
    if pd.isna(gene_disease):
        return no_association

    normalized = str(gene_disease).strip().lower()
    if normalized in ("", no_association.lower()):
        return no_association

    return f"This gene has been associated with: {gene_disease}"


# Build one 'gene' document per row of gene_df: GeneID goes into the metadata,
# and the page content is two lines (symbol/full name, then the disease sentence).
gene_list = []
for _, gene_row in gene_df.iterrows():
    disease_sentence = _gene_disease_sentence(gene_row["GeneLevelDisease"])

    content_lines = [
        f"{gene_row['GeneSymbol']} is the gene symbol, and its full name is {gene_row['GeneName']}.",
        f"{disease_sentence}.".strip(),
    ]

    gene_list.append({
        'type': 'gene',
        'metadata': {
            'GeneID': gene_row["GeneID"]
        },
        'page_content': "\n".join(content_lines)
    })

In [None]:
def _allele_doc(row):
    """Build the 'allele' document dict for a single row of ``allele_df``."""
    # A lone space when TestedInGTR equals 'Y'; any other value yields " not ".
    gtr_connector = ' ' if row['TestedInGTR'] == 'Y' else ' not '
    return {
        'type': 'allele',
        'metadata': {
            'AlleleID': row["AlleleID"],
            'GeneID': row["GeneID"],
            'VariationID': row["VariationID"]
        },
        # .strip() removes the trailing newline left by the template.
        'page_content': f"""The allele is linked to the following phenotypes: {row['PhenotypeList']} with clinical significance as {row['ClinicalSignificance']}.
Testing for this variant is{gtr_connector}registered in GTR.
""".strip()
    }


# One document per allele row, with a labelled progress bar.
allele_list = [
    _allele_doc(row)
    for _, row in tqdm(allele_df.iterrows(), total=len(allele_df), desc="Processing Alleles")
]

In [None]:
def _organization_summary_doc(row):
    """Build the 'organization_summary' document dict for a single row."""
    # .strip() trims the whole rendered text, so it also removes any leading
    # whitespace carried by OrganizationName (the first interpolated value).
    page_content = f"""{row['OrganizationName']} is an organization of type "{row['InstitutionType']}".
The highest review status achieved is: {row['MaximumReviewStatus']}.
Submission collection methods include: {row['CollectionMethods']}.
The clinical significance categories submitted by this organization include: {row['ClinicalSignificanceCategoriesSubmitted']}.
        """.strip()
    return {
        'type': 'organization_summary',
        'metadata': {
            'OrganizationID': row["OrganizationID"]
        },
        'page_content': page_content
    }


# One document per organization row.
organization_summary_list = [
    _organization_summary_doc(row)
    for _, row in tqdm(organization_summary_df.iterrows(), total=len(organization_summary_df))
]

In [None]:
def _submission_summary_doc(row):
    """Build the 'submission_summary' document dict for a single row."""
    # .strip() drops the template's leading newline and trailing indentation;
    # it also trims trailing whitespace of Description, the last value.
    # NOTE(review): a missing field would be interpolated as the text "nan" —
    # confirm the preprocessed table has no nulls in these columns.
    page_content = f"""
This submission was made by: {row['Submitter']}.

The clinical significance of the variant in this submission is classified as: {row['ClinicalSignificance']}.
Review status for this interpretation is: {row['ReviewStatus']}.

The condition was interpreted based on the following submitted phenotype information: {row['SubmittedPhenotypeInfo']}.
ClinVar reports this using the MedGen terms: {row['ReportedPhenotypeInfo']}.

Collection method used for this data: {row['CollectionMethod']}.
Reported origin(s) and number of observations: {row['OriginCounts']}.

Explanation provided for the interpretation: {row['ExplanationOfInterpretation']}.

Contributes to aggregate classification: {row['ContributesToAggregateClassification']}.

Additional description provided by the submitter:
{row['Description']}
        """.strip()
    return {
        'type': 'submission_summary',
        'metadata': {
            'VariationID': row["VariationID"]
        },
        'page_content': page_content
    }


# One document per submission row.
submission_summary_list = [
    _submission_summary_doc(row)
    for _, row in tqdm(submission_summary_df.iterrows(), total=len(submission_summary_df))
]

In [None]:
def _conflicting_interpretations_doc(row):
    """Build the 'summary_of_conflicting_interpretations' document dict for one row."""
    # .strip() drops the template's leading newline and trailing indentation;
    # it also trims trailing whitespace of Variant_type, the last value.
    page_content = f"""
Two submitters have provided differing interpretations for the variant described as: {row['ClinVar_Preferred']}.

Submitter 1: {row['Submitter1']}
- Clinical significance: {row['Submitter1_ClinSig']}
- Review status: {row['Submitter1_ReviewStatus']}
- Interpretation description: {row['Submitter1_Description']}
- Method(s) used: {row['Submitter1_Method']}

Submitter 2: {row['Submitter2']}
- Clinical significance: {row['Submitter2_ClinSig']}
- Review status: {row['Submitter2_ReviewStatus']}
- Interpretation description: {row['Submitter2_Description']}
- Method(s) used: {row['Submitter2_Method']}

Conflict reported: {row['Conflict_Reported']}
Rank difference between interpretations: {row['Rank_diff']}
Variant type: {row['Variant_type']}
        """.strip()
    return {
        'type': 'summary_of_conflicting_interpretations',
        'metadata': {
            'VariationID': row["VariationID"]
        },
        'page_content': page_content
    }


# One document per pair of conflicting submissions.
summary_of_conflicting_interpretations_list = [
    _conflicting_interpretations_doc(row)
    for _, row in tqdm(
        summary_of_conflicting_interpretations_df.iterrows(),
        total=len(summary_of_conflicting_interpretations_df),
    )
]