<a href="https://colab.research.google.com/github/Kiron-Ang/DSC/blob/main/BINF3350/data_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!python -V

In [None]:
import pandas as pd

# Locations of the two input files; both URLs point at .vcf files stored in
# the same GitHub repository directory.
url_puerto_rican_population = "https://github.com/Kiron-Ang/DSC/blob/main/BINF3350/puerto_rican_population.vcf?raw=true"
url_other_populations = "https://github.com/Kiron-Ang/DSC/blob/main/BINF3350/other_populations.vcf?raw=true"

# Parse each file as tab-separated text into its own dataframe.
puerto_rican_population = pd.read_csv(url_puerto_rican_population, sep="\t")
other_populations = pd.read_csv(url_other_populations, sep="\t")

def print_preview(df, name):
    """Print a short summary of a dataframe to standard output.

    The summary consists of a title line, the output of ``df.head()``, a line
    reporting the row and column counts, and a dashed divider surrounded by
    blank lines.

    Args:
        df: The dataframe to summarise.
        name: Human-readable label inserted into the title and shape lines.

    Returns:
        None. The summary is only printed.
    """
    divider = "\n" + "-"*50 + "\n"
    report = (
        f"Preview of {name} dataframe:",
        df.head(),
        f"Shape of {name} dataframe: {df.shape[0]} rows, {df.shape[1]} columns",
        divider,
    )
    # Each piece goes through its own print() call so the output keeps one
    # trailing newline per piece.
    for piece in report:
        print(piece)

# Show the head and shape of each freshly loaded table as a quick sanity check.
print_preview(df=puerto_rican_population, name="Puerto Rican Population")
print_preview(df=other_populations, name="Other Populations")

In [None]:
# Later cells pair row i of one table with row i of the other, so the two
# tables must contain the same number of rows.
assert len(puerto_rican_population) == len(other_populations), (
    f"Length mismatch: puerto_rican_population has {len(puerto_rican_population)} rows, "
    f"but other_populations has {len(other_populations)} rows."
)

In [None]:
def count_categories_in_population(df, start_index=9):
    """Count, for every row, how many genotype cells lack or carry the variant.

    Only the columns from position ``start_index`` onward are inspected. A
    cell equal to ``"0/0"`` counts as "no copy"; a cell equal to ``"1/0"``,
    ``"0/1"`` or ``"1/1"`` counts as "at least one copy". Any other value is
    ignored and contributes to neither count.

    Args:
        df: Dataframe with one row per variant and genotype strings in the
            columns at and after ``start_index``.
        start_index: Position of the first genotype column (default 9).

    Returns:
        A list with one ``[no_copy_count, at_least_one_copy_count]`` pair per
        row of ``df``, in row order.
    """
    variant_calls = ("1/0", "0/1", "1/1")
    sample_columns = df.iloc[:, start_index:]

    per_row_counts = []
    for _, genotypes in sample_columns.iterrows():
        without_variant = sum(1 for call in genotypes if call == "0/0")
        with_variant = sum(1 for call in genotypes if call in variant_calls)
        per_row_counts.append([without_variant, with_variant])
    return per_row_counts

# Per-row [no copy, at least one copy] counts for each population, using the
# function's default position for the first genotype column.
puerto_rican_nested_list = count_categories_in_population(df=puerto_rican_population)
other_populations_nested_list = count_categories_in_population(df=other_populations)

# Print each heading followed by its list of counts on the next line.
print(
    "Puerto Rican Population with SNPS (No alleles, at least one):",
    puerto_rican_nested_list,
    sep="\n",
)
print(
    "\nOther Populations with SNPS (No alleles, at least one):",
    other_populations_nested_list,
    sep="\n",
)

In [None]:
# The contingency tables are built by pairing entries at the same position,
# so both count lists must have the same number of entries.
assert len(puerto_rican_nested_list) == len(other_populations_nested_list), (
    f"Length mismatch: Puerto Rican list has {len(puerto_rican_nested_list)} items, "
    f"while Other Populations list has {len(other_populations_nested_list)} items."
)

In [None]:
# Build one 2x2 table per variant row. Each table is
# [Puerto Rican counts, other-population counts], where a counts entry is the
# [no copy, at least one copy] pair found at the same position in each list.
contingency_table_list = [
    [puerto_rican_counts, other_populations_nested_list[position]]
    for position, puerto_rican_counts in enumerate(puerto_rican_nested_list)
]

# Show the assembled tables
print("\nFinal list of contingency tables:")
print(contingency_table_list)

In [None]:
from scipy.stats import chi2_contingency
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2_contingency.html

# Variant identifiers come from the "ID" column, in row order, so position i
# lines up with contingency_table_list[i] and with both count lists.
rs_numbers = puerto_rican_population["ID"].to_list()
print(rs_numbers)

print("RS Number\tp-value\t% of Puerto Ricans with at least one copy of the variant\t% of Non-PR with at least one copy of the variant")

# For every variant, run a chi-square test on its 2x2 table and report the
# share of each population that carries at least one copy of the variant.
for number, rs_number in enumerate(rs_numbers):
    results = chi2_contingency(contingency_table_list[number])

    puerto_rican_counts = puerto_rican_nested_list[number]
    other_populations_counts = other_populations_nested_list[number]

    # Each percentage gets its own variable. Reusing a single name made the
    # second assignment overwrite the first, so both output columns showed the
    # other-populations value. The denominator is the number of recognised
    # genotype cells only ("0/0", "1/0", "0/1", "1/1").
    puerto_rican_percentage = (
        puerto_rican_counts[1] / sum(puerto_rican_counts) * 100
    )
    other_populations_percentage = (
        other_populations_counts[1] / sum(other_populations_counts) * 100
    )

    print(
        f"{rs_number}\t{results.pvalue}"
        f"\t{puerto_rican_percentage}\t{other_populations_percentage}"
    )