<a href="https://colab.research.google.com/github/Kiron-Ang/DSC/blob/main/binf_3350_data_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Print the Python interpreter version of this runtime (IPython shell escape).
!python -V

Python 3.10.12


In [2]:
import pandas as pd

# Remote locations of the two tab-separated genotype tables.
url_puerto_rican_population = "https://github.com/Kiron-Ang/DSC/blob/main/binf_3350_puerto_rican_population.vcf?raw=true"
url_other_populations = "https://github.com/Kiron-Ang/DSC/blob/main/binf_3350_other_populations.vcf?raw=true"

# Read each table into its own DataFrame; the first line of each file is used
# as the column header.
puerto_rican_population = pd.read_csv(url_puerto_rican_population, sep="\t")
other_populations = pd.read_csv(url_other_populations, sep="\t")

def print_preview(df, name):
  """Print the first rows and the dimensions of ``df`` under the label ``name``.

  Args:
    df: DataFrame to summarise.
    name: Human-readable label interpolated into the printed messages.
  """
  print(f"Preview of {name} dataframe:")
  print(df.head())

  row_count = df.shape[0]
  column_count = df.shape[1]
  print(f"Shape of {name} dataframe: {row_count} rows, {column_count} columns")

  # Blank line, a 50-dash rule, blank line: visually separates successive previews.
  divider = "-" * 50
  print(f"\n{divider}\n")

# Show the head and dimensions of each loaded table.
print_preview(df=puerto_rican_population, name="Puerto Rican Population")
print_preview(df=other_populations, name="Other Populations")

Preview of Puerto Rican Population dataframe:
   #CHROM        POS                        ID REF ALT QUAL FILTER  \
0       1  146648224                 rs4950382   C   T    .   PASS   
1       1  159802683  rs3795331;SNP1-158069307   G   A    .   PASS   
2       1  161298126            SNP1-159564750   T   G    .   PASS   
3       2   21397182                  rs506585   G   A    .   PASS   
4       2   21415763             SNP2-21269268   G   A    .   PASS   

                                          INFO FORMAT HG00551  ... HG01199  \
0    AC=1423;AF=0.309;AN=4604;set=Intersection     GT     0/0  ...     0/0   
1     AC=218;AF=0.047;AN=4636;set=Intersection     GT     0/0  ...     0/0   
2  AC=14;AF=3.046e-03;AN=4596;set=Intersection     GT     0/0  ...     0/0   
3    AC=3795;AF=0.819;AN=4636;set=Intersection     GT     1/1  ...     1/1   
4    AC=3440;AF=0.756;AN=4552;set=Intersection     GT     1/1  ...     1/1   

  HG01204 HG01205 HG01206 HG01241 HG01242 HG01243 HG01247 HG0124

In [3]:
# Later cells pair row i of one table with row i of the other, so both tables
# must have the same number of rows.
# NOTE(review): only the row counts are compared here, not whether row i
# describes the same variant in both tables — confirm the files are aligned.
assert len(puerto_rican_population) == len(other_populations), (
    f"Length mismatch: puerto_rican_population has {len(puerto_rican_population)} rows, "
    f"but other_populations has {len(other_populations)} rows."
)

In [4]:
def count_categories_in_population(df, start_index=9):
    """Count non-carriers and carriers of the alternate allele for every row.

    Each row of ``df`` is treated as one variant and every column from
    position ``start_index`` onward as one person's genotype call. Phased
    calls (``"0|1"``) are counted exactly like unphased ones (``"0/1"``),
    because phase does not change whether a person carries the allele.

    Args:
        df: DataFrame whose columns from ``start_index`` onward hold genotype
            strings such as ``"0/0"``, ``"0/1"``, ``"1/0"`` or ``"1/1"``.
        start_index: Position of the first genotype column. The default of 9
            skips the nine leading columns ``#CHROM`` through ``FORMAT``.

    Returns:
        A list with one ``[people_with_none, people_with_at_least_one]`` pair
        of ints per row of ``df``, in row order. Calls that are neither
        ``"0/0"`` nor one of the carrier genotypes (for example a missing
        ``"./."`` call or an empty cell) are left out of both counts.
    """
    carrier_genotypes = {"0/1", "1/0", "1/1"}

    # Select the genotype columns by position and iterate the raw values;
    # this avoids building a Series object for every row.
    genotype_rows = df.iloc[:, start_index:].to_numpy()

    nested_list = []
    for calls in genotype_rows:
        people_with_none = 0
        people_with_at_least_one = 0
        for value in calls:
            if not isinstance(value, str):
                # Non-string cells (e.g. NaN) carry no genotype call.
                continue
            # Normalise the phased separator so "0|1" and "0/1" are equivalent.
            genotype = value.replace("|", "/")
            if genotype == "0/0":
                people_with_none += 1
            elif genotype in carrier_genotypes:
                people_with_at_least_one += 1
        nested_list.append([people_with_none, people_with_at_least_one])
    return nested_list

# Per-variant [no copies, at least one copy] counts for each cohort.
puerto_rican_nested_list = count_categories_in_population(puerto_rican_population)
other_populations_nested_list = count_categories_in_population(other_populations)

# Each heading is followed by its list on the next line.
print(
    "Puerto Rican Population with SNPS (No alleles, at least one):",
    puerto_rican_nested_list,
    sep="\n",
)
print(
    "\nOther Populations with SNPS (No alleles, at least one):",
    other_populations_nested_list,
    sep="\n",
)

Puerto Rican Population with SNPS (No alleles, at least one):
[[50, 49], [98, 2], [99, 0], [3, 97], [4, 93], [99, 1], [29, 71], [99, 1], [10, 90], [100, 0], [100, 0], [10, 90], [75, 25], [99, 0], [67, 32], [95, 4], [2, 97], [28, 72], [52, 48], [12, 88], [91, 9], [26, 74], [96, 4], [97, 3], [41, 59], [100, 0], [92, 8], [14, 84], [100, 0], [59, 41], [91, 9], [46, 54], [25, 75], [41, 59], [12, 88], [98, 2], [31, 68], [52, 48], [76, 24], [16, 83], [53, 47], [76, 24]]

Other Populations with SNPS (No alleles, at least one):
[[1064, 1139], [2016, 202], [2185, 14], [116, 2102], [236, 1943], [2106, 112], [636, 1582], [2102, 114], [198, 1843], [2193, 22], [2209, 9], [762, 1455], [1573, 644], [2214, 2], [1695, 516], [2093, 79], [189, 2014], [666, 1551], [1333, 883], [271, 1946], [1921, 296], [689, 1496], [2113, 104], [2148, 69], [747, 1266], [2156, 60], [1761, 454], [122, 2080], [2199, 14], [1222, 995], [2133, 77], [895, 1320], [685, 1529], [775, 1443], [778, 1439], [2192, 25], [654, 1526], [879

In [5]:
# Both cohorts must have produced the same number of count pairs before the
# pairs are combined position by position.
assert len(puerto_rican_nested_list) == len(other_populations_nested_list), (
    f"Length mismatch: Puerto Rican list has {len(puerto_rican_nested_list)} items, "
    f"while Other Populations list has {len(other_populations_nested_list)} items."
)

In [6]:
# Pair the two count lists position by position: entry i is the 2x2 table
# [[Puerto Rican no-copy, Puerto Rican carrier], [other no-copy, other carrier]].
contingency_table_list = [
    [puerto_rican_counts, other_populations_nested_list[position]]
    for position, puerto_rican_counts in enumerate(puerto_rican_nested_list)
]

# Print a blank line, the heading, then the full list of tables.
print("\nFinal list of contingency tables:", contingency_table_list, sep="\n")


Final list of contingency tables:
[[[50, 49], [1064, 1139]], [[98, 2], [2016, 202]], [[99, 0], [2185, 14]], [[3, 97], [116, 2102]], [[4, 93], [236, 1943]], [[99, 1], [2106, 112]], [[29, 71], [636, 1582]], [[99, 1], [2102, 114]], [[10, 90], [198, 1843]], [[100, 0], [2193, 22]], [[100, 0], [2209, 9]], [[10, 90], [762, 1455]], [[75, 25], [1573, 644]], [[99, 0], [2214, 2]], [[67, 32], [1695, 516]], [[95, 4], [2093, 79]], [[2, 97], [189, 2014]], [[28, 72], [666, 1551]], [[52, 48], [1333, 883]], [[12, 88], [271, 1946]], [[91, 9], [1921, 296]], [[26, 74], [689, 1496]], [[96, 4], [2113, 104]], [[97, 3], [2148, 69]], [[41, 59], [747, 1266]], [[100, 0], [2156, 60]], [[92, 8], [1761, 454]], [[14, 84], [122, 2080]], [[100, 0], [2199, 14]], [[59, 41], [1222, 995]], [[91, 9], [2133, 77]], [[46, 54], [895, 1320]], [[25, 75], [685, 1529]], [[41, 59], [775, 1443]], [[12, 88], [778, 1439]], [[98, 2], [2192, 25]], [[31, 68], [654, 1526]], [[52, 48], [879, 1331]], [[76, 24], [1478, 740]], [[16, 83], [622

In [7]:
from scipy.stats import chi2_contingency
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2_contingency.html

# Variant identifiers, in the same row order as the contingency tables.
rs_numbers = puerto_rican_population["ID"].to_list()
print(rs_numbers)

# Layout of the report printed for a significant variant:
# (line prefix, per-variant count pairs, slot in the pair),
# where slot 0 = no copies of the allele and slot 1 = at least one copy.
report_rows = (
  ("\t Puerto Ricans without any copies of the allele: ", puerto_rican_nested_list, 0),
  ("\t Puerto Ricans with at least one copy of the allele: ", puerto_rican_nested_list, 1),
  ("\t Other people without any copies of the allele: ", other_populations_nested_list, 0),
  ("\t Other people with at least one copy of the allele: ", other_populations_nested_list, 1),
)

# NOTE(review): every variant is tested separately at the 0.05 level with no
# multiple-comparison adjustment — confirm that this is intended.
for snp_position, snp_id in enumerate(rs_numbers):
  # Chi-square test of independence on this variant's 2x2 table.
  test_outcome = chi2_contingency(contingency_table_list[snp_position])
  if test_outcome.pvalue < 0.05:
    print(f"Significant result! RS Number: {snp_id}, p-value: {test_outcome.pvalue}")
    for prefix, count_pairs, slot in report_rows:
      counts = count_pairs[snp_position]
      # Share of this cohort's counted people that fall in the given slot.
      share = counts[slot] / sum(counts) * 100
      print(f"{prefix}{counts[slot]} ({share}%)")

['rs4950382', 'rs3795331;SNP1-158069307', 'SNP1-159564750', 'rs506585', 'SNP2-21269268', 'SNP2-21269573', 'rs950661', 'rs7608048', 'SNP2-44013792', 'SNP2-44029175', 'SNP2-44071275', 'rs6746182', 'SNP2-88367318', 'rs4135293', 'rs13099634', 'rs5746247', 'rs2946392', 'SNP4-23596461', 'SNP4-23634786', 'SNP4-100925984', 'SNP4-120783281', 'rs2303937', 'SNP7-150432276', 'SNP8-19961055', 'SNP8-59734102', 'SNP8-59739383', 'SNP9-106647033', 'rs2740483', 'SNP9-106759583', 'rs715119', 'SNP11-116421290', 'rs7109649', 'rs11172156', 'rs10846765', 'rs7304293', 'SNP15-56799478', 'SNP15-88141435', 'rs4939593', 'SNP18-45436312', 'SNP19-50280302', 'rs3745157', 'SNP22-45030287']
Significant result! RS Number: rs3795331;SNP1-158069307, p-value: 0.022991348515797184
	 Puerto Ricans without any copies of the allele: 98 (98.0%)
	 Puerto Ricans with at least one copy of the allele: 2 (2.0%)
	 Other people without any copies of the allele: 2016 (90.892696122633%)
	 Other people with at least one copy of the alle