<a href="https://colab.research.google.com/github/Kiron-Ang/DSC/blob/main/binf_3350_data_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!python -V

Python 3.10.12


In [2]:
import pandas as pd

url_puerto_rican_population = (
  "https://github.com/Kiron-Ang/DSC/blob/main/binf_3350_puerto_rican_population.vcf?raw=true"
)
puerto_rican_population = pd.read_csv(url_puerto_rican_population, sep="\t")

url_other_populations = (
  "https://github.com/Kiron-Ang/DSC/blob/main/binf_3350_other_populations.vcf?raw=true"
)
other_populations = pd.read_csv(url_other_populations, sep="\t")

def print_preview(df, name):
  print(f"Preview of {name} dataframe:")
  print(df.head())
  print(f"Shape of {name} dataframe: {df.shape[0]} rows, {df.shape[1]} columns")
  print("\n" + "-"*50 + "\n")

print_preview(puerto_rican_population, "Puerto Rican Population")
print_preview(other_populations, "Other Populations")

Preview of Puerto Rican Population dataframe:
   #CHROM        POS                        ID REF ALT QUAL FILTER  \
0       1  146648224                 rs4950382   C   T    .   PASS   
1       1  159802683  rs3795331;SNP1-158069307   G   A    .   PASS   
2       1  161298126            SNP1-159564750   T   G    .   PASS   
3       2   21397182                  rs506585   G   A    .   PASS   
4       2   21415763             SNP2-21269268   G   A    .   PASS   

                                          INFO FORMAT HG00551  ... HG01199  \
0    AC=1423;AF=0.309;AN=4604;set=Intersection     GT     0/0  ...     0/0   
1     AC=218;AF=0.047;AN=4636;set=Intersection     GT     0/0  ...     0/0   
2  AC=14;AF=3.046e-03;AN=4596;set=Intersection     GT     0/0  ...     0/0   
3    AC=3795;AF=0.819;AN=4636;set=Intersection     GT     1/1  ...     1/1   
4    AC=3440;AF=0.756;AN=4552;set=Intersection     GT     1/1  ...     1/1   

  HG01204 HG01205 HG01206 HG01241 HG01242 HG01243 HG01247 HG0124

In [3]:
assert len(puerto_rican_population) == len(other_populations), \
    f"Length mismatch: puerto_rican_population has {len(puerto_rican_population)} rows, " \
    f"but other_populations has {len(other_populations)} rows."

In [5]:
def count_categories_in_population(df, start_index=9):
    """
    Counts the occurrences of categories ("0/0", "1/0" or "0/1", "1/1") for each person in the DataFrame.

    Args:
    - df (pd.DataFrame): DataFrame containing the population data.
    - start_index (int): Index to start iterating over for each person (default is 9).

    Returns:
    - list: A list of lists, each containing counts for "none", "one", and "two" for each person.
    """
    # Initialize an empty list to hold the nested lists for each individual
    nested_list = []

    # Iterate through each row in the dataframe
    for index, row in df.iterrows():
        # Initialize counters for each possible category (none, one, two)
        people_with_none = 0
        people_with_one = 0
        people_with_two = 0

        # Iterate through the values starting from the specified start index for each person
        for value in row[start_index:]:
            # Check if the value is "0/0" and increment the people_with_none counter
            if value == "0/0":
                people_with_none += 1
            # Check if the value is either "1/0" or "0/1" and increment the people_with_one counter
            elif value == "1/0" or value == "0/1":
                people_with_one += 1
            # Check if the value is "1/1" and increment the people_with_two counter
            elif value == "1/1":
                people_with_two += 1

        # Append the counts of "none", "one", and "two" to the nested list
        nested_list.append([people_with_none, people_with_one, people_with_two])

    # Return the final nested list containing the counts for each person
    return nested_list

# Call the function `count_categories_in_population` to process the 'puerto_rican_population' DataFrame
# and store the results in the `puerto_rican_nested_list`
puerto_rican_nested_list = count_categories_in_population(puerto_rican_population)

# Call the function `count_categories_in_population` to process the 'other_populations'
# and store the results in the `other_populations_nested_list`
other_populations_nested_list = count_categories_in_population(other_populations)

# Display the results for the Puerto Rican population categories
print("Puerto Rican Population Counts (None, One, Two):")
print(puerto_rican_nested_list)

# Display the results for the other population categories
print("\nOther Populations Counts (None, One, Two):")
print(other_populations_nested_list)

Puerto Rican Population Counts (None, One, Two):
[[50, 40, 9], [98, 2, 0], [99, 0, 0], [3, 32, 65], [4, 28, 65], [99, 1, 0], [29, 43, 28], [99, 1, 0], [10, 49, 41], [100, 0, 0], [100, 0, 0], [10, 37, 53], [75, 24, 1], [99, 0, 0], [67, 28, 4], [95, 4, 0], [2, 25, 72], [28, 48, 24], [52, 42, 6], [12, 51, 37], [91, 8, 1], [26, 52, 22], [96, 4, 0], [97, 3, 0], [41, 49, 10], [100, 0, 0], [92, 8, 0], [14, 41, 43], [100, 0, 0], [59, 37, 4], [91, 9, 0], [46, 43, 11], [25, 42, 33], [41, 43, 16], [12, 52, 36], [98, 2, 0], [31, 46, 22], [52, 39, 9], [76, 21, 3], [16, 45, 38], [53, 38, 9], [76, 23, 1]]

Other Populations Counts (None, One, Two):
[[1064, 913, 226], [2016, 188, 14], [2185, 14, 0], [116, 571, 1531], [236, 604, 1339], [2106, 104, 8], [636, 1072, 510], [2102, 106, 8], [198, 562, 1281], [2193, 21, 1], [2209, 9, 0], [762, 717, 738], [1573, 526, 118], [2214, 2, 0], [1695, 457, 59], [2093, 79, 0], [189, 582, 1432], [666, 1047, 504], [1333, 731, 152], [271, 949, 997], [1921, 279, 17], [689,

In [12]:
# Assert that the two nested lists have the same length
assert len(puerto_rican_nested_list) == len(other_populations_nested_list), \
    f"Length mismatch: Puerto Rican list has {len(puerto_rican_nested_list)} items, " \
    f"while Other Populations list has {len(other_populations_nested_list)} items."

In [20]:
# Initialize an empty list to hold the contingency table data
contingency_table_list = []
print("Initializing empty list for the contingency table.")

# Loop through the range of numbers from 0 to the length of puerto_rican_nested_list
for number in range(0, len(puerto_rican_nested_list)):
    print(f"Processing element {number + 1}/{len(puerto_rican_nested_list)}...")

    # Create a temporary list with the first elements of puerto_rican_nested_list and other_populations_nested_list
    temp_list = [puerto_rican_nested_list[number], other_populations_nested_list[number]]
    print(f"Created temporary list: {temp_list}")

    # Append the temporary list to the contingency_table_list
    contingency_table_list.append(temp_list)
    print(f"Appended temporary list to the contingency table. Current table size: {len(contingency_table_list)}")

# Print the final contingency table list
print("\nFinal list of contingency tables:")
print(contingency_table_list)

Initializing empty list for the contingency table.
Processing element 1/42...
Created temporary list: [[50, 40, 9], [1064, 913, 226]]
Appended temporary list to the contingency table. Current table size: 1
Processing element 2/42...
Created temporary list: [[98, 2, 0], [2016, 188, 14]]
Appended temporary list to the contingency table. Current table size: 2
Processing element 3/42...
Created temporary list: [[99, 0, 0], [2185, 14, 0]]
Appended temporary list to the contingency table. Current table size: 3
Processing element 4/42...
Created temporary list: [[3, 32, 65], [116, 571, 1531]]
Appended temporary list to the contingency table. Current table size: 4
Processing element 5/42...
Created temporary list: [[4, 28, 65], [236, 604, 1339]]
Appended temporary list to the contingency table. Current table size: 5
Processing element 6/42...
Created temporary list: [[99, 1, 0], [2106, 104, 8]]
Appended temporary list to the contingency table. Current table size: 6
Processing element 7/42...
C

In [22]:
from scipy.stats import chi2_contingency
for table in contingency_table_list:
  results = chi2_contingency(table)
  print(results)

Chi2ContingencyResult(statistic=0.2466835459383316, pvalue=0.8839614967823309, dof=2, expected_freq=array([[  47.90877498,   40.98479583,   10.10642919],
       [1066.09122502,  912.01520417,  224.89357081]]))
Chi2ContingencyResult(statistic=6.057122710633648, pvalue=0.04838519718536032, dof=2, expected_freq=array([[9.11993097e+01, 8.19672131e+00, 6.03968939e-01],
       [2.02280069e+03, 1.81803279e+02, 1.33960311e+01]]))


ValueError: The internally computed table of expected frequencies has a zero element at (0, 2).