### Goal:
- Take a look at the data, examine collected variables and identify potential connections.
- Find connections between data points and compare, isolate individual variables to inspect further.
- Clean data, identify where NA answers occur (Is there a connection between those who answered NA and their other answers?)
- Organize multi-select questions into individual categories.
- Form and test hypotheses using the data.

In [47]:
from scipy import stats
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import pandas as pd

# Auxiliary table read from var_names.csv (presumably a codebook for the
# survey variables -- TODO confirm against the file itself).
variable_df = pd.read_csv(filepath_or_buffer="var_names.csv")

# Survey responses. low_memory=False makes pandas parse the file in a single
# pass instead of in chunks, so each column gets one consistently inferred dtype.
original_df = pd.read_csv(
    filepath_or_buffer='CSCS_data_anon.csv',
    low_memory=False,
)

# Keep the frame as the cell's last expression so the notebook renders it.
original_df

Unnamed: 0,UNIQUE_id,UNIQUE_num_records,ELIGIBLE_consent,GEO_residence_canada,GEO_province,DEMO_age,DEMO_gender,DEMO_identity_vetrans,DEMO_identity_indigenous,DEMO_identity_lgbtq,...,PSYCH_body_self_image_questionnaire_height_dissatisfaction_score,PSYCH_body_self_image_questionnaire_fatness_evaluation_score,PSYCH_body_self_image_questionnaire_negative_affect_score,PSYCH_body_self_image_questionnaire_social_dependence_score,PSYCH_big_five_inventory_agreeable_score,PSYCH_big_five_inventory_conscientious_score,PSYCH_big_five_inventory_extraverted_score,PSYCH_big_five_inventory_neurotic_score,PSYCH_big_five_inventory_open_score,REMOVE_case
0,cscs_00001,1,Yes,Yes,British Columbia,71.0,Non-binary,,,"Sexual or gender minorities (e.g., LGBTQ2+)",...,,,,,,,,,,No
1,cscs_00002,1,Yes,Yes,Ontario,69.0,Woman,,,Not Selected,...,3.0,8.0,3.0,3.0,,,,,,No
2,cscs_00003,1,Yes,Yes,Quebec,56.0,Woman,,,Not Selected,...,,,,,,,,,,No
3,cscs_00005,1,Yes,Yes,,54.0,Woman,,,Not Selected,...,,,,,28.0,34.0,30.0,32.0,37.0,No
4,cscs_00006,1,Yes,Yes,Ontario,30.0,Man,Not Selected,"Indigenous peoples (e.g., First Nations, Métis...","Sexual or gender minorities (e.g., LGBTQ2+)",...,,,,,,,,,,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11426,cscs_11809,1,Yes,Yes,,45.0,Woman,,,"Sexual or gender minorities (e.g., LGBTQ2+)",...,,,,,31.0,33.0,33.0,13.0,39.0,No
11427,cscs_11810,1,Yes,Yes,British Columbia,36.0,Man,,,Not Selected,...,,,,,32.0,37.0,31.0,,38.0,No
11428,cscs_11812,3,Yes,,,,,,,,...,,,,,,,,,,No
11429,cscs_11812,3,Yes,,,,,,,,...,,,,,,,,,,Yes


In [48]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
numeric_summary = original_df.describe()
numeric_summary

Unnamed: 0,UNIQUE_num_records,DEMO_age,WELLNESS_life_satisfaction,GEO_housing_live_with_partner,GEO_housing_live_with_children,GEO_housing_live_with_grandkids,GEO_housing_live_with_parent,GEO_housing_live_with_in_laws,GEO_housing_live_with_siblings,GEO_housing_live_with_roommate,...,PSYCH_body_self_image_questionnaire_attention_to_grooming_score,PSYCH_body_self_image_questionnaire_height_dissatisfaction_score,PSYCH_body_self_image_questionnaire_fatness_evaluation_score,PSYCH_body_self_image_questionnaire_negative_affect_score,PSYCH_body_self_image_questionnaire_social_dependence_score,PSYCH_big_five_inventory_agreeable_score,PSYCH_big_five_inventory_conscientious_score,PSYCH_big_five_inventory_extraverted_score,PSYCH_big_five_inventory_neurotic_score,PSYCH_big_five_inventory_open_score
count,11431.0,10220.0,9599.0,2479.0,2747.0,2614.0,2478.0,2601.0,2600.0,2585.0,...,757.0,759.0,756.0,760.0,759.0,2264.0,2255.0,2280.0,2260.0,2259.0
mean,1.209343,46.811546,6.182206,0.671642,1.099381,0.323259,0.880145,0.678201,0.866154,0.575242,...,8.910172,7.113307,9.271164,7.963158,8.200264,32.269876,31.780488,23.916667,24.522124,39.313856
std,0.607192,18.10625,2.331559,0.469711,1.808359,1.116252,0.937145,1.347884,1.567688,1.449567,...,2.901352,3.775049,3.819332,3.817125,3.268455,5.521504,6.671941,6.282656,7.104042,6.726745
min,1.0,16.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,3.0,3.0,3.0,3.0,11.0,9.0,8.0,8.0,10.0
25%,1.0,30.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,7.0,3.0,6.0,4.0,6.0,28.0,27.0,20.0,19.0,35.0
50%,1.0,45.0,7.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,9.0,7.0,9.0,8.0,8.0,32.0,32.0,24.0,25.0,40.0
75%,1.0,62.0,8.0,1.0,2.0,0.0,2.0,1.0,1.0,0.0,...,11.0,10.0,12.25,11.0,11.0,36.0,37.0,28.0,30.0,45.0
max,4.0,96.0,10.0,1.0,10.0,10.0,2.0,10.0,10.0,10.0,...,15.0,15.0,15.0,15.0,15.0,45.0,45.0,40.0,40.0,50.0


In [49]:
# Non-null answers per column, computed once for the whole frame and then
# printed one column per line.
non_null_per_column = original_df.count()

print("Number of respondents per question:")
for variable, answered in non_null_per_column.items():
    print(f"{variable}: {answered}")

Number of respondents per question:
UNIQUE_id: 11431
UNIQUE_num_records: 11431
ELIGIBLE_consent: 11431
GEO_residence_canada: 10236
GEO_province: 7060
DEMO_age: 10220
DEMO_gender: 8014
DEMO_identity_vetrans: 3501
DEMO_identity_indigenous: 3680
DEMO_identity_lgbtq: 7974
DEMO_identity_disability: 3501
DEMO_identity_bipoc: 4438
DEMO_identity_pwud: 3501
DEMO_identity_newcomers: 3501
DEMO_identity_homeless: 3501
DEMO_identity_mental_health: 3501
DEMO_relationship_status: 9469
COVID_prevention_distancing: 8950
COVID_prevention_masks: 8950
COVID_prevention_hand_washing: 8949
COVID_prevention_reduce_people: 8950
COVID_prevention_avoid_trips: 8949
COVID_prevention_household: 8950
COVID_vaccinated: 8950
COVID_vaccinated_two_weeks_since_last_dose: 2656
WELLNESS_life_satisfaction: 9599
WELLNESS_malach_pines_burnout_measure_tired: 4708
WELLNESS_malach_pines_burnout_measure_disappointed: 4707
WELLNESS_malach_pines_burnout_measure_hopeless: 4708
WELLNESS_malach_pines_burnout_measure_trapped: 4707
WELL

In [50]:
# Age column: distinct answers, number of missing answers, and a histogram.
ages_df = original_df['DEMO_age']

# np.sort returns a sorted copy of the distinct values; NaN (no age given)
# is placed at the end.
response_range = np.sort(ages_df.unique())
na_count = ages_df.isna().sum()

fig = px.histogram(ages_df)
fig.show()

print(f"{response_range = }")
print(f"{na_count = }")

response_range = array([16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28.,
       29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
       42., 43., 44., 45., 46., 47., 48., 49., 50., 51., 52., 53., 54.,
       55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65., 66., 67.,
       68., 69., 70., 71., 72., 73., 74., 75., 76., 77., 78., 79., 80.,
       81., 82., 83., 84., 85., 86., 87., 88., 89., 90., 91., 92., 93.,
       96., nan])
na_count = 1211


In [51]:
# Histogram of the answers recorded in the COVID_vaccinated column.
vaccinated = original_df.loc[:, 'COVID_vaccinated']
fig = px.histogram(vaccinated)
fig.show()

In [52]:
# Per-age summary (count / unique / top / freq) of the vaccination answers.
#
# `vaccinated_and_age` stays bound to the two-column DataFrame; the GroupBy
# gets its own name. Rebinding `vaccinated_and_age` to the GroupBy object made
# later cells that expect a DataFrame (e.g. `vaccinated_and_age.value_counts()`)
# behave differently when this cell happened to run last.
vaccinated_and_age = original_df[["COVID_vaccinated", "DEMO_age"]]
vaccinated_by_age = vaccinated_and_age.groupby("DEMO_age")
vaccinated_by_age.describe()

Unnamed: 0_level_0,COVID_vaccinated,COVID_vaccinated,COVID_vaccinated,COVID_vaccinated
Unnamed: 0_level_1,count,unique,top,freq
DEMO_age,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
16.0,18,4,No,10
17.0,20,5,No,7
18.0,72,6,No,21
19.0,40,5,"Yes, three or more doses",16
20.0,96,6,"Yes, three or more doses",27
...,...,...,...,...
90.0,5,4,"Yes, three or more doses",2
91.0,6,4,"Yes, three or more doses",2
92.0,2,2,"Yes, five or more doses",1
93.0,1,1,"Yes, three or more doses",1


In [53]:
import plotly.express as px
import pandas as pd

# Stacked histogram of respondent age, coloured by vaccination answer.
vaccinated_and_age = original_df.loc[:, ["COVID_vaccinated", "DEMO_age"]]

# Order in which the vaccination answers are listed/stacked in the figure.
response_order = [
    "Presented but no response",
    "No",
    "Yes, one dose",
    "Yes, two doses",
    "Yes, three or more doses",
    "Yes, four doses",
    "Yes, five or more doses",
]

fig = px.histogram(
    vaccinated_and_age,
    x="DEMO_age",
    color="COVID_vaccinated",  # one colour per vaccination answer
    barmode="stack",
    title="Stacked Histogram of Vaccination by Age Group",
    labels=dict(COVID_vaccinated="Vaccination Status", DEMO_age="Age Group"),
    category_orders=dict(COVID_vaccinated=response_order),
)

# Axis titles and a small gap between neighbouring bars.
fig.update_layout(xaxis_title="Age Group", yaxis_title="Count", bargap=0.1)

fig.show()


In [54]:
# Share of each vaccination answer within every age, drawn as stacked bars.
# Expects `vaccinated_and_age` to be the two-column DataFrame
# (COVID_vaccinated, DEMO_age) built in an earlier cell.

# One row per (vaccination answer, age) combination with its frequency.
count_df = vaccinated_and_age.value_counts().rename('count').reset_index()

# Per-age totals aligned row-by-row with count_df, then the percentage share.
total_counts = count_df.groupby('DEMO_age')['count'].transform('sum')
count_df['percentage'] = count_df['count'].div(total_counts).mul(100)

# Order in which the vaccination answers are listed/stacked in the figure.
response_order = [
    "Presented but no response",
    "No",
    "Yes, one dose",
    "Yes, two doses",
    "Yes, three or more doses",
    "Yes, four doses",
    "Yes, five or more doses",
]

fig = px.bar(
    count_df,
    x="DEMO_age",
    y="percentage",
    color="COVID_vaccinated",  # one colour per vaccination answer
    barmode="stack",
    title="Stacked Histogram of Vaccination by Age Group (Percentage)",
    labels=dict(COVID_vaccinated="Vaccination Status", DEMO_age="Age Group"),
    category_orders=dict(COVID_vaccinated=response_order),
)

# Axis titles, bar spacing, and one-decimal tick labels on the y axis.
fig.update_layout(
    xaxis_title="Age Group",
    yaxis_title="Percentage (%)",
    bargap=0.1,
    yaxis_tickformat=".1f",
)

fig.show()
