In [4]:
import pandas as pd
from scipy.stats import chi2_contingency

In [5]:
df = pd.read_csv('./cleaned_full_survey_data.csv')
profession_df = pd.read_csv("processed-datasets/profession_categories.csv")
df['profession'] = profession_df['Predicted_Category']
df.drop(columns='What is your current major or job field?', inplace=True)
df = df[df['profession'].isin(['Data Science', 'Software Development'])].copy(deep=True)

In [6]:
def chi_tester(df, col1, col2):
    # Create contingency table
    filtered_df = df[(df[col1] != 'Not Applicable') & (df[col2] != 'Not Applicable')]
    table = pd.crosstab(filtered_df[col1], filtered_df[col2])

    if table.size == 0:
        # return that there is no relationship when the table is empty after excluding not applicable records
        return 1
    chi2, p, dof, expected = chi2_contingency(table)

    return p


having too many "not applicable" records in the dataset made the chi square test result in false positives, so I decided to exclude 'not applicable' records from columns I am testing to provide more accurate results

In [7]:
# columns were selected based on the count of unique values there
chi_square_test_columns = [
    "What is your age?",
    "What is your gender?",
    "What is your current role?",
    "How many years of programming experience do you have?",
    "Languages_Used",
    "Python_Community_Support",
    "Java_Community_Support",
    "Which language do you use most frequently?",
    "  Which language do you prefer for the following tasks?   [Data Science & Machine Learning]",
    "  Which language do you prefer for the following tasks?   [Web Development]",
    "  Which language do you prefer for the following tasks?   [Mobile App Development]",
    "  Which language do you prefer for the following tasks?   [Enterprise Applications]",
    "  Which language do you prefer for the following tasks?   [Statistical Analysis]",
    "Which language do you perceive as the most efficient for your tasks?",
    "Python_Execution_Speed",
    "Java_Execution_Speed",
    "Python_Ease_of_Use",
    "Java_Ease_of_Use",
    "Python_Documentation",
    "Java_Documentation",
    "Python_Concurrency_Features",
    "Java_Concurrency_Features",
    "Python_Readability",
    "Java_Readability",
    "Which language do you think will dominate the job market in the next 5 years?",
    "Which language was the easiest for you to learn?",
    "Which language has the most beginner-friendly documentation and learning resources?",
    "In your field, which language is the most commonly used?",
    "Which language do you believe is most in demand for jobs?",
    "Which language would you recommend for someone entering your industry?",
    "profession",
    "Have you ever contributed to an open-source project in any of these languages?"
]
# Convert related_pairs to a DataFrame
related_pairs_df = pd.DataFrame(columns=["col1", "col2", 'p'])

# Add the p-values from correlation_dict to the DataFrame
for i, col1 in enumerate(chi_square_test_columns):
    for col2 in chi_square_test_columns[i+1:]:
        result = chi_tester(df, col1, col2)
        if result < 0.05:
            related_pairs_df = pd.concat([related_pairs_df, pd.DataFrame([[col1, col2, result]], columns=["col1", "col2", "p"])], ignore_index=True)

  related_pairs_df = pd.concat([related_pairs_df, pd.DataFrame([[col1, col2, result]], columns=["col1", "col2", "p"])], ignore_index=True)


In [8]:
related_pairs_df

Unnamed: 0,col1,col2,p
0,What is your age?,What is your current role?,1.139454e-08
1,What is your age?,How many years of programming experience do yo...,0.004681612
2,What is your age?,Which language has the most beginner-friendly ...,0.04123073
3,What is your gender?,Java_Documentation,0.02192916
4,What is your gender?,Have you ever contributed to an open-source pr...,0.01287122
5,What is your current role?,How many years of programming experience do yo...,2.52212e-07
6,What is your current role?,Python_Execution_Speed,0.0009020995
7,How many years of programming experience do yo...,Java_Execution_Speed,0.04035801
8,Languages_Used,profession,1.044393e-20
9,Which language do you use most frequently?,Which language was the easiest for you to learn?,0.004245461


In [9]:
for _, row in related_pairs_df.iterrows():
    col1, col2 = row['col1'], row['col2']
    print(f"Crosstab for '{col1}' and '{col2}':")
    crosstab = pd.crosstab(df[col1], df[col2])
    print(crosstab)
    print("\n")

Crosstab for 'What is your age?' and 'What is your current role?':
What is your current role?  Analyst  Data Scientist  Researcher  Student
What is your age?                                                       
18-24                             1              24           2      254
25-34                             0               1           0        3
45+                               0               0           1        1


Crosstab for 'What is your age?' and 'How many years of programming experience do you have?':
How many years of programming experience do you have?  1-3 years  7+ years  \
What is your age?                                                            
18-24                                                        192         9   
25-34                                                          4         0   
45+                                                            1         1   

How many years of programming experience do you have?  Less than 1 year  
What is

## most noticable relationships

In [10]:
pd.crosstab(df['Languages_Used'], df['profession'])

profession,Data Science,Software Development
Languages_Used,Unnamed: 1_level_1,Unnamed: 2_level_1
Java,2,62
Python,156,67


software developers use each language equally, but java usage in data science is more limited

In [11]:
pd.crosstab(df['Which language was the easiest for you to learn?'], df['Which language do you use most frequently?'])

Which language do you use most frequently?,All equally,Java,Python
Which language was the easiest for you to learn?,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Java,0,1,5
Not Applicable,19,37,194
Python,0,0,29
R,0,1,1


people who thought that python is easy to learn are using it frequently

there is no need to do t-tests on the dataset. why? because t-test is supposed to be applied on features with continuous data. there are numeric features I know. but they do not have much variety in values. they have only 5 unique values {1, 2, 3, 4, 5}, so chi square was a more good fit for them since we can compare the numeric features with categorical features.

In [13]:
import pandas as pd
from scipy.stats import chi2_contingency
import plotly.graph_objects as go
import plotly.io as pio

# Set default template for better aesthetics
pio.templates.default = "plotly_white"

# --- Data Loading and Preprocessing (same as original) ---
df = pd.read_csv('./cleaned_full_survey_data.csv')
profession_df = pd.read_csv("processed-datasets/profession_categories.csv")

# Ensure profession_df has the same index or merge carefully
# Assuming they align row-wise after initial loading
df['profession'] = profession_df['Predicted_Category']

df.drop(columns='What is your current major or job field?', inplace=True)

# Filter for relevant professions
df = df[df['profession'].isin(['Data Science', 'Software Development'])].copy(deep=True)

# --- Chi-Square Test Function (same as original) ---
def chi_tester(df, col1, col2):
    """
    Performs a Chi-Square test for independence on two columns of a DataFrame,
    excluding 'Not Applicable' values.
    """
    # Create contingency table, excluding 'Not Applicable'
    filtered_df = df[(df[col1] != 'Not Applicable') & (df[col2] != 'Not Applicable')]

    # Check if filtered_df is empty or results in a table with zero size
    if filtered_df.empty:
        # Cannot perform test if no data remains after filtering
        return 1.0 # Return a high p-value indicating no relationship could be tested

    # Ensure there is variability left after filtering
    if filtered_df[col1].nunique() <= 1 or filtered_df[col2].nunique() <= 1:
         # Cannot perform test if one or both columns have only one unique value
        return 1.0 # Return a high p-value

    try:
        table = pd.crosstab(filtered_df[col1], filtered_df[col2])

        # chi2_contingency requires all row/column sums to be non-zero.
        # pd.crosstab might create rows/cols with zero counts if a category
        # from the original data didn't appear in the filtered data.
        # The chi2_contingency function often handles this internally,
        # but explicit check can prevent errors on edge cases.
        # Let's rely on chi2_contingency's internal checks for now,
        # as handling sparsity manually is complex and might discard valid data.

        chi2, p, dof, expected = chi2_contingency(table)
        return p
    except ValueError as e:
        # This might happen if the resulting table has dimensions that
        # chi2_contingency cannot handle (e.g., 0xN or Nx0 after internal checks)
        return 1.0 # Return a high p-value if test fails


# --- Columns to Test (same as original) ---
chi_square_test_columns = [
    "What is your age?",
    "What is your gender?",
    "What is your current role?",
    "How many years of programming experience do you have?",
    "Languages_Used",
    "Python_Community_Support",
    "Java_Community_Support",
    "Which language do you use most frequently?",
    "  Which language do you prefer for the following tasks?   [Data Science & Machine Learning]",
    "  Which language do you prefer for the following tasks?   [Web Development]",
    "  Which language do you prefer for the following tasks?   [Mobile App Development]",
    "  Which language do you prefer for the following tasks?   [Enterprise Applications]",
    "  Which language do you prefer for the following tasks?   [Statistical Analysis]",
    "Which language do you perceive as the most efficient for your tasks?",
    "Python_Execution_Speed",
    "Java_Execution_Speed",
    "Python_Ease_of_Use",
    "Java_Ease_of_Use",
    "Python_Documentation",
    "Java_Documentation",
    "Python_Concurrency_Features",
    "Java_Concurrency_Features",
    "Python_Readability",
    "Java_Readability",
    "Which language do you think will dominate the job market in the next 5 years?",
    "Which language was the easiest for you to learn?",
    "Which language has the most beginner-friendly documentation and learning resources?",
    "In your field, which language is the most commonly used?",
    "Which language do you believe is most in demand for jobs?",
    "Which language would you recommend for someone entering your industry?",
    "profession",
    "Have you ever contributed to an open-source project in any of these languages?"
]

# --- Perform Chi-Square Tests and Collect Significant Pairs (same as original) ---
# Convert related_pairs to a DataFrame
related_pairs_df = pd.DataFrame(columns=["col1", "col2", 'p'])

# Add the p-values from correlation_dict to the DataFrame
tested_pairs_count = 0
significant_pairs_count = 0
for i, col1 in enumerate(chi_square_test_columns):
    for col2 in chi_square_test_columns[i+1:]:
        tested_pairs_count += 1
        p_value = chi_tester(df, col1, col2)
        if p_value is not None and p_value < 0.05:
            significant_pairs_count += 1
            related_pairs_df = pd.concat([related_pairs_df, pd.DataFrame([[col1, col2, p_value]], columns=["col1", "col2", "p"])], ignore_index=True)


# --- Visualize Significant Crosstabs using Plotly ---

if related_pairs_df.empty:
    print('')
else:
    for index, row in related_pairs_df.iterrows():
        col1, col2 = row['col1'], row['col2']

        # Create the crosstab table for visualization.
        # We use the full df here (filtered by profession) to show all counts,
        # including 'Not Applicable' if present, mirroring the user's original output format desire.
        # Note: The chi-square test was performed on data *excluding* 'Not Applicable'.
        crosstab = pd.crosstab(df[col1], df[col2])

        if crosstab.empty:
            continue


        # Create a heatmap
        fig = go.Figure(data=go.Heatmap(
                z=crosstab.values,
                x=crosstab.columns.tolist(),
                y=crosstab.index.tolist(),
                colorscale='Viridis', # or any other colorscale like 'Plasma', 'Inferno', 'Blues'
                text=crosstab.values, # Add counts as text labels
                texttemplate="%{text}", # Format text labels
                hovertemplate='<b>%{x}</b><br><b>%{y}</b><br>Count: %{z}<extra></extra>' # Custom hover info
            ))

        # Update layout
        fig.update_layout(
            title=f"Crosstab Heatmap: '{col1}' vs '{col2}'<br>(p={row['p']:.4f})",
            xaxis_title=col2,
            yaxis_title=col1,
            xaxis={'side': 'bottom'}, # Ensure x-axis labels are at the bottom
            margin=dict(l=100, r=100, t=100, b=100), # Add margins
            # Optional: Adjust height/width if needed
            # height=600,
            # width=800,
        )

        # Show the figure
        fig.show()

# Removed the extra print statements for the two specific crosstabs at the end.


The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.

