In [2]:
import pandas as pd
from scipy.stats import chi2_contingency

In [3]:
# Load the cleaned survey responses and the per-respondent profession labels.
df = pd.read_csv('./cleaned_full_survey_data.csv')
profession_df = pd.read_csv("processed-datasets/profession_categories.csv")

# Attach the predicted profession (aligned on the row index), drop the raw
# free-text field it replaces, and keep only the two professions under study.
df = (
    df
    .assign(profession=profession_df['Predicted_Category'])
    .drop(columns='What is your current major or job field?')
    .loc[lambda frame: frame['profession'].isin(['Data Science', 'Software Development'])]
    .copy(deep=True)
)

In [4]:
def chi_tester(df, col1, col2, alpha=0.05):
    """Run a chi-square test of independence between two columns of ``df``.

    Rows where either column equals 'Not Applicable' are removed before the
    contingency table is built. A short summary of the test is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    col1, col2 : str
        Names of the two columns to cross-tabulate.
    alpha : float, default 0.05
        Significance threshold used for the printed verdict only; the
        returned p-value is not affected by it.

    Returns
    -------
    float
        The p-value reported by ``chi2_contingency``, or ``1.0`` when no rows
        remain after filtering (the test is skipped in that case).
    """
    # Exclude 'Not Applicable' answers from both columns before cross-tabulating.
    filtered_df = df[(df[col1] != 'Not Applicable') & (df[col2] != 'Not Applicable')]
    table = pd.crosstab(filtered_df[col1], filtered_df[col2])

    # Nothing left to test: report it and return a non-significant p-value.
    if table.size == 0:
        print(f"Skipping test for {col1} and {col2} as the contingency table is empty.")
        return 1.0

    # Only the statistic and p-value are reported; dof and expected are unused.
    chi2, p, _dof, _expected = chi2_contingency(table)

    print(f'testing relationship between "{col1}" and "{col2}"')
    print(f"Chi2 Statistic: {chi2}")
    print(f"P-value: {p}")

    if p < alpha:
        print("Result: There is a significant relationship.")
    else:
        print("Result: No significant relationship.")
    return p


Having too many "Not Applicable" records in the dataset caused the chi-square test to produce false positives, so I decided to exclude "Not Applicable" records from the columns being tested in order to get more accurate results.

In [5]:
# Columns to include in the pairwise chi-square tests.
# They were selected based on the count of unique values in each column.
# NOTE(review): the exact cutoff used for that selection is not recorded here.
chi_square_test_columns = [
    "What is your age?",
    "What is your gender?",
    "What is your current role?",
    "How many years of programming experience do you have?",
    "Languages_Used",
    "Python_Community_Support",
    "Java_Community_Support",
    "Which language do you use most frequently?",
    "  Which language do you prefer for the following tasks?   [Data Science & Machine Learning]",
    "  Which language do you prefer for the following tasks?   [Web Development]",
    "  Which language do you prefer for the following tasks?   [Mobile App Development]",
    "  Which language do you prefer for the following tasks?   [Enterprise Applications]",
    "  Which language do you prefer for the following tasks?   [Statistical Analysis]",
    "Which language do you perceive as the most efficient for your tasks?",
    "Python_Execution_Speed",
    "Java_Execution_Speed",
    "Python_Ease_of_Use",
    "Java_Ease_of_Use",
    "Python_Documentation",
    "Java_Documentation",
    "Python_Concurrency_Features",
    "Java_Concurrency_Features",
    "Python_Readability",
    "Java_Readability",
    "Which language do you think will dominate the job market in the next 5 years?",
    "Which language was the easiest for you to learn?",
    "Which language has the most beginner-friendly documentation and learning resources?",
    "In your field, which language is the most commonly used?",
    "Which language do you believe is most in demand for jobs?",
    "Which language would you recommend for someone entering your industry?",
    "profession",
    "Have you ever contributed to an open-source project in any of these languages?"
]
# Run the chi-square test on every unordered pair of columns and keep the
# pairs whose p-value is below 0.05.
# NOTE(review): no multiple-comparison correction is applied; with this many
# pairwise tests some hits at the 0.05 level are expected by chance alone.
significant_pairs = []
for i, col1 in enumerate(chi_square_test_columns):
    for col2 in chi_square_test_columns[i + 1:]:
        result = chi_tester(df, col1, col2)
        if result < 0.05:
            significant_pairs.append({"col1": col1, "col2": col2, "p": result})

# Build the frame once from the collected records rather than concatenating
# inside the loop; passing `columns` keeps the schema even when nothing passed.
related_pairs_df = pd.DataFrame(significant_pairs, columns=["col1", "col2", "p"])

testing relationship between What is your age?" and "What is your gender?"
Chi2 Statistic: 0.38720582050885316
P-value: 0.8239850353221173
Result: No significant relationship.
testing relationship between What is your age?" and "What is your current role?"
Chi2 Statistic: 48.07897252337995
P-value: 1.1394538258246972e-08
Result: There is a significant relationship.
testing relationship between What is your age?" and "How many years of programming experience do you have?"
Chi2 Statistic: 15.009472153476526
P-value: 0.00468161153652947
Result: There is a significant relationship.
testing relationship between What is your age?" and "Languages_Used"
Chi2 Statistic: 1.7587412029427254
P-value: 0.4150440576097082
Result: No significant relationship.
testing relationship between What is your age?" and "Python_Community_Support"
Chi2 Statistic: 4.62154270209742
P-value: 0.5931844554964274
Result: No significant relationship.
testing relationship between What is your age?" and "Java_Community_S

In [6]:
# Column pairs whose chi-square p-value was below 0.05, with that p-value.
related_pairs_df

Unnamed: 0,col1,col2,p
0,What is your age?,What is your current role?,1.139454e-08
1,What is your age?,How many years of programming experience do yo...,0.004681612
2,What is your age?,Which language has the most beginner-friendly ...,0.04123073
3,What is your gender?,Java_Documentation,0.02192916
4,What is your gender?,Have you ever contributed to an open-source pr...,0.01287122
5,What is your current role?,How many years of programming experience do yo...,2.52212e-07
6,What is your current role?,Python_Execution_Speed,0.0009020995
7,How many years of programming experience do yo...,Java_Execution_Speed,0.04035801
8,Languages_Used,profession,1.044393e-20
9,Which language do you use most frequently?,Which language was the easiest for you to learn?,0.004245461


In [7]:
# Print the contingency table behind each significant pair.
# The same 'Not Applicable' filter used by the chi-square test is applied here
# so that the table shown is the table that was actually tested.
for col1, col2 in zip(related_pairs_df['col1'], related_pairs_df['col2']):
    applicable = df[(df[col1] != 'Not Applicable') & (df[col2] != 'Not Applicable')]
    print(f"Crosstab for '{col1}' and '{col2}':")
    print(pd.crosstab(applicable[col1], applicable[col2]))
    print("\n")

Crosstab for 'What is your age?' and 'What is your current role?':
What is your current role?  Analyst  Data Scientist  Researcher  Student
What is your age?                                                       
18-24                             1              24           2      254
25-34                             0               1           0        3
45+                               0               0           1        1


Crosstab for 'What is your age?' and 'How many years of programming experience do you have?':
How many years of programming experience do you have?  1-3 years  7+ years  \
What is your age?                                                            
18-24                                                        192         9   
25-34                                                          4         0   
45+                                                            1         1   

How many years of programming experience do you have?  Less than 1 year  
What is

## Most noticeable relationships

In [8]:
# Counts of respondents by language used (rows) and profession (columns).
pd.crosstab(index=df['Languages_Used'], columns=df['profession'])

profession,Data Science,Software Development
Languages_Used,Unnamed: 1_level_1,Unnamed: 2_level_1
Java,2,62
Python,156,67


Software developers use Java and Python about equally (62 vs. 67 respondents), but Java usage in data science is far more limited (2 vs. 156 respondents).

In [9]:
# Counts of respondents by easiest language to learn (rows) and most
# frequently used language (columns).
pd.crosstab(
    index=df['Which language was the easiest for you to learn?'],
    columns=df['Which language do you use most frequently?'],
)

Which language do you use most frequently?,All equally,Java,Python
Which language was the easiest for you to learn?,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Java,0,1,5
Not Applicable,19,37,194
Python,0,0,29
R,0,1,1


People who found Python the easiest language to learn also use it most frequently: all 29 of them report Python as their most-used language.

There is no need to run t-tests on this dataset. Why? Because a t-test is meant to be applied to features with continuous data. There are numeric features in the dataset, but they do not have much variety in their values: they take only 5 unique values {1, 2, 3, 4, 5}. The chi-square test is therefore a better fit for them, since it lets us compare these numeric features with the categorical features.