In [345]:
# Standard library
import os
import warnings

# Data processing libraries
import numpy as np
import pandas as pd

# Visualisation libraries
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import palettable.scientific.sequential as palette
import seaborn as sns

# Data import on Kaggle: print the path of every file found under the input directory.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# Suppress every warning raised from here on and use seaborn's plain white style.
warnings.filterwarnings("ignore")
sns.set_style('white')


In [347]:
# Read the 2021 survey responses and discard the first data row.
# NOTE(review): that row presumably holds the question text rather than an answer - confirm.
df = (
    pd.read_csv('../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv')
    .iloc[1:]
)

## Possible Questions

- How do students differ in developed countries vs developing countries?
   - Differences in respondents in India vs USA
   - Similarities in respondents in India-China vs US-Canada
   - Models used
       - Do particular genders favor certain models (country comparison)?
       - Model used vs Work Industry/Company Size

In [8]:
# Preview the first five responses (head() returns five rows by default).
df.head()

## Initial Plots

Let's look at some distributions: 
- Gender distributions
- Distribution plots of some select questions compared with Gender

In [346]:
def gender_split(data=None):
    """Split survey responses into men, women and everyone else.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Responses containing a 'Q2' column. When omitted, the notebook-level
        ``df`` is used, so existing ``gender_split()`` calls behave as before.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(male_df, female_df, other_df)``: rows whose 'Q2' equals 'Man',
        rows whose 'Q2' equals 'Woman', and all remaining rows (any other
        answer, including a missing one).
    """
    if data is None:
        # Fall back to the global survey frame for backward compatibility.
        data = df
    is_man = data['Q2'] == 'Man'
    is_woman = data['Q2'] == 'Woman'
    return data[is_man], data[is_woman], data[~(is_man | is_woman)]
male_df, female_df, other_df = gender_split()

fig = plt.figure(figsize=(17, 8))
spec = gridspec.GridSpec(ncols=3, nrows=2, figure=fig)

# --- Gender split (donut chart, top-left) ---
ax = fig.add_subplot(spec[0, 0])
gender_idx = ['Male', 'Woman', 'Other']
# Count each wedge by its answer label rather than by its rank in value_counts(),
# so a label can never be attached to the wrong count. This also avoids writing
# into the array returned by `.values`, which can be read-only when pandas
# Copy-on-Write is enabled.
n_man = (df['Q2'] == 'Man').sum()
n_woman = (df['Q2'] == 'Woman').sum()
n_other = df['Q2'].notna().sum() - n_man - n_woman  # every other non-missing answer
gender_vals = [n_man, n_woman, n_other]
ax.pie(gender_vals, explode=(0, 0.1, 0.2), labels=gender_idx, colors=palette.Acton_4.hex_colors)
# A white disc over the centre turns the pie into a donut; it is added to this
# axes explicitly instead of to whatever pyplot considers the current axes.
ax.add_artist(plt.Circle((0, 0), 0.7, color='white'))

# --- Education split (bar chart, top-right) ---
edu_order = ['No formal education past high school', 'Professional doctorate',
             'Bachelor’s degree', "Master’s degree", 'Doctoral degree',
             'Some college/university study without earning a bachelor’s degree',
             'I prefer not to answer']
# Short tick labels, in the same order as edu_order.
edu_idx = ['High School', 'Doctorate',
           'Bachelor’s', "Master’s", 'Doctoral degree',
           'Other', 'No Answer']
# fill_value=0 draws an answer nobody gave as an empty bar instead of NaN.
edu_ser = df['Q4'].value_counts().reindex(edu_order, fill_value=0)
edu_vals = edu_ser.values
ax_edu = fig.add_subplot(spec[0, 2])
ax_edu.bar(edu_idx, edu_vals, color=palette.Acton_7.hex_colors)
# Rotate the existing tick labels rather than re-setting label text on unfixed ticks.
ax_edu.tick_params(axis='x', labelrotation=40)

# --- Current role (bar chart, full-width bottom row) ---
role_order = ['Developer Relations/Advocacy', 'Statistician',
              'Data Engineer', 'Business Analyst', 'Research Scientist', 'Data Analyst',
              'Software Engineer', 'Student', 'Data Scientist', 'Other',
              'Currently not employed', 'Machine Learning Engineer',
              'Program/Project Manager', 'Product Manager', 'DBA/Database Engineer']
# Short tick labels, in the same order as role_order.
role_idx = ['Developer Relations', 'Statistician',
            'Data Engineer', 'Business Analyst', 'Research Scientist', 'Data Analyst',
            'Software Engineer', 'Student', 'Data Scientist', 'Other',
            'Unemployed', 'ML Engineer', 'Project Manager', 'Product Manager',
            'Database Engineer']
role_ser = df['Q5'].value_counts().reindex(role_order, fill_value=0)
role_vals = role_ser.values
ax_role = fig.add_subplot(spec[1, :])
ax_role.bar(role_idx, role_vals, color=palette.Acton_15.hex_colors)
ax_role.tick_params(axis='x', labelrotation=40)

# Remove the frame from every panel.
for panel in (ax, ax_edu, ax_role):
    for side in ('top', 'right', 'bottom', 'left'):
        panel.spines[side].set_visible(False)

# --- Headline and commentary, placed in figure coordinates in the empty top-middle cell ---
fig.text(0.28, 0.75, 'Distributions of the respondants', fontsize=17, fontweight='bold', fontfamily='sans-serif')
fig.text(0.28, 0.52, 
'''From the figures below, clearly, most of the repondants are originated
from India and more significantly, the majority are male. We also take a 
quick look at their education backgrounds and their experience. Don't 
worry, we will dive into the correlations between these features soon. 
As mentioned earlier, our analysis here is geared towards comparisons in 
developed vs developing countries so we're looking at a few variables that 
might be important later.
'''
, fontsize=14, fontweight='light', fontfamily='sans-serif')

fig.tight_layout()
plt.subplots_adjust(wspace=1)
plt.show()






## Country Comparison

Our concern here is that, since the majority of the data comes from Indian respondents, our earlier plots could be misleading. Here, we will compare the data from a developing country (India) against a developed country (USA) and see how the results hold up. Are Kaggle users still predominantly "male students"?

In [352]:
# Five countries (Q3) with the most non-null Q1 answers, largest first.
display(
    df.groupby('Q3')['Q1']
    .count()
    .sort_values(ascending=False)
    .head(5)
)

In [344]:
ind_df = df[df['Q3'] == 'India']
us_df = df[df['Q3'] == 'United States of America']

gender_idx = ['Male', 'Woman', 'Other']

# Survey answers in plotting order, each paired by position with a short tick label.
edu_order = ['No formal education past high school', 'Professional doctorate',
             'Bachelor’s degree', "Master’s degree", 'Doctoral degree',
             'Some college/university study without earning a bachelor’s degree',
             'I prefer not to answer']
edu_idx = ['High School', 'Doctorate',
           'Bachelor’s', "Master’s", 'Doctoral degree',
           'Other', 'No Answer']

role_order = ['Developer Relations/Advocacy', 'Statistician',
              'Data Engineer', 'Business Analyst', 'Research Scientist', 'Data Analyst',
              'Software Engineer', 'Student', 'Data Scientist', 'Other',
              'Currently not employed', 'Machine Learning Engineer',
              'Program/Project Manager', 'Product Manager', 'DBA/Database Engineer']
role_idx = ['Developer Relations', 'Statistician',
            'Data Engineer', 'Business Analyst', 'Research Scientist', 'Data Analyst',
            'Software Engineer', 'Student', 'Data Scientist', 'Other',
            'Unemployed', 'ML Engineer', 'Project Manager', 'Product Manager',
            'Database Engineer']


def _hide_spines(ax):
    """Hide all four spines (the frame) of `ax`."""
    for side in ('top', 'right', 'bottom', 'left'):
        ax.spines[side].set_visible(False)


def _plot_gender_donut(ax, responses, colors, title):
    """Draw a Male / Woman / Other donut chart of `responses['Q2']` on `ax`.

    Counts are looked up by answer label rather than by rank in value_counts(),
    so a label is never attached to the wrong count. 'Other' is every
    non-missing answer that is neither 'Man' nor 'Woman'.
    """
    n_man = (responses['Q2'] == 'Man').sum()
    n_woman = (responses['Q2'] == 'Woman').sum()
    n_other = responses['Q2'].notna().sum() - n_man - n_woman
    ax.pie([n_man, n_woman, n_other], explode=(0, 0.1, 0.2), labels=gender_idx, colors=colors)
    # A white disc over the centre turns the pie into a donut.
    ax.add_artist(plt.Circle((0, 0), 0.7, color='white'))
    ax.set_title(title)


def _plot_answer_bars(ax, answers, order, tick_labels, colors, title):
    """Draw a bar chart on `ax` of how often each answer in `order` occurs in `answers`.

    `tick_labels` are the short names shown on the x axis, matched to `order`
    by position. An answer nobody gave is drawn as an empty bar (count 0).
    """
    counts = answers.value_counts().reindex(order, fill_value=0)
    ax.bar(tick_labels, counts.values, color=colors)
    # Rotate the existing tick labels rather than re-setting label text on unfixed ticks.
    ax.tick_params(axis='x', labelrotation=40)
    ax.set_title(title)
    _hide_spines(ax)


fig = plt.figure(figsize=(17, 12))
spec = gridspec.GridSpec(ncols=3, nrows=4, figure=fig)

# India uses the Acton palette and the USA uses Batlow in every pair of panels.

# Gender split: left column, India above USA.
_plot_gender_donut(fig.add_subplot(spec[0, 0]), ind_df, palette.Acton_4.hex_colors, 'India')
_plot_gender_donut(fig.add_subplot(spec[1, 0]), us_df, palette.Batlow_4.hex_colors, 'USA')

# Education split: right column, India above USA.
_plot_answer_bars(fig.add_subplot(spec[0, 2]), ind_df['Q4'], edu_order, edu_idx,
                  palette.Acton_7.hex_colors, 'India')
_plot_answer_bars(fig.add_subplot(spec[1, 2]), us_df['Q4'], edu_order, edu_idx,
                  palette.Batlow_7.hex_colors, 'USA')

# Current role: two full-width rows, India above USA.
_plot_answer_bars(fig.add_subplot(spec[2, :]), ind_df['Q5'], role_order, role_idx,
                  palette.Acton_15.hex_colors, 'India')
_plot_answer_bars(fig.add_subplot(spec[3, :]), us_df['Q5'], role_order, role_idx,
                  palette.Batlow_15.hex_colors, 'USA')

# Headline and commentary, placed in figure coordinates in the empty middle column.
fig.text(0.28, 0.75, 'Respondant Comparison - India vs USA', fontsize=17, fontweight='bold', fontfamily='sans-serif')
fig.text(0.28, 0.58, 
'''Well, this was a little unexpected. While I did predict differences in 
the distributions, I expected a much bigger difference in the gender gap 
between the two countries.
However, we see that while the gender split remains almost the same (with a
slight increase in women in USA), Kaggle users from the United States seem 
to be more educated (or, specialised). They tend to have a Master's degree 
or a Doctoral degree as compared to the primarily undergraduate students
from India.
This also falls in line with our 
visualisations in the third plot. USA has more users in Kaggle that are 
working professionally.

However, will these results would hold up in other comparisons? Are these
differences common to all developing vs developed countries?
'''
, fontsize=14, fontweight='light', fontfamily='sans-serif')

fig.tight_layout()
plt.subplots_adjust(wspace=1)
plt.show()