In [2]:
import pandas as pd

# Display settings: render up to 100 columns and 100 rows before truncating.
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

# Locations of the survey answers and of the question schema (local CSV files).
survey_source = '/Users/josephyu/Documents/GitHub/data/survey_results_public.csv'
schema_source = '/Users/josephyu/Documents/GitHub/data/survey_results_schema.csv'

# Survey answers keep the default integer index; 'Respondent' stays a regular
# column (index_col='Respondent' would be the alternative).
df = pd.read_csv(survey_source)
# The schema is indexed by its 'Column' field so it can be looked up by label.
schema_df = pd.read_csv(schema_source, index_col='Column')

In [9]:
# Median of the ConvertedComp column over all respondents (NaN values are skipped).
df.loc[:, 'ConvertedComp'].median()

57287.0

In [10]:
# Median of every numeric column. numeric_only=True is passed explicitly because
# the frame also holds text columns: newer pandas no longer drops them silently
# and raises a TypeError instead.
df.median(numeric_only=True)

Respondent       44442.0
CompTotal        62000.0
ConvertedComp    57287.0
WorkWeekHrs         40.0
CodeRevHrs           4.0
Age                 29.0
dtype: float64

In [25]:
# Keep respondents with Age > 4 and ConvertedComp > 1,000. Comparisons against
# NaN evaluate to False, so rows missing either value are excluded as well.
row_filter = (df['Age'] > 4) & (df['ConvertedComp'] > 1_000)

# numeric_only=True restricts the median to numeric columns explicitly; newer
# pandas raises a TypeError on the text columns otherwise.
df[row_filter].median(numeric_only=True)

Respondent       44538.0
CompTotal        62000.0
ConvertedComp    57287.0
WorkWeekHrs         40.0
CodeRevHrs           4.0
Age                 30.0
dtype: float64

In [31]:
# Use describe() to get the overall picture of ALL numeric columns at once
# 🧠 df.describe()

df.loc[row_filter].describe()

Unnamed: 0,Respondent,CompTotal,ConvertedComp,WorkWeekHrs,CodeRevHrs,Age
count,52973.0,52973.0,52973.0,52562.0,39251.0,52973.0
mean,44478.377947,566328000000.0,127889.9,41.885957,4.805498,31.448024
std,25678.008251,75253230000000.0,284243.6,28.951745,4.758644,8.085765
min,3.0,20.0,1005.0,1.0,0.0,13.0
25%,22161.0,20000.0,26868.0,40.0,2.0,26.0
50%,44538.0,62000.0,57287.0,40.0,4.0,30.0
75%,66726.0,120000.0,100000.0,43.0,5.0,35.0
max,88883.0,1e+16,2000000.0,3875.0,99.0,99.0


In [34]:
# Pick rows of the describe() summary by position with .iloc:
# 1 -> mean, 3 -> min, 5 -> 50% (median), 7 -> max; all columns are kept.

(
    df[row_filter]
    .describe()
    .iloc[[1, 3, 5, 7]]
)

Unnamed: 0,Respondent,CompTotal,ConvertedComp,WorkWeekHrs,CodeRevHrs,Age
mean,44478.377947,566328000000.0,127889.9,41.885957,4.805498,31.448024
min,3.0,20.0,1005.0,1.0,0.0,13.0
50%,44538.0,62000.0,57287.0,40.0,4.0,30.0
max,88883.0,1e+16,2000000.0,3875.0,99.0,99.0


In [35]:
# Same positional selection, reduced to the mean (1) and the median (5) rows.
(
    df[row_filter]
    .describe()
    .iloc[[1, 5]]
)

Unnamed: 0,Respondent,CompTotal,ConvertedComp,WorkWeekHrs,CodeRevHrs,Age
mean,44478.377947,566328000000.0,127889.919015,41.885957,4.805498,31.448024
50%,44538.0,62000.0,57287.0,40.0,4.0,30.0


In [37]:
# Number of non-null ConvertedComp values among the filtered respondents.
df.loc[row_filter, 'ConvertedComp'].count()

52973

In [39]:
# Frequency of each answer in the Hobbyist column.
df.loc[:, 'Hobbyist'].value_counts()

Yes    71257
No     17626
Name: Hobbyist, dtype: int64

In [53]:
# Frequency of each answer in the SocialMedia column, most common first.
df.loc[:, 'SocialMedia'].value_counts()

Reddit                      14374
YouTube                     13830
WhatsApp                    13347
Facebook                    13178
Twitter                     11398
Instagram                    6261
I don't use social media     5554
LinkedIn                     4501
WeChat 微信                     667
Snapchat                      628
VK ВКонта́кте                 603
Weibo 新浪微博                     56
Youku Tudou 优酷                 21
Hello                          19
Name: SocialMedia, dtype: int64

In [52]:
# Look up the schema row stored under the 'SocialMedia' label.
schema_df.loc['SocialMedia', :]

QuestionText    What social media site do you use the most?
Name: SocialMedia, dtype: object

In [58]:
# Same breakdown as relative frequencies (shares of the non-null answers).
df.loc[:, 'SocialMedia'].value_counts(normalize=True)

Reddit                      0.170233
YouTube                     0.163791
WhatsApp                    0.158071
Facebook                    0.156069
Twitter                     0.134988
Instagram                   0.074150
I don't use social media    0.065777
LinkedIn                    0.053306
WeChat 微信                   0.007899
Snapchat                    0.007437
VK ВКонта́кте               0.007141
Weibo 新浪微博                  0.000663
Youku Tudou 优酷              0.000249
Hello                       0.000225
Name: SocialMedia, dtype: float64

In [68]:
# What's the most popular social media site for each country?
# Shown here for India: the three most frequent SocialMedia answers.
# Grouping by the scalar key 'Country' (instead of the one-element list
# ['Country']) lets get_group() accept the plain label 'India'; with a
# length-1 list key newer pandas expects a tuple and warns otherwise.

df.groupby('Country').get_group('India')['SocialMedia'].value_counts().nlargest(3)

WhatsApp    2990
YouTube     1820
LinkedIn     955
Name: SocialMedia, dtype: int64

In [73]:
li_countries = ['United States', 'China', 'India']

# Build the grouping once instead of on every iteration. The scalar key
# 'Country' (not the list ['Country']) lets get_group() take a plain label.
country_groups = df.groupby('Country')

# Print the three most frequent SocialMedia answers for each listed country,
# followed by a blank line.
for country in li_countries:
    print(country_groups.get_group(country)['SocialMedia'].value_counts().nlargest(3))
    print()


Reddit      5700
Twitter     3468
Facebook    2844
Name: SocialMedia, dtype: int64

WeChat 微信     403
YouTube        53
Weibo 新浪微博     42
Name: SocialMedia, dtype: int64

WhatsApp    2990
YouTube     1820
LinkedIn     955
Name: SocialMedia, dtype: int64



In [104]:
def extract_info(groupby_col, target_col, target_country, data=None):
    """Return the relative frequencies of ``target_col`` within one group.

    The frame is grouped by ``groupby_col``, the values of ``target_col`` are
    counted per group as proportions (``value_counts(normalize=True)``), and
    the slice belonging to the group labelled ``target_country`` is returned.

    Parameters
    ----------
    groupby_col : str
        Column whose values define the groups.
    target_col : str
        Column whose value distribution is computed inside each group.
    target_country : hashable
        Label of the group to return. Despite the name, this is any value of
        ``groupby_col``, not necessarily a country.
    data : pandas.DataFrame, optional
        Frame to analyse. When omitted, the notebook-level ``df`` is used,
        which preserves the original three-argument behaviour.

    Returns
    -------
    pandas.Series
        Proportions indexed by the values of ``target_col``.

    Raises
    ------
    KeyError
        If ``target_country`` is not one of the group labels.
    """
    # Fall back to the global survey frame only when no frame is supplied.
    frame = df if data is None else data
    proportions = frame.groupby(groupby_col)[target_col].value_counts(normalize=True)
    return proportions.loc[target_country]

In [105]:
# Share of each SocialMedia answer among respondents from India.
extract_info(
    groupby_col='Country',
    target_col='SocialMedia',
    target_country='India',
)

SocialMedia
WhatsApp                    0.342379
YouTube                     0.208405
LinkedIn                    0.109355
Facebook                    0.096301
Instagram                   0.094126
Twitter                     0.062063
Reddit                      0.054162
I don't use social media    0.028627
Snapchat                    0.002634
Hello                       0.000573
WeChat 微信                   0.000573
VK ВКонта́кте               0.000458
Youku Tudou 优酷              0.000229
Weibo 新浪微博                  0.000115
Name: SocialMedia, dtype: float64

In [107]:
# Share of each Country among respondents whose EdLevel is a Bachelor's degree
# (the third argument is the EdLevel group label here, not a country).
extract_info(
    groupby_col='EdLevel',
    target_col='Country',
    target_country='Bachelor’s degree (BA, BS, B.Eng., etc.)',
)

Country
United States                       0.279884
India                               0.142178
United Kingdom                      0.065084
Canada                              0.040579
Germany                             0.039735
                                      ...   
Papua New Guinea                    0.000026
Saint Vincent and the Grenadines    0.000026
San Marino                          0.000026
Sao Tome and Principe               0.000026
Sierra Leone                        0.000026
Name: Country, Length: 166, dtype: float64

In [108]:
# Share of each EdLevel answer among respondents from the United States.
extract_info(
    groupby_col='Country',
    target_col='EdLevel',
    target_country='United States',
)

EdLevel
Bachelor’s degree (BA, BS, B.Eng., etc.)                                              0.531570
Master’s degree (MA, MS, M.Eng., MBA, etc.)                                           0.173987
Some college/university study without earning a degree                                0.134870
Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)    0.054016
Associate degree                                                                      0.047416
Other doctoral degree (Ph.D, Ed.D., etc.)                                             0.032662
Primary/elementary school                                                             0.015045
Professional degree (JD, MD, etc.)                                                    0.005775
I never completed any formal education                                                0.004659
Name: EdLevel, dtype: float64

In [114]:
# Share of each SocialMedia answer among respondents from China.
extract_info(
    groupby_col='Country',
    target_col='SocialMedia',
    target_country='China',
)

SocialMedia
WeChat 微信                   0.670549
YouTube                     0.088186
Weibo 新浪微博                  0.069884
I don't use social media    0.044925
Twitter                     0.044925
Reddit                      0.019967
LinkedIn                    0.018303
Facebook                    0.013311
Instagram                   0.011647
Youku Tudou 优酷              0.011647
WhatsApp                    0.004992
VK ВКонта́кте               0.001664
Name: SocialMedia, dtype: float64

In [137]:
# Ten countries with the highest median ConvertedComp, in descending order.
(
    df.groupby('Country')['ConvertedComp']
    .median()
    .sort_values(ascending=False)
    .head(10)
)

Country
Liechtenstein                       811188.0
San Marino                          301788.0
Timor-Leste                         229500.0
Andorra                             160931.0
Democratic Republic of the Congo    110484.0
United States                       110000.0
Switzerland                          95440.0
Israel                               90720.0
Ireland                              83640.0
Denmark                              82860.0
Name: ConvertedComp, dtype: float64

In [139]:
# Median ConvertedComp for a single country, picked by label from the
# per-country medians.
(
    df.groupby('Country')['ConvertedComp']
    .median()
    .loc['Denmark']
)

82860.0

In [144]:
# Median and mean ConvertedComp per country, side by side.
(
    df.groupby('Country')['ConvertedComp']
    .agg(['median', 'mean'])
)

Unnamed: 0_level_0,median,mean
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,6222.0,101953.333333
Albania,10818.0,21833.700000
Algeria,7878.0,34924.047619
Andorra,160931.0,160931.000000
Angola,7764.0,7764.000000
...,...,...
"Venezuela, Bolivarian Republic of...",6384.0,14581.627907
Viet Nam,11892.0,17233.436782
Yemen,11940.0,16909.166667
Zambia,5040.0,10075.375000


In [146]:
# Groupby + aggregate with multiple functions at once!
# 🧠 df.groupby()[].agg([func_1, func_2]).loc[] -> df.groupby()[].agg([])

(
    df.groupby('Country')['ConvertedComp']
    .agg(['median', 'mean'])
    .loc['Germany']
)

median     63016.000000
mean      109256.884066
Name: Germany, dtype: float64

In [9]:
# 🎯 Error seen: "Cannot mask with non-boolean array containing NA / NaN values"
# RIGHT - the NA / NaN values have to be dealt with first - clean up ALL NaN values!
# The data must be cleaned first: take out every value containing NaN, then
# apply the boolean filter, and only then compute the statistics.
# Only 87569 values are non-null, while the actual number of rows is 88883 samples!

# How many people in each country know how to use Python
# (computed below for China only, as a share of the answers)

# row_filter = df['LanguageWorkedWith'].str.contains('Python')

row_filter = (df['Country'] == 'China')

(
    df.loc[row_filter, 'LanguageWorkedWith']
    .str.contains('Python')
    .value_counts(normalize=True)
)

False    0.534483
True     0.465517
Name: LanguageWorkedWith, dtype: float64

In [19]:
li_countries = ['China', 'India', 'United States']

# For each listed country, print its name and the share of respondents whose
# LanguageWorkedWith answer does / does not mention 'Python', then a blank line.
for country in li_countries:
    row_filter = (df['Country'] == country)
    print(country)
    print(
        df.loc[row_filter, 'LanguageWorkedWith']
        .str.contains('Python')
        .value_counts(normalize=True),
        end='\n\n',
    )

China
False    0.534483
True     0.465517
Name: LanguageWorkedWith, dtype: float64

India
False    0.648915
True     0.351085
Name: LanguageWorkedWith, dtype: float64

United States
False    0.514517
True     0.485483
Name: LanguageWorkedWith, dtype: float64



In [29]:
# Apply + lambda
# 🧠 apply(lambda x: x.str.contains('Python').sum()) -> apply(lambda x: x.func().func())
# Per country: number of respondents whose LanguageWorkedWith mentions 'Python'.

(
    df.groupby('Country')['LanguageWorkedWith']
    .apply(lambda langs: langs.str.contains('Python').sum())
)

Country
Afghanistan                              8
Albania                                 23
Algeria                                 40
Andorra                                  0
Angola                                   2
                                        ..
Venezuela, Bolivarian Republic of...    28
Viet Nam                                78
Yemen                                    3
Zambia                                   4
Zimbabwe                                14
Name: LanguageWorkedWith, Length: 179, dtype: int64

In [41]:
# ✅ Return the percentage of people in each country who know how to work with Python
(
    df.groupby('Country')['LanguageWorkedWith']
    .apply(lambda langs: langs.str.contains('Python').value_counts(normalize=True))
)

Country           
Afghanistan  False    0.794872
             True     0.205128
Albania      False    0.722892
             True     0.277108
Algeria      False    0.682540
                        ...   
Yemen        True     0.176471
Zambia       False    0.666667
             True     0.333333
Zimbabwe     False    0.641026
             True     0.358974
Name: LanguageWorkedWith, Length: 335, dtype: float64

In [46]:
# Same per-country Python share, restricted to three countries of interest.
(
    df.groupby('Country')['LanguageWorkedWith']
    .apply(lambda langs: langs.str.contains('Python').value_counts(normalize=True))
    .loc[['China', 'India', 'United States']]
)

Country             
China          False    0.534483
               True     0.465517
India          False    0.648915
               True     0.351085
United States  False    0.514517
               True     0.485483
Name: LanguageWorkedWith, dtype: float64