In [143]:
import os

import pandas as pd
import numpy as np

# Small hand-made table used for the row/column manipulation demos further down.
people = {
    "first":['Corey', 'Jane', 'John'],
    "last":['Schafer', 'Doe', 'Smith'],
    "email":['CoreySchafer@gmail.com','JaneDoe@hotmail.com','JohnSmith@outlook.com'],
}


# Display limits: show up to 100 columns / 100 rows and never truncate cell text.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', None)


# Render floats with exactly four decimals.
pd.set_option('display.float_format', '{:.4f}'.format)


# Directory holding the two survey CSV files. Set the SURVEY_DATA_DIR
# environment variable to run the notebook on another machine; when it is
# unset, the original location is used.
DATA_DIR = os.environ.get('SURVEY_DATA_DIR', '/Users/josephyu/Documents/GitHub/data')
survey_source = os.path.join(DATA_DIR, 'survey_results_public.csv')
schema_source = os.path.join(DATA_DIR, 'survey_results_schema.csv')


# Survey answers are indexed by respondent id; the schema is indexed by column name.
df = pd.read_csv(survey_source, index_col='Respondent')
schema_df = pd.read_csv(schema_source, index_col='Column')
pp_df = pd.DataFrame(people)

# Author's note: ['Hobbyist', 'OpenSourcer', 'Region'] -> PERFECT Data Quality by column
# ('Region' only exists after 'Country' is renamed in a later cell.)

In [144]:
# Trim stray whitespace from the column labels, then give two columns the
# shorter working names used by the rest of the notebook.
column_aliases = {"ConvertedComp": "Salary", "Country": "Region"}
df.columns = df.columns.str.strip()
df = df.rename(columns=column_aliases)

In [145]:
# Rank regions by their share of respondents (largest first) and keep every
# region from the top of that ranking down to and including 'Latvia'.
# NOTE(review): the original note describes Latvia as the cut-off relative to
# "Other Country (Not Listed Above)" - confirm that is the intended boundary.
region_share = df['Region'].value_counts(normalize=True)
tgt_regions = region_share.loc[:'Latvia'].index.tolist()

In [146]:
# Keep only respondents whose region is in the target list.
filt = df['Region'].isin(tgt_regions)

# .copy() makes the filtered frame independent of the original, so the
# column edits performed on `df` in later cells do not raise
# SettingWithCopyWarning.
df = df[filt].copy()

In [147]:
# Short codes for selected regions; regions not listed here keep their full name.
region_dict = {"United States":"US", "United Kingdom":"UK", "Germany":"DE", "France":"FR", "Canada":"CA", "China":"CN", "Taiwan":"TW", "Hong Kong (S.A.R.)":"HK", "India":"IN", "Japan":"JP"}

# Assign the replaced column back explicitly. The chained form
# df['Region'].replace(..., inplace=True) acts on an intermediate Series and
# is not guaranteed to update `df` (it does not under Copy-on-Write).
df = df.assign(Region=df['Region'].replace(region_dict))

In [148]:
# Summary statistics (count, mean, std, min, quartiles, max) of the Salary column.
df['Salary'].describe()

count     53965.0000
mean     129979.1589
std      287698.1569
min           0.0000
25%       27492.0000
50%       58413.0000
75%      101542.0000
max     2000000.0000
Name: Salary, dtype: float64

In [149]:
# Salary: US versus the rest of the world.
# `filt` flags the US respondents; the cells below reuse it (and its negation).

filt = df['Region'].eq('US')

df.loc[filt, 'Salary'].describe()

count     14981.0000
mean     249546.2546
std      452103.4965
min           0.0000
25%       80000.0000
50%      110000.0000
75%      160000.0000
max     2000000.0000
Name: Salary, dtype: float64

In [166]:
# The 25% / 50% / 75% salary quartiles for rows where `filt` is True,
# picked out of describe() by label and stored as a plain list.
stats_us = df.loc[filt, 'Salary'].describe().loc['25%':'75%'].tolist()

In [167]:
# The 25% / 50% / 75% salary quartiles for rows where `filt` is False,
# picked out of describe() by label and stored as a plain list.
stats_rest = df.loc[~filt, 'Salary'].describe().loc['25%':'75%'].tolist()

In [168]:
# Element-wise ratio stats_us / stats_rest, one value per quartile (25%, 50%, 75%).
[us_q / rest_q for us_q, rest_q in zip(stats_us, stats_rest)]


[4.131804565644045, 2.580705705705706, 2.2523790753983897]

count     53965.0000
mean     129979.1589
std      287698.1569
min           0.0000
25%       27492.0000
50%       58413.0000
75%      101542.0000
max     2000000.0000
Name: Salary, dtype: float64

In [226]:
# Share of respondents in each region whose salary is at least
# SALARY_THRESHOLD, measured among respondents who reported a salary.
SALARY_THRESHOLD = 150_000

has_salary = df['Salary'].notna()
at_or_above = df['Salary'] >= SALARY_THRESHOLD  # a missing salary compares as False

# sort=False keeps regions in order of first appearance, matching
# df['Region'].unique().
n_reported = has_salary.groupby(df['Region'], sort=False).sum()
n_at_or_above = at_or_above.groupby(df['Region'], sort=False).sum()

# A region with no reported salary gives 0 / 0 -> NaN here instead of raising
# ZeroDivisionError.
share = (n_at_or_above / n_reported).round(4)

li_reg = share.index.tolist()
li_pct = share.tolist()

# NOTE: the column name 'Pct_over_avg' is kept as-is, but the values are
# relative to the fixed SALARY_THRESHOLD, not to an average.
pct_df = pd.DataFrame({
    'Regions': li_reg,
    'Pct_over_avg': li_pct
})


pct_df.sort_values(by='Pct_over_avg', ascending=False).reset_index(drop=True)

Unnamed: 0,Regions,Pct_over_avg
0,US,0.2964
1,Ireland,0.2828
2,UK,0.2062
3,Switzerland,0.1711
4,Norway,0.1579
5,Australia,0.154
6,HK,0.1296
7,New Zealand,0.1261
8,Italy,0.115
9,Spain,0.1127


In [155]:
# Look up the question text recorded in the schema for the 'MgrIdiot' column.
schema_df.loc['MgrIdiot', 'QuestionText']

'How confident are you that your manager knows what they’re doing?'

In [163]:
# Schema rows ordered by their index (the column name) in descending order.
schema_df.sort_index(ascending=False)

Unnamed: 0_level_0,QuestionText
Column,Unnamed: 1_level_1
YearsCodePro,How many years have you coded professionally (as a part of your work)?
YearsCode,"Including any education, how many years have you been coding?"
WorkWeekHrs,"On average, how many hours per week do you work?"
WorkRemote,How often do you work remotely?
WorkPlan,How structured or planned is your work?
WorkLoc,Where would you prefer to work?
WorkChallenge,"Of these options, what are your greatest challenges to productivity as a developer? Select up to 3:"
WelcomeChange,"Compared to last year, how welcome do you feel on Stack Overflow?"
WebFrameWorkedWith,"Which of the following web frameworks have you done extensive development work in over the past year, and which do you want to work in over the next year? (If you both worked with the framework and want to continue to do so, please check both boxes in that row.)"
WebFrameDesireNextYear,"Which of the following web frameworks have you done extensive development work in over the past year, and which do you want to work in over the next year? (If you both worked with the framework and want to continue to do so, please check both boxes in that row.)"


In [194]:
# Column of interest: LanguageWorkedWith

# na=False makes rows with a missing value evaluate to False instead of NaN,
# so the result is a pure boolean mask.

filt = df['LanguageWorkedWith'].str.contains('Python', na=False)

filt

Respondent
1         True
3        False
4         True
5         True
6        False
         ...  
88182    False
88212     True
88282    False
88377    False
88863    False
Name: LanguageWorkedWith, Length: 85173, dtype: bool

In [201]:
# Salary quartiles (25%, 50%, 75%) for the rows where `filt` is True.
df.loc[filt, 'Salary'].describe().loc['25%':'75%']

25%    31491.0000
50%    64152.0000
75%   110465.0000
Name: Salary, dtype: float64

In [200]:
# Salary quartiles (25%, 50%, 75%) for the rows where `filt` is False.
df.loc[~filt, 'Salary'].describe().loc['25%':'75%']

25%   25080.0000
50%   54996.0000
75%   96217.5000
Name: Salary, dtype: float64

In [227]:
# Display the small demo table built from the `people` dict.
pp_df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreySchafer@gmail.com
1,Jane,Doe,JaneDoe@hotmail.com
2,John,Smith,JohnSmith@outlook.com


In [245]:
# Upper-case every column label using the vectorised string accessor.
pp_df.columns = pp_df.columns.str.upper()

In [247]:
# Inspect the current column labels.
pp_df.columns

Index(['FIRST', 'LAST', 'EMAIL'], dtype='object')

In [248]:
# Replace all column labels at once; the list must supply one label per column.
pp_df.columns = ['FIRST_NAME', 'LAST_NAME', 'EMAIL']

In [257]:
# Preview only: returns new labels with '_' replaced by ' '. The result is not
# assigned, so pp_df keeps its underscore labels.
pp_df.columns.str.replace('_', ' ')

Index(['FIRST NAME', 'LAST NAME', 'EMAIL'], dtype='object')

In [258]:
# Display pp_df; the labels still contain underscores.
pp_df

Unnamed: 0,FIRST_NAME,LAST_NAME,EMAIL
0,Corey,Schafer,CoreySchafer@gmail.com
1,Jane,Doe,JaneDoe@hotmail.com
2,John,Smith,JohnSmith@outlook.com


In [263]:
# Title-case each column label (e.g. FIRST_NAME -> First_Name).
pp_df.columns = pp_df.columns.str.title()

In [268]:
# Rename a single column; labels not mentioned in the mapping are left as they are.
pp_df = pp_df.rename(columns={'Email': 'Email_Address'})

In [269]:
# Without inplace=True (or re-assignment) rename() returns a renamed copy.
# It is only displayed here, so pp_df itself keeps First_Name / Last_Name.
pp_df.rename(columns={
    'First_Name': 'First_Legal_Name',
    'Last_Name': 'Last_Legal_Name'
})

Unnamed: 0,First_Legal_Name,Last_Legal_Name,Email_Address
0,Corey,Schafer,CoreySchafer@gmail.com
1,Jane,Doe,JaneDoe@hotmail.com
2,John,Smith,JohnSmith@outlook.com


In [270]:
# Display pp_df: the column labels are unchanged by the un-assigned rename.
pp_df

Unnamed: 0,First_Name,Last_Name,Email_Address
0,Corey,Schafer,CoreySchafer@gmail.com
1,Jane,Doe,JaneDoe@hotmail.com
2,John,Smith,JohnSmith@outlook.com


In [272]:
# Two ways to relabel columns:
#   df.rename(columns={...})
#   df.columns.str.replace(...)

# Work on an independent copy so the row and column edits made through `df`
# in later cells do not also modify pp_df.
# NOTE: this rebinds `df`, which held the survey data up to this point.
df = pp_df.copy()

In [282]:
# .loc[rows, columns]: a bare ':' on both axes selects the whole frame.
df.loc[:,:]

Unnamed: 0,First_Name,Last_Name,Email_Address
0,Corey,Schafer,CoreySchafer@gmail.com
1,Jane,Doe,JaneDoe@hotmail.com
2,John,Smith,JohnSmith@outlook.com


In [284]:
# The row labelled 2 with all columns; a single row comes back as a Series.
df.loc[2,:]

First_Name                        John
Last_Name                        Smith
Email_Address    JohnSmith@outlook.com
Name: 2, dtype: object

In [293]:
# Append a new row to the existing frame by assigning to a row label through .loc.
# The dict keys match the column names: 'First_Name', 'Last_Name', 'Email_Address'.
new_person = {
    'First_Name': 'Joseph',
    'Last_Name': 'Yu',
    'Email_Address': 'josephyu@outlook.com',
}

df.loc[4] = new_person

In [297]:
# Display the frame after the append.
# NOTE(review): the saved output also lists rows labelled 3 and 5, which no
# cell shown in this notebook creates; presumably the append cell was run
# several times with different labels - confirm with Restart & Run All.
df

Unnamed: 0,First_Name,Last_Name,Email_Address
0,Corey,Schafer,CoreySchafer@gmail.com
1,Jane,Doe,JaneDoe@hotmail.com
2,John,Smith,JohnSmith@outlook.com
3,Joseph,Yu,josephyu@outlook.com
5,Joseph,Yu,josephyu@outlook.com
4,Joseph,Yu,josephyu@outlook.com


In [298]:
# Bulk-append rows to the existing columns: every label from 11 through 20
# receives the same record.
# Pattern: df.loc[label] = {column: value, ...}
template_row = {
    'First_Name': 'Joseph',
    'Last_Name': 'Yu',
    'Email_Address': 'josephyu@outlook.com',
}

for label in range(11, 21):
    df.loc[label] = template_row

In [301]:
# Preview only: returns a new Series with 'outlook' replaced by 'gmail'.
# The result is not assigned, so df['Email_Address'] is left unchanged.
df['Email_Address'].str.replace('outlook', 'gmail')

0     CoreySchafer@gmail.com
1        JaneDoe@hotmail.com
2        JohnSmith@gmail.com
3         josephyu@gmail.com
5         josephyu@gmail.com
4         josephyu@gmail.com
11        josephyu@gmail.com
12        josephyu@gmail.com
13        josephyu@gmail.com
14        josephyu@gmail.com
15        josephyu@gmail.com
16        josephyu@gmail.com
17        josephyu@gmail.com
18        josephyu@gmail.com
19        josephyu@gmail.com
20        josephyu@gmail.com
Name: Email_Address, dtype: object

In [302]:
# Display df; Email_Address still holds the original addresses.
df

Unnamed: 0,First_Name,Last_Name,Email_Address
0,Corey,Schafer,CoreySchafer@gmail.com
1,Jane,Doe,JaneDoe@hotmail.com
2,John,Smith,JohnSmith@outlook.com
3,Joseph,Yu,josephyu@outlook.com
5,Joseph,Yu,josephyu@outlook.com
4,Joseph,Yu,josephyu@outlook.com
11,Joseph,Yu,josephyu@outlook.com
12,Joseph,Yu,josephyu@outlook.com
13,Joseph,Yu,josephyu@outlook.com
14,Joseph,Yu,josephyu@outlook.com


In [306]:
# Similarly, a new column is added by assigning a list to a new column name.
# Before that, trim the frame back to the rows labelled 0 through 2
# (a .loc label slice includes both ends).
# .copy() makes the result independent of the larger frame, so assigning a
# new column to it does not raise SettingWithCopyWarning.
df = df.loc[0:2].copy()

In [307]:
# Display the trimmed frame.
df

Unnamed: 0,First_Name,Last_Name,Email_Address
0,Corey,Schafer,CoreySchafer@gmail.com
1,Jane,Doe,JaneDoe@hotmail.com
2,John,Smith,JohnSmith@outlook.com


In [310]:
# Add a column by assigning a list; it must supply one value per row.
df['Age'] = [21, 22, 23]

In [312]:
# Display the frame including the new Age column.
df

Unnamed: 0,First_Name,Last_Name,Email_Address,Age
0,Corey,Schafer,CoreySchafer@gmail.com,21
1,Jane,Doe,JaneDoe@hotmail.com,22
2,John,Smith,JohnSmith@outlook.com,23


In [314]:
# Rows whose e-mail address contains 'gmail' OR whose first name is 'Jane'.
# na=False treats a missing e-mail as "no match", so the mask stays strictly
# boolean and can always be used for row selection.
filt = df['Email_Address'].str.contains('gmail', na=False) | (df['First_Name'] == 'Jane')

In [315]:
# Select the rows where the mask is True.
df[filt]

Unnamed: 0,First_Name,Last_Name,Email_Address,Age
0,Corey,Schafer,CoreySchafer@gmail.com,21
1,Jane,Doe,JaneDoe@hotmail.com,22
