In [245]:
import pandas as pd
import numpy as np
import datetime as dt


# Toy records used to demonstrate missing-value handling. The columns mix
# real values, true blanks (np.nan / None) and placeholder strings
# ('NA', 'Missing') that merely look like data. Ages are stored as strings.
people = dict(
    first=['Corey', 'Jane', 'John', 'Chris', np.nan, None, 'NA'],
    last=['Schafer', 'Doe', 'Doe', 'Schafer', np.nan, np.nan, 'Missing'],
    email=[
        'CoreyMSchafer@gmail.com',
        'JaneDoe@email.com',
        'JohnDoe@email.com',
        None,
        np.nan,
        'Anonymous@email.com',
        'NA',
    ],
    age=['33', '55', '63', '36', None, None, 'Missing'],
)

# Notebook-wide pandas display settings: show up to 100 columns and rows,
# and never truncate the text inside a cell.
for _option, _setting in (
    ('display.max_columns', 100),
    ('display.max_rows', 100),
    ('display.max_colwidth', None),
):
    pd.set_option(_option, _setting)


def _four_decimals(x):
    """Format a float with exactly four digits after the decimal point."""
    return '%.4f' % x


pd.set_option('display.float_format', _four_decimals)


# Locations of the input CSV files.
# NOTE(review): these are absolute paths on one machine; the notebook will
# not run elsewhere without editing them.
survey_source = '/Users/josephyu/Documents/GitHub/data/survey_results_public.csv'
schema_source = '/Users/josephyu/Documents/GitHub/data/survey_results_schema.csv'
dt_source = '/Users/josephyu/Documents/GitHub/data/ETH_1h.csv'


# Survey answers, indexed by the 'Respondent' column.
df = pd.read_csv(filepath_or_buffer=survey_source, index_col='Respondent')

# Survey schema, indexed by the 'Column' column.
schema_df = pd.read_csv(filepath_or_buffer=schema_source, index_col='Column')

# Small in-memory frame built from the `people` dict.
pp_df = pd.DataFrame(data=people)

# Third CSV, loaded with the default integer index.
dt_df = pd.read_csv(filepath_or_buffer=dt_source)

In [246]:
# 🧭🧭 Two major scenarios of null (NA) values
# 1 of 2: true blank cells -- np.nan or None
# 2 of 2: fake non-null cells -- strings such as 'NA', 'n/a', 'missing', ...
#         that only pretend to hold data

# Rebind df to the small demo frame built from the `people` dict.
df = pd.DataFrame(data=people)

df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
4,,,,
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [247]:
# ✅ Is there a way to apply one replacement to ALL blank cells,
#    e.g. fill every blank cell with a chosen string?
# 🧠 df.fillna()

# Without inplace=True the filled frame is only displayed; df is unchanged.
df.fillna(value='BLANK_CELL')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,BLANK_CELL,36
4,BLANK_CELL,BLANK_CELL,BLANK_CELL,BLANK_CELL
5,BLANK_CELL,BLANK_CELL,Anonymous@email.com,BLANK_CELL
6,,Missing,,Missing


In [248]:
# Probe: can replace() target both true blanks and placeholder strings?
# Each kind of "missing" gets its own marker so the result is easy to read.
_marker_by_value = {None: 0, 'NA': 1, 'Missing': 2}

df.replace(to_replace=_marker_by_value)

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,0,36
4,0,0,0,0
5,0,0,Anonymous@email.com,0
6,1,2,1,2


In [249]:
# Turn the placeholder strings into real missing values, modifying df itself.
_placeholders = dict.fromkeys(['NA', 'Missing'])  # every key maps to None
df.replace(to_replace=_placeholders, inplace=True)

In [250]:
# Boolean mask of df: True wherever a cell is missing.
df.isnull()

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,False,True
6,True,True,True,True


In [251]:
# Display df as it stands after the in-place replace() of 'NA' / 'Missing'.
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
4,,,,
5,,,Anonymous@email.com,
6,,,,


In [252]:
# 🧠 df.dropna(axis=, subset=, how=)

# Drop a row only when first, last AND email are all missing; 'age' is not
# considered. The frame is modified in place.
_identity_cols = ['first', 'last', 'email']
df.dropna(subset=_identity_cols, how='all', axis='index', inplace=True)

df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
5,,,Anonymous@email.com,


In [253]:
# NOTE: type(np.nan) == float, so casting with astype(float) is the best
# option found so far for a numeric column that still contains blanks.
# 🧠 df[col].astype(float)

df['age'] = df['age'].astype('float64')

In [254]:
# Print the index range, per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5 entries, 0 to 5
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   first   4 non-null      object 
 1   last    4 non-null      object 
 2   email   4 non-null      object 
 3   age     4 non-null      float64
dtypes: float64(1), object(3)
memory usage: 200.0+ bytes


In [256]:
# Rebind df to the full survey data (indexed by 'Respondent'), replacing the
# small demo frame used in the cells above.
df = pd.read_csv(filepath_or_buffer=survey_source, index_col='Respondent')

# Peek at the first row only.
df.head(n=1)

Unnamed: 0_level_0,MainBranch,Hobbyist,OpenSourcer,OpenSource,Employment,Country,Student,EdLevel,UndergradMajor,EduOther,OrgSize,DevType,YearsCode,Age1stCode,YearsCodePro,CareerSat,JobSat,MgrIdiot,MgrMoney,MgrWant,JobSeek,LastHireDate,LastInt,FizzBuzz,JobFactors,ResumeUpdate,CurrencySymbol,CurrencyDesc,CompTotal,CompFreq,ConvertedComp,WorkWeekHrs,WorkPlan,WorkChallenge,WorkRemote,WorkLoc,ImpSyn,CodeRev,CodeRevHrs,UnitTests,PurchaseHow,PurchaseWhat,LanguageWorkedWith,LanguageDesireNextYear,DatabaseWorkedWith,DatabaseDesireNextYear,PlatformWorkedWith,PlatformDesireNextYear,WebFrameWorkedWith,WebFrameDesireNextYear,MiscTechWorkedWith,MiscTechDesireNextYear,DevEnviron,OpSys,Containers,BlockchainOrg,BlockchainIs,BetterLife,ITperson,OffOn,SocialMedia,Extraversion,ScreenName,SOVisit1st,SOVisitFreq,SOVisitTo,SOFindAnswer,SOTimeSaved,SOHowMuchTime,SOAccount,SOPartFreq,SOJobs,EntTeams,SOComm,WelcomeChange,SONewContent,Age,Gender,Trans,Sexuality,Ethnicity,Dependents,SurveyLength,SurveyEase
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1
1,I am a student who is learning to code,Yes,Never,The quality of OSS and closed source software is about the same,"Not employed, and not looking for work",United Kingdom,No,Primary/elementary school,,"Taught yourself a new language, framework, or tool without taking a formal course",,,4,10,,,,,,,,,,,,,,,,,,,,,,,,,,,,,HTML/CSS;Java;JavaScript;Python,C;C++;C#;Go;HTML/CSS;Java;JavaScript;Python;SQL,SQLite,MySQL,MacOS;Windows,Android;Arduino;Windows,Django;Flask,Flask;jQuery,Node.js,Node.js,IntelliJ;Notepad++;PyCharm,Windows,I do not use containers,,,Yes,"Fortunately, someone else has that title",Yes,Twitter,Online,Username,2017,A few times per month or weekly,Find answers to specific questions;Learn how to do things I didn’t necessarily look for,3-5 times per week,Stack Overflow was much faster,31-60 minutes,No,,"No, I didn't know that Stack Overflow had a job board","No, and I don't know what those are",Neutral,Just as welcome now as I felt last year,Tech articles written by other developers;Industry news about technologies you're interested in;Courses on technologies you're interested in,14.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult


In [284]:
# TODO: scan the entire DataFrame for placeholder strings, e.g. 'NA', 'Missing'

# Finding so far: 'NA' -> LastHireDate is True
# 🧠 df[col].str.contains(...).any()

# For every text (object-dtype) column, report whether any cell contains the
# substring 'NA'. regex=False makes the match literal rather than a regular
# expression, and na=False counts missing cells as "no match", so the mask is
# purely boolean before .any() reduces it.
for col in df.select_dtypes(include=object).columns:
    print(col, df[col].str.contains('NA', na=False, regex=False).any())

MainBranch False
Hobbyist False
OpenSourcer False
OpenSource False
Employment False
Country False
Student False
EdLevel False
UndergradMajor False
EduOther False
OrgSize False
DevType False
YearsCode False
Age1stCode False
YearsCodePro False
CareerSat False
JobSat False
MgrIdiot False
MgrMoney False
MgrWant False
JobSeek False
LastHireDate True
LastInt False
FizzBuzz False
JobFactors False
ResumeUpdate False
CurrencySymbol False
CurrencyDesc False
CompFreq False
WorkPlan False
WorkChallenge False
WorkRemote False
WorkLoc False
ImpSyn False
CodeRev False
UnitTests False
PurchaseHow False
PurchaseWhat False
LanguageWorkedWith False
LanguageDesireNextYear False
DatabaseWorkedWith False
DatabaseDesireNextYear False
PlatformWorkedWith False
PlatformDesireNextYear False
WebFrameWorkedWith False
WebFrameDesireNextYear False
MiscTechWorkedWith False
MiscTechDesireNextYear False
DevEnviron False
OpSys False
Containers False
BlockchainOrg False
BlockchainIs False
BetterLife False
ITperson False


In [285]:
# Candidate spellings of "missing" that may appear as plain strings in the
# CSV; they are handed to read_csv via na_values.
na_val_li = [
    'NA', 'N.A.', 'N/A',
    'na', 'n.a', 'n/a',
    'NaN',
    'MISSING', 'Missing', 'missing',
]

# Reload the survey with those spellings passed as na_values.
df = pd.read_csv(
    filepath_or_buffer=survey_source,
    na_values=na_val_li,
    index_col='Respondent',
)

In [293]:
# Fraction of missing cells in each text (object-dtype) column, lowest first.
_text_columns = df.select_dtypes(include=object)
_text_columns.isna().mean().sort_values()

Hobbyist                 0.0000
OpenSourcer              0.0000
Country                  0.0015
MainBranch               0.0062
SOVisitFreq              0.0070
SOComm                   0.0085
SOVisitTo                0.0090
SOJobs                   0.0092
YearsCode                0.0106
OpSys                    0.0116
EntTeams                 0.0117
SOAccount                0.0119
SOFindAnswer             0.0120
Age1stCode               0.0141
LanguageWorkedWith       0.0148
DevEnviron               0.0176
Extraversion             0.0178
Employment               0.0191
ITperson                 0.0196
SurveyEase               0.0203
Student                  0.0210
SurveyLength             0.0214
OpenSource               0.0230
OffOn                    0.0250
EdLevel                  0.0280
SOTimeSaved              0.0286
BetterLife               0.0294
WelcomeChange            0.0341
Gender                   0.0391
Containers               0.0396
SocialMedia              0.0500
EduOther

In [295]:
# Fraction of missing cells in each numeric column, lowest first.
_numeric_columns = df.select_dtypes(include=np.number)
_numeric_columns.isna().mean().sort_values()

Age             0.1088
WorkWeekHrs     0.2743
CompTotal       0.3706
ConvertedComp   0.3719
CodeRevHrs      0.4398
dtype: float64

In [305]:
# Number of respondents with no YearsCode value.
_years_missing = pd.isna(df['YearsCode'])
_years_missing.sum()

945

In [304]:
# YearsCode values of the rows where YearsCode is missing, selected with a
# single .loc lookup (append .unique() to list the distinct values instead).
df.loc[df['YearsCode'].isna(), 'YearsCode']

Respondent
2        NaN
226      NaN
287      NaN
535      NaN
601      NaN
        ... 
88076    NaN
88377    NaN
88601    NaN
88802    NaN
88816    NaN
Name: YearsCode, Length: 945, dtype: object

In [324]:
# df['YearsCode'].value_counts()

# Text buckets in YearsCode and the numeric stand-ins considered for them:
#   'Less than 1 year'   -> 0
#   'More than 50 years' -> 51

# Variant A: map both text buckets to None so they count as missing, then
# summarise the remaining answers as floats.
_buckets_as_missing = dict.fromkeys(['Less than 1 year', 'More than 50 years'])
_years_without_buckets = df['YearsCode'].replace(_buckets_as_missing).astype(float)
_years_without_buckets.describe()

count   86445.0000
mean       11.7892
std         8.9888
min         1.0000
25%         5.0000
50%         9.0000
75%        16.0000
max        50.0000
Name: YearsCode, dtype: float64

In [325]:
# Variant B: map the two text buckets to numeric stand-ins (0 and 51) so the
# rows stay in the summary, then describe the column as floats.
_bucket_to_number = {'Less than 1 year': 0, 'More than 50 years': 51}
_years_with_buckets = df['YearsCode'].replace(_bucket_to_number).astype(float)
_years_with_buckets.describe()

count   87938.0000
mean       11.6621
std         9.1528
min         0.0000
25%         5.0000
50%         9.0000
75%        15.0000
max        51.0000
Name: YearsCode, dtype: float64