In [58]:
import os

import numpy as np
import pandas as pd

# Toy dataset mixing real values with several "missing" markers:
# np.nan, None, and the placeholder strings 'NA' / 'Missing'.
# Ages are given as strings, so the Age column starts out with object dtype.
people = dict(
    first=['Corey', 'Jane', 'John', 'Chris', np.nan, None, 'NA'],
    last=['Schafer', 'Doe', 'Doe', 'Schafer', np.nan, np.nan, 'Missing'],
    email=[
        'CoreyMSchafer@gmail.com',
        'JaneDoe@email.com',
        'JohnDoe@email.com',
        None,
        np.nan,
        'Anonymous@email.com',
        'NA',
    ],
    age=['33', '55', '63', '36', None, None, 'Missing'],
)

In [59]:
# Build the frame and title-case the column labels in one step
# ('first' -> 'First', 'email' -> 'Email', ...).
df = pd.DataFrame(people).rename(columns=str.title)

In [60]:
# Turn the placeholder strings into real missing values in a single pass.
# The result is re-bound instead of using inplace=True, so the frame is not
# mutated behind the back of cells that already displayed it.
# None entries need no extra handling: isna()/isnull() already report them as missing.
df = df.replace(['NA', 'Missing'], np.nan)

In [61]:
# 🎯 What's the difference between None and np.nan?
# ✅ For missing-value detection there is none: isnull() flags both as missing,
#    so both count towards isnull().mean().

df

Unnamed: 0,First,Last,Email,Age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
4,,,,
5,,,Anonymous@email.com,
6,,,,


In [62]:
# Percentage of missing values per column.
# 🧠 isnull() yields a boolean frame; the mean of booleans is the fraction of True values.

df.isnull().mean().mul(100).round(4)

First    42.8571
Last     42.8571
Email    42.8571
Age      42.8571
dtype: float64

In [63]:
# Boolean mask of the whole DataFrame: True wherever a value is missing
# 🧠 df.isna()

df.isna()

Unnamed: 0,First,Last,Email,Age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,False,True
6,True,True,True,True


In [64]:
# 🧠 df.dropna()
# Keep only the rows that have no missing value in any column.
# axis=0 (rows) and how='any' are spelled out here; they are also dropna()'s defaults.

df.dropna(axis=0, how='any')

Unnamed: 0,First,Last,Email,Age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63


In [65]:
# Replace every missing value with one placeholder
# (a string such as 'MISSING', or a number such as 0).
# 🧠 df.fillna()

df.fillna(value='MISSING_Input')

Unnamed: 0,First,Last,Email,Age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,MISSING_Input,36
4,MISSING_Input,MISSING_Input,MISSING_Input,MISSING_Input
5,MISSING_Input,MISSING_Input,Anonymous@email.com,MISSING_Input
6,MISSING_Input,MISSING_Input,MISSING_Input,MISSING_Input


In [66]:
# Drop every column that contains at least one missing value.
df.dropna(axis=1, how='any')

0
1
2
3
4
5
6


In [67]:
# Drop a row ONLY if every value in that row is missing.
df.dropna(axis=0, how='all')

Unnamed: 0,First,Last,Email,Age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
5,,,Anonymous@email.com,


In [68]:
# Drop a column only if every value in it is missing.
df.dropna(axis=1, how='all')

Unnamed: 0,First,Last,Email,Age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
4,,,,
5,,,Anonymous@email.com,
6,,,,


First    28.5714
Last     28.5714
Email    28.5714
Age      28.5714
dtype: float64

In [69]:
# Drop rows (not columns) whose 'Email' value is missing; the other columns may still contain NaN
df.dropna(axis='index', how='any', subset=['Email'])

Unnamed: 0,First,Last,Email,Age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
5,,,Anonymous@email.com,


In [72]:
# Drop a row only when BOTH 'Email' and 'Last' are missing;
# a row with at least one of the two filled in is kept.
df.dropna(axis=0, how='all', subset=['Email', 'Last'])

Unnamed: 0,First,Last,Email,Age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
5,,,Anonymous@email.com,


In [74]:
# Convert data types
# Step 1: check the current dtype of every column

df.dtypes

First    object
Last     object
Email    object
Age      object
dtype: object

In [80]:
# 🧠 np.nan is a float, so a column that holds NaN is converted to float rather than int

type(np.nan)

float

In [81]:
# Cast Age from strings to numbers; float64 is used because the column contains NaN.
df['Age'] = df['Age'].astype('float64')

In [82]:
# Re-check the dtypes to confirm the Age conversion
df.dtypes

First     object
Last      object
Email     object
Age      float64
dtype: object

In [108]:
# Mean age; mean() skips NaN by default. For the toy frame built from `people`
# the known ages are 33, 55, 63 and 36, i.e. a mean of 46.75.
# NOTE(review): a different result means `df` pointed at another dataset when this
# cell last ran — restart the kernel and run all cells in order to confirm.
df['Age'].mean()

30.336698649160457

In [109]:
# Raise pandas' display limits so up to 100 columns / rows are shown.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

# Directory holding the survey CSVs. Set the SURVEY_DATA_DIR environment variable
# to run the notebook on another machine; the fallback is the original local path.
DATA_DIR = os.environ.get('SURVEY_DATA_DIR', '/Users/josephyu/Documents/GitHub/data')

survey_source = os.path.join(DATA_DIR, 'survey_results_public.csv')
schema_source = os.path.join(DATA_DIR, 'survey_results_schema.csv')

# Strings that read_csv should parse as missing values.
na_values = ['NA', 'Missing']

# NOTE: `df` is re-bound here from the toy frame to the survey data.
df = pd.read_csv(survey_source, na_values=na_values)  # optionally: index_col='Respondent'
schema_df = pd.read_csv(schema_source, index_col='Column')

In [110]:
# 🎯 isna vs. isnull — what's the difference? None: isnull() is an alias of isna().
# Fraction of missing values per column, most incomplete columns first.

(
    df.isna()
    .mean()
    .sort_values(ascending=False)
)

BlockchainOrg             0.457995
CodeRevHrs                0.439825
ConvertedComp             0.371950
CompTotal                 0.370577
MiscTechWorkedWith        0.329613
BlockchainIs              0.323099
PurchaseHow               0.312489
MgrMoney                  0.311938
MgrIdiot                  0.311916
MgrWant                   0.311094
PurchaseWhat              0.302128
UnitTests                 0.294938
WebFrameDesireNextYear    0.291833
CompFreq                  0.288188
WorkWeekHrs               0.274293
MiscTechDesireNextYear    0.274203
WebFrameWorkedWith        0.268454
LastInt                   0.244456
WorkChallenge             0.233363
SOHowMuchTime             0.230697
WorkPlan                  0.224666
DatabaseDesireNextYear    0.222045
SONewContent              0.217398
WorkLoc                   0.211829
WorkRemote                0.209253
CodeRev                   0.208060
JobSat                    0.201332
FizzBuzz                  0.197327
CurrencySymbol      

In [111]:
# YearsCode / YearsCodePro hold digit strings plus two text buckets
# ('Less than 1 year', 'More than 50 years'). Map the buckets to '0' and '51'
# so the columns can be cast to a numeric dtype afterwards.
# df['YearsCodePro'].value_counts() shows the raw buckets.
#
# The result is assigned back to the column: calling replace(..., inplace=True)
# on df[col] mutates a temporary selection, which warns on recent pandas and
# leaves `df` unchanged once Copy-on-Write is active.
years_text_to_number = {'Less than 1 year': '0', 'More than 50 years': '51'}

df['YearsCode'] = df['YearsCode'].replace(years_text_to_number)
df['YearsCodePro'] = df['YearsCodePro'].replace(years_text_to_number)

In [114]:
# Distinct YearsCode values after the text buckets were mapped to '0' / '51'.
# Only a cell's last expression is displayed, so each column is inspected in
# its own cell (YearsCodePro follows in the next one).
df['YearsCode'].unique()

array(['4', nan, '3', '16', '13', '6', '8', '12', '2', '5', '17', '10',
       '14', '35', '7', '0', '30', '9', '26', '40', '19', '15', '20',
       '28', '25', '1', '22', '11', '33', '50', '41', '18', '34', '24',
       '23', '42', '27', '21', '36', '32', '39', '38', '31', '37', '51',
       '29', '44', '45', '48', '46', '43', '47', '49'], dtype=object)

In [115]:
# Distinct YearsCodePro values: digit strings and NaN remain after the replacement
df['YearsCodePro'].unique()

array([nan, '1', '0', '9', '3', '4', '10', '8', '2', '13', '18', '5',
       '14', '22', '23', '19', '35', '20', '25', '7', '15', '27', '6',
       '48', '12', '31', '11', '17', '16', '21', '29', '30', '26', '33',
       '28', '37', '40', '34', '24', '39', '38', '36', '32', '41', '45',
       '43', '51', '44', '42', '46', '49', '50', '47'], dtype=object)

In [121]:
# Cast both experience columns from digit strings to float
# (float, because the columns contain NaN).
for years_col in ('YearsCode', 'YearsCodePro'):
    df[years_col] = df[years_col].astype(float)

In [122]:
# Mean of YearsCode (NaN skipped). Approximate: 'Less than 1 year' was counted
# as 0 and 'More than 50 years' as 51.
df['YearsCode'].mean()

11.662114216834588

In [123]:
# Mean of YearsCodePro (NaN skipped). Approximate: 'Less than 1 year' was counted
# as 0 and 'More than 50 years' as 51.
df['YearsCodePro'].mean()


8.15634123044221