In [1]:
import pandas as pd
import numpy as np

In [19]:
people = {
    'first_name': ['Corey', 'Jane', 'Lee', 'Lee', np.nan, None, "NA"],
    'last_name': ['Chou', 'Kalvin', 'Leo', 'Dan', np.nan, np.nan, "Missing"],
    'email': ['cc@gmail.com', 'jk@gmail.com', 'll@email.com', 'ld@email.com', None, 'anonymous@email.com', np.nan],
    'age': ['33', '35', '64', '19', None, np.nan, 'Missing']
}
df_people = pd.DataFrame(people)

In [9]:
df_people

Unnamed: 0,first_name,last_name,email,age
0,Corey,Chou,cc@gmail.com,33
1,Jane,Kalvin,jk@gmail.com,35
2,Lee,Leo,ll@email.com,64
3,Lee,Dan,ld@email.com,19
4,,,,
5,,,anonymous@email.com,
6,,Missing,,Missing


### Cleaning Data

In [11]:
# delete all rows that any columns has None data
df_people.dropna(axis='index', how='any')

Unnamed: 0,first_name,last_name,email,age
0,Corey,Chou,cc@gmail.com,33
1,Jane,Kalvin,jk@gmail.com,35
2,Lee,Leo,ll@email.com,64
3,Lee,Dan,ld@email.com,19


In [15]:
# deleting rows that all features are None data
df_people.dropna(axis="index", how='any')

Unnamed: 0,first_name,last_name,email,age
0,Corey,Chou,cc@gmail.com,33
1,Jane,Kalvin,jk@gmail.com,35
2,Lee,Leo,ll@email.com,64
3,Lee,Dan,ld@email.com,19


In [17]:
# deleting columns that all rows value are None
df_people.dropna(axis='columns', how='all')

Unnamed: 0,first_name,last_name,email,age
0,Corey,Chou,cc@gmail.com,33
1,Jane,Kalvin,jk@gmail.com,35
2,Lee,Leo,ll@email.com,64
3,Lee,Dan,ld@email.com,19
4,,,,
5,,,anonymous@email.com,
6,,Missing,,Missing


In [21]:
# deleting rows that specific feature is None
df_people.dropna(axis='index', subset='email')

Unnamed: 0,first_name,last_name,email,age
0,Corey,Chou,cc@gmail.com,33.0
1,Jane,Kalvin,jk@gmail.com,35.0
2,Lee,Leo,ll@email.com,64.0
3,Lee,Dan,ld@email.com,19.0
5,,,anonymous@email.com,


In [24]:
# need either first_name or email, but not the both.
df_people.dropna(axis='index', how='all', subset=['last_name', 'email'])

Unnamed: 0,first_name,last_name,email,age
0,Corey,Chou,cc@gmail.com,33
1,Jane,Kalvin,jk@gmail.com,35
2,Lee,Leo,ll@email.com,64
3,Lee,Dan,ld@email.com,19
5,,,anonymous@email.com,
6,,Missing,,Missing


For customized missing value

In [25]:
# dealing with customized missing value using replace.
df_people = pd.DataFrame(people)
df_people.replace('NA', np.nan, inplace=True)
df_people.replace('Missing', np.nan, inplace=True)

In [26]:
df_people

Unnamed: 0,first_name,last_name,email,age
0,Corey,Chou,cc@gmail.com,33.0
1,Jane,Kalvin,jk@gmail.com,35.0
2,Lee,Leo,ll@email.com,64.0
3,Lee,Dan,ld@email.com,19.0
4,,,,
5,,,anonymous@email.com,
6,,,,


In [27]:
# to check which value is None.
df_people.isna()

Unnamed: 0,first_name,last_name,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,True,True,True,True
5,True,True,False,True
6,True,True,True,True


In [28]:
# filling None data with specific values
df_people.fillna('Missing')

Unnamed: 0,first_name,last_name,email,age
0,Corey,Chou,cc@gmail.com,33
1,Jane,Kalvin,jk@gmail.com,35
2,Lee,Leo,ll@email.com,64
3,Lee,Dan,ld@email.com,19
4,Missing,Missing,Missing,Missing
5,Missing,Missing,anonymous@email.com,Missing
6,Missing,Missing,Missing,Missing


In [29]:
df_people.fillna(0)

Unnamed: 0,first_name,last_name,email,age
0,Corey,Chou,cc@gmail.com,33
1,Jane,Kalvin,jk@gmail.com,35
2,Lee,Leo,ll@email.com,64
3,Lee,Dan,ld@email.com,19
4,0,0,0,0
5,0,0,anonymous@email.com,0
6,0,0,0,0


Casting data type

In [30]:
df_people['age'].mean()
# make sure values in the column are numbers not string.

TypeError: can only concatenate str (not "int") to str

In [31]:
type(np.nan)

float

In [33]:
df_people['age'] = df_people['age'].astype(float)

In [34]:
df_people.dtypes

first_name     object
last_name      object
email          object
age           float64
dtype: object

In [35]:
df_people['age'].mean()

37.75

practicing

In [37]:
na_vals = ['NA', 'Missing']
df = pd.read_csv('../../data-2019/survey_results_public.csv', na_values=na_vals)
schema_df = pd.read_csv('../../data-2019/survey_results_schema.csv')

In [38]:
df['YearsCode'].head(10)

0      4
1    NaN
2      3
3      3
4     16
5     13
6      6
7      8
8     12
9     12
Name: YearsCode, dtype: object

In [39]:
df['YearsCode'] = df['YearsCode'].astype(float)

ValueError: could not convert string to float: 'Less than 1 year'

Found values out of expect

In [40]:
df['YearsCode'].unique()

array(['4', nan, '3', '16', '13', '6', '8', '12', '2', '5', '17', '10',
       '14', '35', '7', 'Less than 1 year', '30', '9', '26', '40', '19',
       '15', '20', '28', '25', '1', '22', '11', '33', '50', '41', '18',
       '34', '24', '23', '42', '27', '21', '36', '32', '39', '38', '31',
       '37', 'More than 50 years', '29', '44', '45', '48', '46', '43',
       '47', '49'], dtype=object)

In [41]:
df['YearsCode'].replace('Less than 1 year', 0, inplace=True)

In [42]:
df['YearsCode'].replace('More than 50 years', 51, inplace=True)

In [43]:
df['YearsCode'].unique()

array(['4', nan, '3', '16', '13', '6', '8', '12', '2', '5', '17', '10',
       '14', '35', '7', 0, '30', '9', '26', '40', '19', '15', '20', '28',
       '25', '1', '22', '11', '33', '50', '41', '18', '34', '24', '23',
       '42', '27', '21', '36', '32', '39', '38', '31', '37', 51, '29',
       '44', '45', '48', '46', '43', '47', '49'], dtype=object)

In [44]:
df['YearsCode'] = df['YearsCode'].astype(float)