### __Pandas Data Wrangling: Cleansing__

_Find structure and value errors_

In [11]:
import pandas as pd

# Raw column labels, deliberately left messy (a space, upper case) so the
# following cells have something to cleanse.
header = ['Celestial bodies', 'MIN', 'MAX']

# One row per celestial body: [name, MIN value, MAX value].
# The Moon row holds floats, so both numeric columns end up as float64.
measurments = [
    ["Sun", 146, 152],
    ["Moon", 0.36, 0.41],
    ["Mercury", 82, 217],
    ["Venus", 38, 261],
    ["Mars", 56, 401],
    ["Jupiter", 588, 968],
    ["Saturn", 1195, 1660],
    ["Uranus", 2750, 3150],
    ["Neptune", 4300, 4700],
    ["Haley 's comet", 6, 5400],
]

# Assemble the table from the row list and the header labels.
df = pd.DataFrame(data=measurments, columns=header)

print(df)

  Celestial bodies      MIN      MAX
0              Sun   146.00   152.00
1             Moon     0.36     0.41
2          Mercury    82.00   217.00
3            Venus    38.00   261.00
4             Mars    56.00   401.00
5          Jupiter   588.00   968.00
6           Saturn  1195.00  1660.00
7           Uranus  2750.00  3150.00
8          Neptune  4300.00  4700.00
9   Haley 's comet     6.00  5400.00


Header names must be cleansed: no spaces and all lowercase. `rename()` returns a new DataFrame, so the result has to be reassigned to `df`; otherwise the column names won't change.

In [12]:
# Old header -> cleansed header (snake_case, no spaces, lower case).
columns_new = {
    "Celestial bodies": "celestial_bodies",
    "MIN": "min_distance",
    "MAX": "max_distance",
}

# rename() hands back a new DataFrame and leaves the original untouched,
# so the result is bound to `df` again to keep the new labels.
df = df.rename(mapper=columns_new, axis="columns")
print(df.columns)

Index(['celestial_bodies', 'min_distance', 'max_distance'], dtype='object')


Method without reassignment: __inplace=True__

In [13]:
# Start again from the uncleansed table so the rename can be demonstrated
# a second time, this time without reassigning the result.
header = ['Celestial bodies', 'MIN', 'MAX']

measurments = [
    ["Sun", 146, 152],
    ["Moon", 0.36, 0.41],
    ["Mercury", 82, 217],
    ["Venus", 38, 261],
    ["Mars", 56, 401],
    ["Jupiter", 588, 968],
    ["Saturn", 1195, 1660],
    ["Uranus", 2750, 3150],
    ["Neptune", 4300, 4700],
    ["Haley 's comet", 6, 5400],
]

df = pd.DataFrame(data=measurments, columns=header)

# With inplace=True the labels are changed on `df` itself and the call
# returns None, so there is nothing to assign.
# `columns_new` is the mapping built in the previous rename cell.
df.rename(
    columns=columns_new,
    inplace=True,
)
print(df.columns)

Index(['celestial_bodies', 'min_distance', 'max_distance'], dtype='object')


Loops for cleansing

In [15]:
def clean_column_name(name: str) -> str:
    """Return ``name`` normalised for use as a column label.

    Surrounding whitespace is removed, the text is lower-cased, and any
    remaining inner spaces are replaced with underscores.

    Parameters
    ----------
    name : str
        The raw column label.

    Returns
    -------
    str
        The cleansed label, e.g. ``" Celestial bodies "`` -> ``"celestial_bodies"``.
    """
    # Strip BEFORE replacing spaces: once leading/trailing spaces have been
    # turned into underscores, strip() can no longer remove them.
    return name.strip().lower().replace(" ", "_")


measurments = [['Sun', 146, 152],
               ['Moon', 0.36, 0.41],
               ['Mercury', 82, 217],
               ['Venus', 38, 261],
               ['Mars', 56, 401],
               ['Jupiter', 588, 968],
               ['Saturn', 1195, 1660],
               ['Uranus', 2750, 3150],
               ['Neptune', 4300, 4700],
               ['Haley \'s comet', 6, 5400]]

header = ['Celestial bodies', 'MIN', 'MAX']

df = pd.DataFrame(measurments, columns=header)

# Collect the cleansed label for every existing column, in order.
new_col_names = []

for old_name in df.columns:
    new_col_names.append(clean_column_name(old_name))

# Assigning the full list replaces all column labels at once.
df.columns = new_col_names
print(df.columns)

Index(['celestial_bodies', 'min', 'max'], dtype='object')


_Finding null values (None or NaN)_

None - NoneType

NaN - Not a Number

Unexpected null values: `0`, `?`, `NN`, `n/a`, or any special character

In [2]:
import pandas as pd

# Read the cholera dataset from the CSV file (path is relative to the
# notebook's working directory) and print the resulting frame.
df_cholera = pd.read_csv(filepath_or_buffer='DataSets/cholera.csv')
print(df_cholera)


     region                   country  total_cases  imported_cases  deaths  \
0      Азия                Афганистан         33.0             0.0     1.0   
1      Азия                     Индия        385.0             NaN     3.0   
2      Азия                      Иран        634.0           625.0     4.0   
3      Азия                     Йемен    1032481.0             0.0  2261.0   
4      Азия                     Китай         14.0             NaN     0.0   
5      Азия                     Катар          5.0             5.0     0.0   
6      Азия                  Малайзия          2.0             0.0     0.0   
7      Азия                     Непал          7.0             NaN     0.0   
8      Азия                       ОАЭ         12.0            12.0     0.0   
9      Азия         Саудовская Аравия          5.0             5.0     0.0   
10     Азия                  Сингапур          3.0             3.0     0.0   
11     Азия                   Таиланд          8.0             0

In [4]:
# Boolean frame of the same shape as df_cholera: True where a value is missing.
null_mask = df_cholera.isna()
print(null_mask)

    region  country  total_cases  imported_cases  deaths  case_fatality_rate  \
0    False    False        False           False   False               False   
1    False    False        False            True   False               False   
2    False    False        False           False   False               False   
3    False    False        False           False   False               False   
4    False    False        False            True   False               False   
5    False    False        False           False   False               False   
6    False    False        False           False   False               False   
7    False    False        False            True   False               False   
8    False    False        False           False   False               False   
9    False    False        False           False   False               False   
10   False    False        False           False   False               False   
11   False    False        False        

In [6]:
# Summing the boolean mask column-wise counts the missing values per column.
missing_per_column = df_cholera.isna().sum()
print(missing_per_column)

region                 0
country                0
total_cases            1
imported_cases         6
deaths                 1
case_fatality_rate     1
notes                 21
dtype: int64


_Substitute NaN values_

In [8]:
import pandas as pd

# Reload the dataset so the substitution starts from the original values.
df_cholera = pd.read_csv(filepath_or_buffer='DataSets/cholera.csv')

# fillna returns a new Series with every NaN replaced by 0; writing it back
# under the same key overwrites the 'imported_cases' column.
imported_cases_filled = df_cholera['imported_cases'].fillna(value=0)
df_cholera['imported_cases'] = imported_cases_filled
print(df_cholera)


     region                   country  total_cases  imported_cases  deaths  \
0      Азия                Афганистан         33.0             0.0     1.0   
1      Азия                     Индия        385.0             0.0     3.0   
2      Азия                      Иран        634.0           625.0     4.0   
3      Азия                     Йемен    1032481.0             0.0  2261.0   
4      Азия                     Китай         14.0             0.0     0.0   
5      Азия                     Катар          5.0             5.0     0.0   
6      Азия                  Малайзия          2.0             0.0     0.0   
7      Азия                     Непал          7.0             0.0     0.0   
8      Азия                       ОАЭ         12.0            12.0     0.0   
9      Азия         Саудовская Аравия          5.0             5.0     0.0   
10     Азия                  Сингапур          3.0             3.0     0.0   
11     Азия                   Таиланд          8.0             0