## Cleaning non-null values

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Small demo frame built row by row; 'D', '?' and 290 are the odd-looking
# values that the later cells inspect and clean.
df = pd.DataFrame(
    [('M', 29), ('F', 30), ('F', 24), ('D', 290), ('?', 25)],
    columns=['Sex', 'Age'],
)

In [4]:
# Show the whole frame to inspect the raw values.
df

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,D,290
4,?,25


Finding Unique Values

In [5]:
# List the distinct values in 'Sex' to spot unexpected categories.
df['Sex'].unique()

array(['M', 'F', 'D', '?'], dtype=object)

In [6]:
# Count how often each 'Sex' value occurs.
df['Sex'].value_counts()

Sex
F    2
M    1
D    1
?    1
Name: count, dtype: int64

In [11]:
### Simple replace

# Per-column {old: new} lookup. The corrected frame is only displayed here;
# nothing is assigned, so df keeps its original values.
df.replace(to_replace={
    'Sex': {'?': 'F', 'D': 'M'},
    'Age': {290: 29},
})

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,M,29
4,F,25


In [12]:
### Pythonic way to handle age

# Surface the rows whose age exceeds 100 before correcting them.
df.loc[df['Age'].gt(100)]

Unnamed: 0,Sex,Age
3,D,290


In [13]:
# Ages above 100 are scaled down by a factor of ten (290 -> 29), presumably
# to undo a stray trailing zero.
# The mask is computed once and reused on both sides of the assignment.
# Floor division keeps the results as integers, so 'Age' stays an integer
# column; true division ("/") produces floats that pandas has to cast back
# or, when the value is not a multiple of 10, upcast the whole column.
age_typo_mask = df['Age'] > 100
df.loc[age_typo_mask, 'Age'] = df.loc[age_typo_mask, 'Age'] // 10

In [14]:
# Re-display df to confirm the age correction took effect.
df

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,D,29
4,?,25


Duplicates

In [15]:
# Each person's name (index label) is paired with a country (value).
# Several countries appear more than once, which is what the
# duplicate-handling cells below operate on.
ambassadors = pd.Series({
    'Gerard Araud': 'France',
    'Kim Darroch': 'United Kingdom',
    'Peter Westmacott': 'United Kingdom',
    'Armando Varricchio': 'Italy',
    'Peter Witing': 'Germany',
    'Peter Ammon': 'Germany',
    'Klaus Scharioth': 'Germany',
})

In [16]:
# Display the Series: names form the index, countries are the values.
ambassadors

Gerard Araud                  France
Kim Darroch           United Kingdom
Peter Westmacott      United Kingdom
Armando Varricchio             Italy
Peter Witing                 Germany
Peter Ammon                  Germany
Klaus Scharioth              Germany
dtype: object

In [18]:
ambassadors.duplicated()

### With duplicated(), the first occurrence of a value is not flagged as a duplicate; every subsequent occurrence is.

Gerard Araud          False
Kim Darroch           False
Peter Westmacott       True
Armando Varricchio    False
Peter Witing          False
Peter Ammon            True
Klaus Scharioth        True
dtype: bool

In [19]:
# keep='last': every occurrence except the last one is flagged as a duplicate.
ambassadors.duplicated(keep='last')

Gerard Araud          False
Kim Darroch            True
Peter Westmacott      False
Armando Varricchio    False
Peter Witing           True
Peter Ammon            True
Klaus Scharioth       False
dtype: bool

In [20]:
# keep=False: every occurrence of a repeated value is flagged, including the first and the last.
ambassadors.duplicated(keep=False)

Gerard Araud          False
Kim Darroch            True
Peter Westmacott       True
Armando Varricchio    False
Peter Witing           True
Peter Ammon            True
Klaus Scharioth        True
dtype: bool

In [21]:
# Drop repeated countries, retaining the first occurrence of each.
ambassadors.drop_duplicates()

Gerard Araud                  France
Kim Darroch           United Kingdom
Armando Varricchio             Italy
Peter Witing                 Germany
dtype: object

In [22]:
# Drop repeated countries, retaining the last occurrence of each.
ambassadors.drop_duplicates(keep = 'last')

Gerard Araud                  France
Peter Westmacott      United Kingdom
Armando Varricchio             Italy
Klaus Scharioth              Germany
dtype: object

In [23]:
# keep=False: drop every value that occurs more than once; only values that appear exactly once remain.
ambassadors.drop_duplicates(keep=False)

Gerard Araud          France
Armando Varricchio     Italy
dtype: object

Duplicates in DataFrames