## Cleaning non-null values

In [2]:
import pandas as pd
import numpy as np

In [3]:
### Toy dataset with deliberately invalid entries to clean up:
### 'D' and '?' are not valid Sex codes, and an Age of 290 is implausible.
df = pd.DataFrame({'Sex': list('MFFD?'), 'Age': [29, 30, 24, 290, 25]})

In [4]:
df

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,D,290
4,?,25


Finding Unique Values

In [5]:
### List the distinct values in the Sex column to spot invalid codes.
df['Sex'].unique()

array(['M', 'F', 'D', '?'], dtype=object)

In [6]:
### Count how often each Sex value occurs; rare values are candidates for cleaning.
df['Sex'].value_counts()

Sex
F    2
M    1
D    1
?    1
Name: count, dtype: int64

In [11]:
### Simple replace: map known-bad values to their corrections, column by column.
### The outer keys are column names; each inner dict maps old value -> new value.
### The result is only displayed here; it is not assigned back to df.
df.replace(to_replace={
    'Sex': {'?': 'F', 'D': 'M'},
    'Age': {290: 29},
})

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,M,29
4,F,25


In [12]:
### Pythonic way to handle age: first select the rows whose Age is above 100
### with a boolean mask, to inspect them before fixing.
df.loc[df['Age'].gt(100)]

Unnamed: 0,Sex,Age
3,D,290


In [13]:
### Ages above 100 are treated as having one digit too many (290 -> 29).
### Floor division keeps the values integer; true division (/) would produce
### floats and upcast the whole Age column whenever a value is not a multiple of 10.
### Note: this mutates df in place, so re-running the cell after a fix is a no-op
### only because no value remains above 100.
df.loc[df['Age'] > 100, 'Age'] //= 10

In [14]:
df

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,D,29
4,?,25


Duplicates

In [15]:
### Ambassador name -> country represented. The names (index) are unique,
### while the countries (values) repeat, which makes this a handy example
### for duplicate detection.
ambassadors = pd.Series({
    'Gerard Araud': 'France',
    'Kim Darroch': 'United Kingdom',
    'Peter Westmacott': 'United Kingdom',
    'Armando Varricchio': 'Italy',
    'Peter Witing': 'Germany',
    'Peter Ammon': 'Germany',
    'Klaus Scharioth': 'Germany',
})

In [16]:
ambassadors

Gerard Araud                  France
Kim Darroch           United Kingdom
Peter Westmacott      United Kingdom
Armando Varricchio             Italy
Peter Witing                 Germany
Peter Ammon                  Germany
Klaus Scharioth              Germany
dtype: object

In [18]:
ambassadors.duplicated()

### With the default keep='first', the first occurrence of a value is not flagged as a duplicate; only the subsequent occurrences are marked True.

Gerard Araud          False
Kim Darroch           False
Peter Westmacott       True
Armando Varricchio    False
Peter Witing          False
Peter Ammon            True
Klaus Scharioth        True
dtype: bool

In [19]:
### keep='last': the last occurrence of each value is left unflagged; earlier occurrences are marked True.
ambassadors.duplicated(keep='last')

Gerard Araud          False
Kim Darroch            True
Peter Westmacott      False
Armando Varricchio    False
Peter Witing           True
Peter Ammon            True
Klaus Scharioth       False
dtype: bool

In [20]:
### keep=False: every occurrence of a repeated value is marked True.
ambassadors.duplicated(keep=False)

Gerard Araud          False
Kim Darroch            True
Peter Westmacott       True
Armando Varricchio    False
Peter Witing           True
Peter Ammon            True
Klaus Scharioth        True
dtype: bool

In [21]:
### Drop repeated values, keeping the first occurrence of each (default keep='first').
### Returns a new Series; ambassadors itself is not modified.
ambassadors.drop_duplicates()

Gerard Araud                  France
Kim Darroch           United Kingdom
Armando Varricchio             Italy
Peter Witing                 Germany
dtype: object

In [22]:
### Same as above, but keep the last occurrence of each repeated value.
ambassadors.drop_duplicates(keep = 'last')

Gerard Araud                  France
Peter Westmacott      United Kingdom
Armando Varricchio             Italy
Klaus Scharioth              Germany
dtype: object

In [23]:
### keep=False removes every value that occurs more than once, leaving only values that appear exactly once.
ambassadors.drop_duplicates(keep=False)

Gerard Araud          France
Armando Varricchio     Italy
dtype: object

Duplicates in DataFrames

In [4]:
### Example frame for row-level duplicates: 'Kobe Bryant' appears three times,
### twice with Pos 'SG' (fully duplicated rows) and once with Pos 'SF'.
players = pd.DataFrame(data={
    'Name': ['Kobe Bryant', 'Lebron James', 'Kobe Bryant', 'Carmelo Anthony', 'Kobe Bryant'],
    'Pos': ['SG', 'SF', 'SG', 'SF', 'SF'],
})

In [5]:
players

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,Lebron James,SF
2,Kobe Bryant,SG
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


In [6]:
### Compare rows on the Name column only: any repeated name after its first occurrence is flagged.
players.duplicated(subset = ['Name'])

0    False
1    False
2     True
3    False
4     True
dtype: bool

In [7]:
### Same Name-only comparison, but the last occurrence of each name is the one left unflagged.
players.duplicated(subset=['Name'], keep = 'last')

0     True
1    False
2     True
3    False
4    False
dtype: bool

In [11]:
### Without subset, whole rows are compared: only the second (Kobe Bryant, SG) row is dropped,
### while (Kobe Bryant, SF) stays because its Pos differs.
players.drop_duplicates()

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,Lebron James,SF
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


Text Handling

In [12]:
### Raw records packed as 'Year_Sex_Country_NoChildren' strings, with
### deliberate noise: stray '?' after some years and stray spaces in the
### country codes. Note: this rebinds df, replacing the earlier Sex/Age frame.
df = pd.Series(
    ['1987_M_US _1', '1990?_M_UK_1', '1992_F_US_2', '1970?_M_  IT_1', '1985_F_I T_2'],
    name='Data',
).to_frame()

In [13]:
df

Unnamed: 0,Data
0,1987_M_US _1
1,1990?_M_UK_1
2,1992_F_US_2
3,1970?_M_ IT_1
4,1985_F_I T_2


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Data    5 non-null      object
dtypes: object(1)
memory usage: 168.0+ bytes


In [15]:
### Split each string on '_' — the result is a Series holding one list of parts per row.
df['Data'].str.split('_')

0      [1987, M, US , 1]
1      [1990?, M, UK, 1]
2       [1992, F, US, 2]
3    [1970?, M,   IT, 1]
4      [1985, F, I T, 2]
Name: Data, dtype: object

In [16]:
### expand=True spreads the parts into separate columns (labelled 0..3) instead of lists.
df['Data'].str.split('_', expand=True)

Unnamed: 0,0,1,2,3
0,1987,M,US,1
1,1990?,M,UK,1
2,1992,F,US,2
3,1970?,M,IT,1
4,1985,F,I T,2


In [17]:
### Replace df with the expanded frame (columns 0..3).
### NOTE: this overwrites its own input — after it runs, df no longer has a 'Data'
### column, so the cell cannot be re-run without re-creating df first.
df = df['Data'].str.split('_', expand = True)

In [18]:
df

Unnamed: 0,0,1,2,3
0,1987,M,US,1
1,1990?,M,UK,1
2,1992,F,US,2
3,1970?,M,IT,1
4,1985,F,I T,2


In [19]:
### Give the positional columns (0..3) meaningful names, in the order the fields appear in the raw string.
df.columns = ['Year','Sex','Country','No Children']

In [20]:
df

Unnamed: 0,Year,Sex,Country,No Children
0,1987,M,US,1
1,1990?,M,UK,1
2,1992,F,US,2
3,1970?,M,IT,1
4,1985,F,I T,2


In [21]:
### Check each Year for a '/' character; none of the values contain one, so every row is False.
df['Year'].str.contains('/')

0    False
1    False
2    False
3    False
4    False
Name: Year, dtype: bool

In [23]:
### Flag Year values that contain a stray '?'.
### str.contains treats its pattern as a regex by default, where '?' is a
### quantifier. regex=False matches the character literally and avoids the
### invalid '\?' escape sequence in a normal string literal (a SyntaxWarning
### on Python 3.12+).
df['Year'].str.contains('?', regex=False)


0    False
1     True
2    False
3     True
4    False
Name: Year, dtype: bool

In [24]:
### Remove every space from Country, including interior ones ('I T' -> 'IT').
### The result is only displayed; it is not assigned back to df['Country'].
df['Country'].str.replace(' ', '')

0    US
1    UK
2    US
3    IT
4    IT
Name: Country, dtype: object