In [76]:
import numpy as np
import pandas as pd

In [77]:
# Toy frame with deliberately invalid entries: 'D' and '?' in Sex, 204 in Age.
df = pd.DataFrame(
    data={
        'Sex': ['M', 'F', 'F', 'D', '?'],
        'Age': [29, 30, 204, 25, 34],
    }
)
df

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,204
3,D,25
4,?,34


The DataFrame has no missing values, but it does contain invalid ones: 'D' and '?' in 'Sex', and 204 in 'Age'. So we have to fix them:

#### Finding Unique Values

In [78]:
# Distinct labels, then how often each one occurs, to spot invalid codes.
(
    df['Sex'].unique(),
    df['Sex'].value_counts(),
)

(array(['M', 'F', 'D', '?'], dtype=object),
 F    2
 M    1
 ?    1
 D    1
 Name: Sex, dtype: int64)

Then **replace** the invalid value with a valid one:

In [79]:
# Returns a new Series with 'D' mapped to 'F'; df itself is not modified.
df['Sex'].replace(to_replace='D', value='F')

0    M
1    F
2    F
3    F
4    ?
Name: Sex, dtype: object

**replace** also accepts a dictionary that maps each invalid value to its replacement:

In [80]:
# One call, several substitutions: each key is replaced by its value.
df['Sex'].replace(to_replace={'D': 'F', '?': 'M'})

0    M
1    F
2    F
3    F
4    M
Name: Sex, dtype: object

**Or, if several columns need fixing, call `replace` on the DataFrame itself with a dictionary keyed by column name:**

In [81]:
# Outer keys are column names, inner dicts map invalid value -> replacement.
# inplace=True mutates df directly, so the cell displays nothing.
df.replace(
    to_replace={
        'Sex': {'D': 'F', '?': 'M'},
        'Age': {204: 24},
    },
    inplace=True,
)


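Or use logical (boolean-mask) replacement. Note that the selection below comes back empty, because the previous cell already replaced 204 with 24 in place: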

In [82]:
# Ages above 100, selected with a boolean mask on the Age column.
df['Age'].loc[df['Age'] > 100]

Series([], Name: Age, dtype: int64)

In [83]:
# Ages above 100 are divided by 10 and rounded up.
# np.ceil rounds up; swap in np.floor to round down instead.
df.loc[df['Age'] > 100, 'Age'] = np.ceil(df.loc[df['Age'] > 100, 'Age'] / 10)

In [84]:
# Display the frame after the replacement steps above.
df

Unnamed: 0,Sex,Age
0,M,29.0
1,F,30.0
2,F,24.0
3,F,25.0
4,M,34.0


### Duplicates
Checking for duplicated values works slightly differently for a Series and for a DataFrame.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Ambassador name (index) -> country (value); countries repeat on purpose.
ambassadors = pd.Series({
    'Gerard Araud': 'France',
    'Kim Darroch': 'United Kingdom',
    'Peter Westmacott': 'United Kingdom',
    'Armando Varricchio': 'Italy',
    'Peter Wittig': 'German',
    'Peter Ammon': 'German',
    'Klaus Scharioth': 'German',
})

In [3]:
# Display the Series: index is the ambassador's name, value is the country.
ambassadors

Gerard Araud                  France
Kim Darroch           United Kingdom
Peter Westmacott      United Kingdom
Armando Varricchio             Italy
Peter Wittig                  German
Peter Ammon                   German
Klaus Scharioth               German
dtype: object

The two most important methods to deal with duplicates are **duplicated** and **drop_duplicates**
### duplicated(keep = 'first' | 'last' | False), default: 'first'
* first: mark duplicates as True except for the first occurrence
* last: mark duplicates as True except for the last occurrence
* False: mark all duplicates as True

In [20]:
# The three keep modes side by side ('first' is the default).
(
    ambassadors.duplicated(keep='first'),
    ambassadors.duplicated(keep='last'),
    ambassadors.duplicated(keep=False),
)

(Gerard Araud          False
 Kim Darroch           False
 Peter Westmacott       True
 Armando Varricchio    False
 Peter Wittig          False
 Peter Ammon            True
 Klaus Scharioth        True
 dtype: bool,
 Gerard Araud          False
 Kim Darroch            True
 Peter Westmacott      False
 Armando Varricchio    False
 Peter Wittig           True
 Peter Ammon            True
 Klaus Scharioth       False
 dtype: bool,
 Gerard Araud          False
 Kim Darroch            True
 Peter Westmacott       True
 Armando Varricchio    False
 Peter Wittig           True
 Peter Ammon            True
 Klaus Scharioth        True
 dtype: bool)

In [22]:
# Small roster in which a name repeats, once with a different position.
players = pd.DataFrame(
    data={
        'Name': ['Kobe Bryant', 'LeBron James', 'Kobe Bryant', 'Carmelo Anthony', 'Kobe Bryant'],
        'Pos': ['SG', 'SF', 'SG', 'SF', 'SF'],
    }
)

In [23]:
# Display the players frame (columns Name and Pos).
players

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
2,Kobe Bryant,SG
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


With a DataFrame, you can pass the **subset** parameter to specify which column(s) **duplicated()** should consider

In [27]:
# Compare rows on Name only; the last occurrence of each name is the one not flagged.
players.duplicated(subset='Name', keep='last')

0     True
1    False
2     True
3    False
4    False
dtype: bool

### Splitting Columns
Using **str.split(pat, n, expand)**
* pat: the separator (string or regular expression) to split on
* n: integer that limits the number of splits in the output (default -1). None, 0 and -1 are all interpreted as "return all splits"
* expand: if True, return a DataFrame/MultiIndex with one column per piece (expanding dimensionality); if False, return a Series/Index containing lists of strings

In [53]:
# Underscore-packed records (year_sex_country_children), some of them dirty or incomplete.
df = pd.DataFrame(
    data={
        'Data': [
            '1987_M_US_1',
            '1990?_M_UK_1',
            '1992_F_US_2',
            '1970?_M_IT_1',
            '1985_F_I T_2',
            '1985',
        ],
    }
)

In [54]:
# Without expand, every row becomes a list of its underscore-separated pieces.
df['Data'].str.split(pat='_')

0     [1987, M, US, 1]
1    [1990?, M, UK, 1]
2     [1992, F, US, 2]
3    [1970?, M, IT, 1]
4    [1985, F, I T, 2]
5               [1985]
Name: Data, dtype: object

In [56]:
# n=4 allows at most 4 splits (i.e. up to 5 pieces). These strings contain at most
# 3 underscores, so only columns 0-3 appear; n=3 would give the same result here.
# expand=True spreads the pieces over columns; the short last row ('1985') has
# only one piece, so its remaining columns are filled with missing values.
# n is passed by keyword: pandas 2.0 made every str.split argument except pat keyword-only.
df['Data'].str.split('_', n=4, expand=True)

Unnamed: 0,0,1,2,3
0,1987,M,US,1.0
1,1990?,M,UK,1.0
2,1992,F,US,2.0
3,1970?,M,IT,1.0
4,1985,F,I T,2.0
5,1985,,,


In [57]:
# Rebind df to the expanded pieces; the original 'Data' column no longer exists afterwards.
df = df['Data'].str.split(pat='_', expand=True)

In [60]:
# Give the positional columns 0-3 meaningful labels, then show the frame.
split_labels = ['Year', 'Sex', 'Country', 'Children']
df.columns = split_labels
df

Unnamed: 0,Year,Sex,Country,Children
0,1987,M,US,1.0
1,1990?,M,UK,1.0
2,1992,F,US,2.0
3,1970?,M,IT,1.0
4,1985,F,I T,2.0
5,1985,,,


You can also check the column's content with **str.contains** method:

In [64]:
# str.contains treats the pattern as a regular expression by default.
# '?' is a regex metacharacter, so it is escaped with a backslash to match it literally;
# the pattern is a raw string so Python does not read '\?' as an (invalid) string escape.
# Ordinary characters such as '1985' need no escaping.
df['Year'].str.contains('1985'), df['Year'].str.contains(r'\?')

(0    False
 1    False
 2    False
 3    False
 4     True
 5     True
 Name: Year, dtype: bool,
 0    False
 1     True
 2    False
 3     True
 4    False
 5    False
 Name: Year, dtype: bool)

Remove leading and trailing blanks with **str.strip** (lstrip and rstrip also exist). Blanks inside the string, as in 'I T', are left untouched.

Replace character with **str.replace**

In [66]:
# to_strip=None (the default) trims whitespace from both ends of each value.
df['Country'].str.strip(to_strip=None)

0      US
1      UK
2      US
3      IT
4     I T
5    None
Name: Country, dtype: object

In [67]:
# Delete every space, including the inner one that strip() leaves behind.
df['Country'].str.replace(pat=' ', repl='')

0      US
1      UK
2      US
3      IT
4      IT
5    None
Name: Country, dtype: object

In [75]:
# Keep the four digits captured in the named group 'year' and drop the '?' that follows them.
# regex=True is explicit: pandas 2.0 changed the default to regex=False, and a callable
# replacement is only allowed when the pattern is treated as a regular expression.
# (Worth exploring further: regular expressions, a.k.a. regex.)
df['Year'].str.replace(r'(?P<year>\d{4})\?', lambda m: m.group('year'), regex=True)

  df['Year'].str.replace(r'(?P<year>\d{4})\?', lambda m:m.group('year'))  # explore more about regex: regular expression


0    1987
1    1990
2    1992
3    1970
4    1985
5    1985
Name: Year, dtype: object

In [73]:
# Display the stored Year column; str.replace returns a new Series and was not
# assigned back, so the '?' suffixes are still present here.
df['Year']

0     1987
1    1990?
2     1992
3    1970?
4     1985
5     1985
Name: Year, dtype: object