In [111]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [5]:
# Toy frame with deliberately dirty values: 'D' and '?' are invalid Sex
# labels and 290 is an implausible Age. The cells below clean them up.
sex_values = ['M', 'F', 'F', 'D', '?']
age_values = [29, 30, 24, 290, 25]
df = pd.DataFrame({'Sex': sex_values, 'Age': age_values})

df

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,D,290
4,?,25


In [7]:
# Distinct labels present in the Sex column.
sex_column = df['Sex']
sex_column.unique()

array(['M', 'F', 'D', '?'], dtype=object)

In [10]:
# Frequency table: how many rows carry each Sex label.
df.loc[:, 'Sex'].value_counts()

F    2
M    1
D    1
?    1
Name: Sex, dtype: int64

In [11]:
# Map the stray label 'D' to 'F'. The result is only displayed, not assigned,
# so df itself is left unchanged.
df['Sex'].replace(to_replace='D', value='F')

0    M
1    F
2    F
3    F
4    ?
Name: Sex, dtype: object

In [13]:
# Several replacements at once: each key is an old label, each value its substitute.
sex_corrections = {'D': 'F', '?': 'M'}
df['Sex'].replace(sex_corrections)

0    M
1    F
2    F
3    F
4    M
Name: Sex, dtype: object

In [14]:
# Frame-level replace: the outer keys name the column, the inner dicts map
# old value -> new value within that column only.
column_corrections = {
    'Sex': {'D': 'F', '?': 'M'},
    'Age': {290: 90},
}
df.replace(column_corrections)

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,F,90
4,M,25


In [27]:
# Rows whose Age is above 90.
# NOTE: the saved output is empty because this cell (In [27]) was last executed
# after the age-correction cell (In [18]); on a fresh top-to-bottom run it
# shows the row with Age 290.
over_90 = df['Age'] > 90
df.loc[over_90]

Unnamed: 0,Sex,Age


In [18]:
# Ages above 90 are treated as entries with one digit too many (290 -> 29),
# so they are divided by ten.
# The same mask is used on both sides of the assignment: the original selected
# rows with `> 90` on the left but `> 100` on the right, so an age of 91-100
# would have had no matching right-hand value and been set to NaN by index
# alignment.
implausible_age = df['Age'] > 90
df.loc[implausible_age, 'Age'] = df.loc[implausible_age, 'Age'] / 10

In [19]:
# Display df after the in-place age correction from the previous cell.
df

Unnamed: 0,Sex,Age
0,M,29.0
1,F,30.0
2,F,24.0
3,D,29.0
4,?,25.0


Duplicated Data

In [28]:
# Series mapping ambassador name (index) -> country (value). Some countries
# appear more than once, which the duplicate-handling cells below rely on.
countries = [
    'France',
    'United Kingdom',
    'United Kingdom',
    'Italy',
    'Germany',
    'Germany',
    'Germany',
]
ambassador_names = [
    'Gerard Araud',
    'Kim Darroch',
    'Peter Westmacott',
    'Armando Varricchio',
    'Peter Wittig',
    'Peter Ammon',
    'Klaus Scharioth',
]
ambassador = pd.Series(countries, index=ambassador_names)

In [31]:
# Display the Series: index is the ambassador's name, value is the country.
ambassador

Gerard Araud                  France
Kim Darroch           United Kingdom
Peter Westmacott      United Kingdom
Armando Varricchio             Italy
Peter Wittig                 Germany
Peter Ammon                  Germany
Klaus Scharioth              Germany
dtype: object

In [32]:
# Boolean mask of repeated values. With keep='first' (the default) the first
# occurrence of each value is False and every later repeat is True.
ambassador.duplicated(keep='first')

Gerard Araud          False
Kim Darroch           False
Peter Westmacott       True
Armando Varricchio    False
Peter Wittig          False
Peter Ammon            True
Klaus Scharioth        True
dtype: bool

In [34]:
# Reverse of the default: the last occurrence of each value is treated as the
# original (False) and the earlier occurrences are flagged as duplicates (True).
keep_policy = 'last'
ambassador.duplicated(keep=keep_policy)

Gerard Araud          False
Kim Darroch            True
Peter Westmacott      False
Armando Varricchio    False
Peter Wittig           True
Peter Ammon            True
Klaus Scharioth       False
dtype: bool

In [35]:
# keep=False flags every entry whose value occurs more than once, including
# the first and last occurrences.
flag_all_repeats = False
ambassador.duplicated(keep=flag_all_repeats)

Gerard Araud          False
Kim Darroch            True
Peter Westmacott       True
Armando Varricchio    False
Peter Wittig           True
Peter Ammon            True
Klaus Scharioth        True
dtype: bool

In [38]:
# Drop repeated values, keeping the first occurrence of each (the default).
ambassador.drop_duplicates(keep='first')

Gerard Araud                  France
Kim Darroch           United Kingdom
Armando Varricchio             Italy
Peter Wittig                 Germany
dtype: object

In [39]:
# Reverse of the default drop_duplicates: keep the last occurrence of each
# value and drop the earlier ones.
keep_policy = 'last'
ambassador.drop_duplicates(keep=keep_policy)

Gerard Araud                  France
Peter Westmacott      United Kingdom
Armando Varricchio             Italy
Klaus Scharioth              Germany
dtype: object

In [40]:
# keep=False removes every entry whose value is repeated; only values that
# occur exactly once survive.
drop_all_repeats = False
ambassador.drop_duplicates(keep=drop_all_repeats)

Gerard Araud          France
Armando Varricchio     Italy
dtype: object

Duplicates in DataFrame

In [42]:
# Player/position frame. Rows 0 and 2 are identical in every column, while
# row 4 repeats only the Name; this lets the cells below contrast full-row
# and per-column duplicate checks.
player_names = [
    'Kobe Bryant',
    'Lebron James',
    'Kobe Bryant',
    'Carmelo Anthony',
    'Kobe Bryant',
]
player_positions = ['SG', 'SF', 'SG', 'SF', 'SF']
df2 = pd.DataFrame({'Name': player_names, 'Pos': player_positions})

In [43]:
# Display the player/position frame.
df2

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,Lebron James,SF
2,Kobe Bryant,SG
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


In [44]:
# A row counts as a duplicate only when it matches an earlier row in every column.
df2.duplicated(keep='first')

0    False
1    False
2     True
3    False
4    False
dtype: bool

In [45]:
# Compare on the Name column only: a repeated name is a duplicate regardless of Pos.
df2.duplicated(subset='Name')

0    False
1    False
2     True
3    False
4     True
dtype: bool

In [46]:
# Compare on the Pos column only.
df2.duplicated(subset='Pos', keep='first')

0    False
1    False
2     True
3     True
4     True
dtype: bool

In [47]:
# Name-only comparison where the last occurrence of each name is the one kept as original.
df2.duplicated(subset='Name', keep='last')

0     True
1    False
2     True
3    False
4    False
dtype: bool

In [49]:
# Name-only comparison flagging every row whose name appears more than once.
df2.duplicated(keep=False, subset='Name')

0     True
1    False
2     True
3    False
4     True
dtype: bool

In [50]:
# Remove rows that fully repeat an earlier row; the first occurrence stays.
df2.drop_duplicates(keep='first')

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,Lebron James,SF
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


In [52]:
# Remove every row that has an exact copy elsewhere in the frame, originals included.
df2.drop_duplicates(subset=None, keep=False)

Unnamed: 0,Name,Pos
1,Lebron James,SF
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


Splitting Column

In [86]:
# Single-column frame of underscore-delimited records with four fields each.
# Two of the leading fields carry a stray trailing '?'.
packed_records = [
    '1996_M_ID_1',
    '1998?_F_UK_17',
    '2000_M_US_14',
    '2001?_M_US_11',
    '2002_F_US_9',
]
df3 = pd.DataFrame({'Data': packed_records})

In [87]:
# Display the packed, underscore-delimited records.
df3

Unnamed: 0,Data
0,1996_M_ID_1
1,1998?_F_UK_17
2,2000_M_US_14
3,2001?_M_US_11
4,2002_F_US_9


In [88]:
# Split each record on underscores; every row becomes a list of its fields.
df3['Data'].str.split(pat='_')

0      [1996, M, ID, 1]
1    [1998?, F, UK, 17]
2     [2000, M, US, 14]
3    [2001?, M, US, 11]
4      [2002, F, US, 9]
Name: Data, dtype: object

In [89]:
# expand=True spreads the split fields across separate columns (0, 1, 2, 3)
# instead of returning one list per row.
data_column = df3['Data']
data_column.str.split('_', expand=True)

Unnamed: 0,0,1,2,3
0,1996,M,ID,1
1,1998?,F,UK,17
2,2000,M,US,14
3,2001?,M,US,11
4,2002,F,US,9


In [90]:
# Split the packed records into one column per field. With expand=True,
# str.split already returns a DataFrame, so no extra pd.DataFrame(...) wrapper
# is needed.
df4 = df3['Data'].str.split('_', expand=True)

In [100]:
# Replace the positional column labels (0-3) from the split with descriptive names.
col_name = [
    'Years',
    'Gender',
    'Country',
    'Day',
]
df4.columns = col_name

In [101]:
# Display the split frame with its named columns.
df4

Unnamed: 0,Years,Gender,Country,Day
0,1996,M,ID,1
1,1998?,F,UK,17
2,2000,M,US,14
3,2001?,M,US,11
4,2002,F,US,9


In [102]:
# Flag year strings that contain a literal '?'. The pattern is a regular
# expression, so '?' must be escaped. A raw string is used because '\?' in a
# normal string literal is an invalid escape sequence that Python warns about.
df4['Years'].str.contains(r'\?')

0    False
1     True
2    False
3     True
4    False
Name: Years, dtype: bool

In [103]:
# Flag country codes that contain the letter 'U'.
df4['Country'].str.contains(pat='U')

0    False
1     True
2     True
3     True
4     True
Name: Country, dtype: bool

In [104]:
# Trim leading and trailing whitespace from each country code. Whitespace
# inside a value is not touched by strip().
df4['Country'].str.strip(to_strip=None)

0    ID
1    UK
2    US
3    US
4    US
Name: Country, dtype: object

In [105]:
# Remove every space character, including ones inside the value.
# regex=False states that ' ' is a literal, so the result does not depend on
# the pandas version's default for the regex argument.
df4['Country'].str.replace(' ', '', regex=False)

0    ID
1    UK
2    US
3    US
4    US
Name: Country, dtype: object

In [109]:
# Strip the trailing '?' from year strings such as '1998?': the named group
# captures the four digits and the callable returns only that group.
# regex=True is passed explicitly. Relying on the default is what produced the
# warning in the saved output, and newer pandas versions default to
# regex=False, which does not accept a callable replacement.
df4['Years'].str.replace(r'(?P<years>\d{4})\?', lambda m: m.group('years'), regex=True)

  """Entry point for launching an IPython kernel.


0    1996
1    1998
2    2000
3    2001
4    2002
Name: Years, dtype: object