<a href="https://colab.research.google.com/github/KrishnaPandya-VGEC-IT/Data-Science-/blob/main/Data_Cleaning%5B3_Handling_not_null_values%5D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Toy dataset, one (Sex, Age) record per row. It contains the entries that
# later cells clean up: 'D' and '?' in Sex, and 290 in Age.
df = pd.DataFrame(
    [
        ('M', 29),
        ('F', 30),
        ('F', 290),
        ('D', 21),
        ('?', 66),
    ],
    columns=['Sex', 'Age'],
)
df

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,290
3,D,21
4,?,66


**Finding Unique values**

In [None]:
df['Sex'].unique() # distinct values in order of first appearance (no counts)

array(['M', 'F', 'D', '?'], dtype=object)

In [None]:
df['Sex'].value_counts()  # occurrences of each distinct value, most frequent first

F    2
M    1
D    1
?    1
Name: Sex, dtype: int64

In [None]:
# Swap every 'D' in the Sex column for 'F'. The result is a new Series;
# df itself is not modified.
df['Sex'].replace(to_replace='D', value='F')

0    M
1    F
2    F
3    F
4    ?
Name: Sex, dtype: object

In [None]:
df # unchanged: replace() returned a new Series instead of modifying df in place

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,290
3,D,21
4,?,66


In [None]:
# A mapping {old: new} replaces several different values in a single call.
df['Sex'].replace(to_replace={'D': 'F', '?': 'M'})

0    M
1    F
2    F
3    F
4    M
Name: Sex, dtype: object

In [None]:
# A nested mapping {column: {old: new}} cleans several columns in one call.
# The result is a new DataFrame; df itself is left untouched.
df.replace(to_replace={
    'Sex': {'D': 'F', '?': 'M'},
    'Age': {290: 29},
})

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,29
3,F,21
4,M,66


In [None]:
# Boolean-mask selection: keep only the rows whose Age exceeds 100.
df.loc[df['Age'].gt(100)]

Unnamed: 0,Sex,Age
2,F,290


In [None]:
# Scale ages above 100 down by a factor of ten (290 -> 29). Assigning through
# .loc writes into df itself, unlike the replace() calls earlier.
# Gotcha: "/" is true division and produces floats, which is why Age is shown
# as 29.0, 30.0, ... when df is displayed afterwards.
implausible_age = df['Age'] > 100
df.loc[implausible_age, 'Age'] = df.loc[implausible_age, 'Age'] / 10

In [None]:
# Row 2 now holds 29 instead of 290; 'D' and '?' are still present in Sex
# because the earlier replace() results were never assigned back.
df

Unnamed: 0,Sex,Age
0,M,29.0
1,F,30.0
2,F,29.0
3,D,21.0
4,?,66.0


**Duplicates**

In [None]:
# Series keyed by ordinal labels. 'United Kingdom' appears twice and
# 'Germany' three times, giving the duplicate-handling cells something to find.
ambassadors = pd.Series({
    'one': 'France',
    'two': 'United Kingdom',
    'three': 'United Kingdom',
    'four': 'United States',
    'five': 'Germany',
    'six': 'Germany',
    'seven': 'Germany',
})

In [None]:
# Full Series for reference: labels 'two'/'three' and 'five'/'six'/'seven' repeat a value.
ambassadors

one              France
two      United Kingdom
three    United Kingdom
four      United States
five            Germany
six             Germany
seven           Germany
dtype: object

In [None]:
# keep='first' (the default): the first occurrence of each value is False,
# every later repeat of that value is True.
ambassadors.duplicated(keep='first')

one      False
two      False
three     True
four     False
five     False
six       True
seven     True
dtype: bool

In [None]:
ambassadors.duplicated(keep='last') # the last occurrence of each value is False; every earlier occurrence is flagged True

one      False
two       True
three    False
four     False
five      True
six       True
seven    False
dtype: bool

In [None]:
ambassadors.duplicated(keep= False) # every member of a repeated group is True, including its first occurrence

one      False
two       True
three     True
four     False
five      True
six       True
seven     True
dtype: bool

In [None]:
# Keep only the first occurrence of each value (keep='first' is the default).
# The result is a new Series.
ambassadors.drop_duplicates(keep='first')

one             France
two     United Kingdom
four     United States
five           Germany
dtype: object

In [None]:
ambassadors.drop_duplicates(keep = 'last') # keep only the last occurrence of each value

one              France
three    United Kingdom
four      United States
seven           Germany
dtype: object

In [None]:
ambassadors.drop_duplicates(keep = False) # drop every value that occurs more than once; only values that appear exactly once remain

one            France
four    United States
dtype: object

**Duplicates in DataFrames**

In [None]:
# One (Name, Pos) record per row. 'Kishan Pandya' is listed twice with
# different positions, so the rows repeat by Name but not as whole rows.
players = pd.DataFrame(
    [
        ('Kishan Pandya', 'Bowler'),
        ('Aman Patel', 'Fielder'),
        ('Fenil Parmar', 'Bowler'),
        ('Kishan Pandya', 'Batsman'),
        ('Harsh Reddiar', 'Batsman'),
    ],
    columns=['Name', 'Pos'],
)

In [None]:
# Rows 0 and 3 share the Name 'Kishan Pandya' but differ in Pos.
players

Unnamed: 0,Name,Pos
0,Kishan Pandya,Bowler
1,Aman Patel,Fielder
2,Fenil Parmar,Bowler
3,Kishan Pandya,Batsman
4,Harsh Reddiar,Batsman


In [None]:
# With no subset, a row counts as a duplicate only if every column matches
# an earlier row. No two rows agree on both Name and Pos, so all are False.
players.duplicated(keep='first')

0    False
1    False
2    False
3    False
4    False
dtype: bool

In [None]:
players.duplicated(subset = ['Name']) # compare on Name only: the second 'Kishan Pandya' (row 3) is flagged

0    False
1    False
2    False
3     True
4    False
dtype: bool

In [None]:
players.duplicated(subset = ['Name'], keep = 'last') # the last row per Name is kept, so the earlier 'Kishan Pandya' (row 0) is flagged

0     True
1    False
2    False
3    False
4    False
dtype: bool

In [None]:
# Keep the first row for each Name and drop later rows with the same Name
# (row 3 is removed). The result is a new DataFrame.
players.drop_duplicates(subset=['Name'], keep='first')

Unnamed: 0,Name,Pos
0,Kishan Pandya,Bowler
1,Aman Patel,Fielder
2,Fenil Parmar,Bowler
4,Harsh Reddiar,Batsman


In [None]:
players.drop_duplicates(subset = 'Name' , keep = 'last') # keep the last row for each Name (row 3 survives, row 0 is dropped)

Unnamed: 0,Name,Pos
1,Aman Patel,Fielder
2,Fenil Parmar,Bowler
3,Kishan Pandya,Batsman
4,Harsh Reddiar,Batsman


In [None]:
players.drop_duplicates(subset = 'Name' , keep = False)  # drop every row whose Name is repeated (both 'Kishan Pandya' rows are removed)

Unnamed: 0,Name,Pos
1,Aman Patel,Fielder
2,Fenil Parmar,Bowler
4,Harsh Reddiar,Batsman


**Text handling**

**Splitting Columns**

In [None]:
# Single-column frame whose strings pack three '_'-separated fields each.
# NOTE: this rebinds df, replacing the Sex/Age frame used earlier in the notebook.
df = pd.Series(
    [
        'Kishan_23_M',
        'Aman_28_M',
        'Priyal_33_F',
        'Shruti_37_F',
    ],
    name='Data',
).to_frame()

In [None]:
# Each row holds three fields joined by '_' in a single string.
df

Unnamed: 0,Data
0,Kishan_23_M
1,Aman_28_M
2,Priyal_33_F
3,Shruti_37_F


In [None]:
# Vectorised counterpart of Python's str.split: each cell becomes a list of
# the pieces between '_' separators.
df['Data'].str.split(pat='_')

0    [Kishan, 23, M]
1      [Aman, 28, M]
2    [Priyal, 33, F]
3    [Shruti, 37, F]
Name: Data, dtype: object

In [None]:
# expand=True spreads the pieces into separate columns (labelled 0, 1, 2)
# instead of producing one column of lists.
# NOTE: this rebinds df, so the 'Data' column no longer exists afterwards.
# Re-running this cell without first re-running the cell that builds df
# raises KeyError.
df = df['Data'].str.split(pat='_', expand=True)
df

Unnamed: 0,0,1,2
0,Kishan,23,M
1,Aman,28,M
2,Priyal,33,F
3,Shruti,37,F


In [None]:
# Labels are applied positionally: column 0 -> 'Name', 1 -> 'Enrollment', 2 -> 'Sex'.
df.columns = ['Name','Enrollment','Sex'] # modifies df in place

In [None]:
# Same data as before, now with descriptive column labels instead of 0, 1, 2.
df

Unnamed: 0,Name,Enrollment,Sex
0,Kishan,23,M
1,Aman,28,M
2,Priyal,33,F
3,Shruti,37,F


In [None]:
# True where the name contains an uppercase 'A'. The match is case-sensitive
# (case=True is the default), so 'Kishan' with its lowercase 'a' is False.
df['Name'].str.contains('A', case=True)

0    False
1     True
2    False
3    False
Name: Name, dtype: bool

In [None]:
# Series.replace with a mapping swaps whole cell values (old -> new).
# The result is a new Series; df['Name'] itself is not modified.
df['Name'].replace(to_replace={
    'Aman': 'Chaman',
    'Kishan': 'Mission',
    'Shruti': 'Murti',
    'Priyal': 'Real',
})

0    Mission
1     Chaman
2       Real
3      Murti
Name: Name, dtype: object

In [None]:
# .str.replace substitutes text inside each string (substring match), unlike
# Series.replace, which matches whole cell values.
# regex=False is spelled out because the default differs between pandas
# versions (True before 2.0, False from 2.0 on). 'Aman' is meant as a
# literal string, not a pattern.
df['Name'].str.replace('Aman', 'Amanbhai', regex=False)

0      Kishan
1    Amanbhai
2      Priyal
3      Shruti
Name: Name, dtype: object