## Cleaning data, Missing values

In [None]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [None]:
# Country names with one missing entry (NaN) to demonstrate null handling on a Series
country_values = ['USA', 'CANADA', np.nan, 'MEXICO']
my_series = Series(country_values)
my_series

0       USA
1    CANADA
2       NaN
3    MEXICO
dtype: object

In [None]:
# Four rows over three unnamed columns; only the first row is complete
# and the last row is entirely missing
raw_rows = [
    ['a', 1.2, 1],
    [np.nan, 3.2, 6],
    ['c', np.nan, 2],
    [np.nan, np.nan, np.nan],
]
df = DataFrame(raw_rows)
df

Unnamed: 0,0,1,2
0,a,1.2,1.0
1,,3.2,6.0
2,c,,2.0
3,,,


### Dealing with missing values

#### isnull(), notnull()

In [None]:
my_series.isnull()  # boolean mask: True where the value is missing (NaN)

0    False
1    False
2     True
3    False
dtype: bool

In [None]:
my_series.notnull()  # boolean mask: True where a value is present (inverse of isnull)

0     True
1     True
2    False
3     True
dtype: bool

In [None]:
df.isnull()  # same check applied element-wise: True for every missing cell of the DataFrame

Unnamed: 0,0,1,2
0,False,False,False
1,True,False,False
2,False,True,False
3,True,True,True


In [None]:
df.notnull()  # element-wise inverse of df.isnull(): True for every cell that holds a value

Unnamed: 0,0,1,2
0,True,True,True
1,False,True,True
2,True,False,True
3,False,False,False


#### dropna()

In [None]:
my_series.dropna()  # return a new Series without the null entries; the original index labels are kept

0       USA
1    CANADA
3    MEXICO
dtype: object

In [None]:
# With default arguments, dropna() removes every row that contains at least one null value
clean_df = df.dropna()
clean_df

Unnamed: 0,0,1,2
0,a,1.2,1.0


In [None]:
df.dropna(how='all')  # remove only the rows in which every value is missing

Unnamed: 0,0,1,2
0,a,1.2,1.0
1,,3.2,6.0
2,c,,2.0


In [None]:
# axis=1 drops columns (not rows) that contain missing data;
# here every column has at least one null, so no columns remain
df.dropna(axis=1)

0
1
2
3


We can use `thresh` to set the minimum number of non-null values a row needs in order to be kept:

In [None]:
df.dropna(thresh=2)  # keep only rows with at least 2 non-null values

Unnamed: 0,0,1,2
0,a,1.2,1.0
1,,3.2,6.0
2,c,,2.0


In [None]:
df.dropna(thresh=3)  # keep only rows with at least 3 non-null values

Unnamed: 0,0,1,2
0,a,1.2,1.0


#### fillna()

In [None]:
my_series.fillna('Unknown')  # replace the missing values in the Series with a placeholder

0        USA
1     CANADA
2    Unknown
3     MEXICO
dtype: object

In [None]:
df.fillna(1)  # replace every missing value in the DataFrame with the same scalar

Unnamed: 0,0,1,2
0,a,1.2,1.0
1,1,3.2,6.0
2,c,1.0,2.0
3,1,1.0,1.0


In [None]:
# The dictionary maps column label -> fill value, so each column gets its own replacement
df.fillna({0:-1, 1:10, 2:1})

Unnamed: 0,0,1,2
0,a,1.2,1.0
1,-1,3.2,6.0
2,c,10.0,2.0
3,-1,10.0,1.0


In [None]:
# fillna() returns a new object; its result is discarded here,
# so the original df still contains its missing values
df.fillna(0)
df

Unnamed: 0,0,1,2
0,a,1.2,1.0
1,,3.2,6.0
2,c,,2.0
3,,,


In [None]:
# To modify the existing object instead of getting a new one, pass inplace=True
df.fillna(0, inplace=True)

# df itself now has its missing values replaced by 0
df

Unnamed: 0,0,1,2
0,a,1.2,1.0
1,0,3.2,6.0
2,c,0.0,2.0
3,0,0.0,0.0


#### Removing duplicates

In [None]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [None]:
# Integer Series in which the values 1 and 3 each appear twice
repeated_values = [1, 2, 3, 1, 5, 7, 3]
dup_ser = Series(repeated_values)
dup_ser

0    1
1    2
2    3
3    1
4    5
5    7
6    3
dtype: int64

In [None]:
# DataFrame whose last row ('Y', 3) repeats the row before it
c1_values = ['X', 'Y', 'X', 'X', 'Y', 'Y']
c2_values = [1, 1, 2, 3, 3, 3]
duo_df = pd.DataFrame({'C1': c1_values, 'C2': c2_values})
duo_df

Unnamed: 0,C1,C2
0,X,1
1,Y,1
2,X,2
3,X,3
4,Y,3
5,Y,3


In [None]:
# Series: True where the value already appeared earlier (the first occurrence stays False)
dup_ser.duplicated()

0    False
1    False
2    False
3     True
4    False
5    False
6     True
dtype: bool

In [None]:
# DataFrame: True where the whole row repeats an earlier row
duo_df.duplicated()

0    False
1    False
2    False
3    False
4    False
5     True
dtype: bool

In [None]:
dup_ser.drop_duplicates()  # remove repeated values from the Series, keeping the first occurrence

0    1
1    2
2    3
4    5
5    7
dtype: int64

In [None]:
duo_df.drop_duplicates()  # remove rows that are exact repeats of an earlier row

Unnamed: 0,C1,C2
0,X,1
1,Y,1
2,X,2
3,X,3
4,Y,3


In [None]:
# Compare only column C1: keep the first row for each distinct C1 value
duo_df.drop_duplicates(['C1'])

Unnamed: 0,C1,C2
0,X,1
1,Y,1


In [None]:
# Compare only column C2: keep the first row for each distinct C2 value
duo_df.drop_duplicates(['C2'])

Unnamed: 0,C1,C2
0,X,1
2,X,2
3,X,3


In [None]:
# keep='last' keeps the last duplicate row instead of the default first one
duo_df.drop_duplicates(['C1', 'C2'], keep='last')

Unnamed: 0,C1,C2
0,X,1
1,Y,1
2,X,2
3,X,3
5,Y,3


#### Replace

In [None]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [None]:
# Series in which large negative sentinel numbers stand in for missing or incorrect values
raw_readings = [100, -999999., 300, -999999., -100000., 200]
series_data = Series(raw_readings)
series_data

0       100.0
1   -999999.0
2       300.0
3   -999999.0
4   -100000.0
5       200.0
dtype: float64

In [None]:
# Both columns mix numbers with two kinds of "missing": empty strings and NaN
column_a = [1, '', 3, np.nan, 4]
column_b = [6, 7, 9, '', np.nan]
df_data = DataFrame({'A': column_a, 'B': column_b})
df_data

Unnamed: 0,A,B
0,1.0,6.0
1,,7.0
2,3.0,9.0
3,,
4,4.0,


In [None]:
# -999999 appears to be a sentinel for missing data, so turn it into NaN
series_data.replace(-999999, np.nan)

0       100.0
1         NaN
2       300.0
3         NaN
4   -100000.0
5       200.0
dtype: float64

In [None]:
# A list of values to replace: every listed value becomes the same replacement
series_data.replace([-999999, -100000], np.nan)

0    100.0
1      NaN
2    300.0
3      NaN
4      NaN
5    200.0
dtype: float64

In [None]:
# Treat both NaN and empty strings as missing and replace them with 0
df_data.replace([np.nan,''],0)

Unnamed: 0,A,B
0,1,6
1,0,7
2,3,9
3,0,0
4,4,0


In [None]:
# Parallel lists: each value is replaced by the entry at the same position in the second list
series_data.replace([-999999, -100000], [np.nan, 0])

0    100.0
1      NaN
2    300.0
3      NaN
4      0.0
5    200.0
dtype: float64

In [None]:
# The same replacements expressed as an {old value: new value} dictionary
series_data.replace({-999999 : np.nan, -100000 : 0})

0    100.0
1      NaN
2    300.0
3      NaN
4      0.0
5    200.0
dtype: float64

In [None]:
df_data  # show the original again: the replace() call above returned a new object and left it unchanged

Unnamed: 0,A,B
0,1.0,6.0
1,,7.0
2,3.0,9.0
3,,
4,4.0,


In [None]:
# Per-column targets: '' in column A and NaN in column B become 0;
# the NaN in A and the '' in B are left untouched
df_data.replace({'A': '', 'B': np.nan}, 0)

Unnamed: 0,A,B
0,1.0,6.0
1,0.0,7.0
2,3.0,9.0
3,,
4,4.0,0.0


In [None]:
# Nested dictionary {column: {old value: new value}} gives each column its own mapping
df_data.replace({'A': {np.nan: 0, '': -1}, 'B' : {np.nan : 10, '' : 100}})

Unnamed: 0,A,B
0,1,6
1,-1,7
2,3,9
3,0,100
4,4,10


### Merge Data

In [None]:
import pandas as pd
from pandas import Series, DataFrame

In [None]:
# Left table: keys K0-K3 with value columns A and B
left_data = {
    "key": ["K0", "K1", "K2", "K3"],
    "A": ["A0", "A1", "A2", "A3"],
    "B": ["B0", "B1", "B2", "B3"],
}
left = DataFrame(left_data)
left

Unnamed: 0,key,A,B
0,K0,A0,B0
1,K1,A1,B1
2,K2,A2,B2
3,K3,A3,B3


In [None]:
# Right table: keys K2-K5 (only K2 and K3 also occur in the left table) with value columns C and D
right_data = {
    "key": ["K2", "K3", "K4", "K5"],
    "C": ["C0", "C1", "C2", "C3"],
    "D": ["D0", "D1", "D2", "D3"],
}
right = DataFrame(right_data)
right

Unnamed: 0,key,C,D
0,K2,C0,D0
1,K3,C1,D1
2,K4,C2,D2
3,K5,C3,D3


In [None]:
# Merge the dataframes; with no 'on' argument the join uses the columns
# common to both frames (here only 'key')
result = pd.merge(left, right)
result

Unnamed: 0,key,A,B,C,D
0,K2,A2,B2,C0,D0
1,K3,A3,B3,C1,D1


In [None]:
# 'on' names the shared column to join on
result = pd.merge(left, right, on='key')
result

Unnamed: 0,key,A,B,C,D
0,K2,A2,B2,C0,D0
1,K3,A3,B3,C1,D1


In [None]:
# 'how' selects the join type (default 'inner'); 'left' keeps every row of
# the left frame and fills the unmatched right-hand columns with NaN
result = pd.merge(left, right, on='key', how='left')
result

Unnamed: 0,key,A,B,C,D
0,K0,A0,B0,,
1,K1,A1,B1,,
2,K2,A2,B2,C0,D0
3,K3,A3,B3,C1,D1


In [None]:
# 'right' keeps every row of the right frame and fills the unmatched left-hand columns with NaN
result = pd.merge(left, right, on='key', how='right')
result

Unnamed: 0,key,A,B,C,D
0,K2,A2,B2,C0,D0
1,K3,A3,B3,C1,D1
2,K4,,,C2,D2
3,K5,,,C3,D3


In [None]:
# 'outer' keeps the keys of both frames, filling the gaps on either side with NaN
result = pd.merge(left, right, on='key', how='outer')
result

Unnamed: 0,key,A,B,C,D
0,K0,A0,B0,,
1,K1,A1,B1,,
2,K2,A2,B2,C0,D0
3,K3,A3,B3,C1,D1
4,K4,,,C2,D2
5,K5,,,C3,D3


You can merge on multiple columns by passing a list to `on`:

In [None]:
# Left table with two candidate key columns (k, v) and a payload column x
left_columns = {
    "k": ["K0", "K1", "K2"],
    "v": [1, 2, 3],
    "x": [7, 5, 6],
}
left = DataFrame(left_columns)
left

Unnamed: 0,k,v,x
0,K0,1,7
1,K1,2,5
2,K2,3,6


In [None]:
# Right table with the same key columns (k, v) and a payload column y
right_columns = {
    "k": ["K0", "K0", "K3"],
    "v": [1, 5, 6],
    "y": [3, 4, 5],
}
right = DataFrame(right_columns)
right

Unnamed: 0,k,v,y
0,K0,1,3
1,K0,5,4
2,K3,6,5


In [None]:
# Rows match only when both 'k' and 'v' are equal in the two frames
result = pd.merge(left, right, on=["k","v"])
result

Unnamed: 0,k,v,x,y
0,K0,1,7,3


In [None]:
# Outer join on both keys: unmatched (k, v) pairs from either frame are kept, with NaN in the other frame's column
result = pd.merge(left, right, on=["k","v"], how="outer")
result

Unnamed: 0,k,v,x,y
0,K0,1,7.0,3.0
1,K1,2,5.0,
2,K2,3,6.0,
3,K0,5,,4.0
4,K3,6,,5.0


Suffixes are added to overlapping column names that are not join keys:

In [None]:
# 'v' exists in both frames but is not a join key, so by default it is
# disambiguated with the suffixes '_x' (left frame) and '_y' (right frame)
result = pd.merge(left, right, on="k")
result

Unnamed: 0,k,v_x,x,v_y,y
0,K0,1,7,1,3
1,K0,1,7,5,4


In [None]:
# 'suffixes' sets the (left, right) suffixes explicitly; ("_x", "_y") are also
# the defaults, so the column names are the same as without the argument
result = pd.merge(left, right, on="k",suffixes=("_x","_y"))
result

Unnamed: 0,k,v_x,x,v_y,y
0,K0,1,7,1,3
1,K0,1,7,5,4


Suffixes when the only non-key column has the same name in both frames:

In [None]:
# Left table: key column k plus a value column v
left_keys = ["K0", "K1", "K2"]
left_values = [1, 2, 3]
left = DataFrame({"k": left_keys, "v": left_values})
left

Unnamed: 0,k,v
0,K0,1
1,K1,2
2,K2,3


In [None]:
# Right table: same column names as the left table, with key K0 appearing twice
right_keys = ["K0", "K0", "K3"]
right_values = [4, 5, 6]
right = DataFrame({"k": right_keys, "v": right_values})
right

Unnamed: 0,k,v
0,K0,4
1,K0,5
2,K3,6


In [None]:
# Both frames have a non-key column 'v', so the result carries v_x (from left) and v_y (from right)
result = pd.merge(left, right, on="k")
result

Unnamed: 0,k,v_x,v_y
0,K0,1,4
1,K0,1,5


Merge by columns with different names:

In [None]:
# Left table: its join column is called 'key'
left_keys = ['A', 'B', 'C', 'D']
left_values = [1, 2, 3, 4]
left = pd.DataFrame({'key': left_keys, 'value': left_values})
left

Unnamed: 0,key,value
0,A,1
1,B,2
2,C,3
3,D,4


In [None]:
# Right table: the matching join column has a different name, 'col1'
right_keys = ['B', 'D', 'E', 'F']
right_values = [5, 6, 7, 8]
right = pd.DataFrame({'col1': right_keys, 'col2': right_values})
right

Unnamed: 0,col1,col2
0,B,5
1,D,6
2,E,7
3,F,8


In [None]:
# left_on / right_on name the join column of each frame when the names differ;
# both key columns ('key' and 'col1') appear in the result
result = pd.merge(left, right, left_on='key', right_on='col1')
result

Unnamed: 0,key,value,col1,col2
0,B,2,B,5
1,D,4,D,6


#### Join
The `join` method combines two dataframes on the basis of their indexes.

In [None]:
# Left frame, indexed by row labels row1-row3
left_labels = ['row1', 'row2', 'row3']
left_data = {'A': [1, 2, 3], 'B': [3, 4, 5]}
left = pd.DataFrame(left_data, index=left_labels)
left

Unnamed: 0,A,B
row1,1,3
row2,2,4
row3,3,5


In [None]:
# Right frame; it shares row1 and row2 with the left frame but has row4 instead of row3
right_labels = ['row1', 'row2', 'row4']
right_data = {'C': [5, 6, 7], 'D': [7, 8, 9]}
right = pd.DataFrame(right_data, index=right_labels)
right

Unnamed: 0,C,D
row1,5,7
row2,6,8
row4,7,9


In [None]:
# join: combine dataframes based on their indexes (default is a left join,
# so every row of 'left' is kept and unmatched right-hand columns are NaN)
left.join(right)

Unnamed: 0,A,B,C,D
row1,1,3,5.0,7.0
row2,2,4,6.0,8.0
row3,3,5,,


In [None]:
# inner join: keep only the index labels present in both frames
left.join(right, how='inner')

Unnamed: 0,A,B,C,D
row1,1,3,5,7
row2,2,4,6,8


In [None]:
# outer join: keep the index labels of both frames, filling the gaps with NaN
left.join(right, how='outer')

Unnamed: 0,A,B,C,D
row1,1.0,3.0,5.0,7.0
row2,2.0,4.0,6.0,8.0
row3,3.0,5.0,,
row4,,,7.0,9.0


### Concatenate

In [None]:
import numpy as np
import pandas as pd

#### Concat Series

In [None]:
# Integer production counts, labelled by car brand
brand_labels = ['Ford', 'Lexus', 'Volvo', 'Chervolet', 'Kia']
unit_counts = [7000, 3000, 4000, 5000, 4000]
cars_prod = pd.Series(unit_counts, index=brand_labels)
cars_prod

Ford         7000
Lexus        3000
Volvo        4000
Chervolet    5000
Kia          4000
dtype: int64

In [None]:
# One additional brand, stored as a float value
new_brand_labels = ['Bentley']
new_unit_counts = [2000.0]
new_prod = pd.Series(new_unit_counts, index=new_brand_labels)
new_prod

Bentley    2000.0
dtype: float64

In [None]:
# Concat vertically (axis=0 is the default): new_prod is appended below cars_prod;
# combining the int64 and float64 Series gives a float64 result
pd.concat([cars_prod,new_prod])

Ford         7000.0
Lexus        3000.0
Volvo        4000.0
Chervolet    5000.0
Kia          4000.0
Bentley      2000.0
dtype: float64

In [None]:
# Named Series of fruit names; the name is stored on the Series itself
fruit_names = ["Apple", "Banana", "Orange"]
series1 = pd.Series(fruit_names, name='Fruits')
series1

0     Apple
1    Banana
2    Orange
Name: Fruits, dtype: object

In [None]:
# Named Series of colour names, index-aligned with the fruit Series
color_names = ["Red", "Yellow", "Orange"]
series2 = pd.Series(color_names, name='Colors')
series2

0       Red
1    Yellow
2    Orange
Name: Colors, dtype: object

In [None]:
# Concatenate horizontally: the Series are placed side by side and each Series name becomes a column label
pd.concat([series1, series2], axis=1)


Unnamed: 0,Fruits,Colors
0,Apple,Red
1,Banana,Yellow
2,Orange,Orange


#### Concat dataframes

In [None]:
# Frame with columns A-D and index 0-3; each cell is "<column label><row label>"
df1_labels = [0, 1, 2, 3]
df1 = pd.DataFrame(
    {col: [f"{col}{n}" for n in df1_labels] for col in ("A", "B", "C", "D")},
    index=df1_labels,
)

In [None]:
# Frame with columns A-D and index 4-7; each cell is "<column label><row label>"
df2_labels = [4, 5, 6, 7]
df2 = pd.DataFrame(
    {col: [f"{col}{n}" for n in df2_labels] for col in ("A", "B", "C", "D")},
    index=df2_labels,
)

In [None]:
# Concat horizontally: rows are aligned on index labels; df1 and df2 share no
# labels, so each row is filled with NaN on the other frame's side
pd.concat([df1, df2], axis=1)

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,D.1
0,A0,B0,C0,D0,,,,
1,A1,B1,C1,D1,,,,
2,A2,B2,C2,D2,,,,
3,A3,B3,C3,D3,,,,
4,,,,,A4,B4,C4,D4
5,,,,,A5,B5,C5,D5
6,,,,,A6,B6,C6,D6
7,,,,,A7,B7,C7,D7


In [None]:
pd.concat([df1, df2], axis=0)  # concat vertically: the rows of df2 are appended below those of df1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7


In [None]:
# Frame whose columns are deliberately ordered A, C, B, D; cells are numbered 8-11
# while the index runs 1-4
df3 = pd.DataFrame(
    {col: [f"{col}{n}" for n in range(8, 12)] for col in ("A", "C", "B", "D")},
    index=[1, 2, 3, 4],
)

In [None]:
# Frame with columns B, D, F and index 2, 3, 6, 7; each cell is "<column label><row label>"
df4_labels = [2, 3, 6, 7]
df4 = pd.DataFrame(
    {col: [f"{col}{n}" for n in df4_labels] for col in ("B", "D", "F")},
    index=df4_labels,
)

In [None]:
# The frames have different columns: the result has the union of the columns (gaps are NaN)
# and keeps the original index labels, so labels 2 and 3 appear twice
result = pd.concat([df3, df4], axis=0)
result

Unnamed: 0,A,C,B,D,F
1,A8,C8,B8,D8,
2,A9,C9,B9,D9,
3,A10,C10,B10,D10,
4,A11,C11,B11,D11,
2,,,B2,D2,F2
3,,,B3,D3,F3
6,,,B6,D6,F6
7,,,B7,D7,F7


In [None]:
# ignore_index=True discards the original labels and renumbers the result from 0;
# sort=True sorts the column labels of the result
result = pd.concat([df3, df4], ignore_index=True, sort=True)
result

Unnamed: 0,A,B,C,D,F
0,A8,B8,C8,D8,
1,A9,B9,C9,D9,
2,A10,B10,C10,D10,
3,A11,B11,C11,D11,
4,,B2,,D2,F2
5,,B3,,D3,F3
6,,B6,,D6,F6
7,,B7,,D7,F7
