# **Data-cleaning process using Python pandas library**

In [None]:
# Import library

import pandas as pd

In [None]:
# Load the CSV into first_df, then do all cleaning on a separate copy (df)
# so first_df keeps the data exactly as it was read.

first_df = pd.read_csv(
    "/content/Data-cleaning-for-beginners-using-pandas.csv"
)
df = first_df.copy(deep=True)
df

Unnamed: 0,Index,Age,Salary,Rating,Location,Established,Easy Apply
0,0,44.0,$44k-$99k,5.4,"India,In",1999,TRUE
1,1,66.0,$55k-$66k,3.5,"New York,Ny",2002,TRUE
2,2,,$77k-$89k,-1.0,"New York,Ny",-1,-1
3,3,64.0,$44k-$99k,4.4,India In,1988,-1
4,4,25.0,$44k-$99k,6.4,Australia Aus,2002,-1
5,5,44.0,$77k-$89k,1.4,"India,In",1999,TRUE
6,6,21.0,$44k-$99k,0.0,"New York,Ny",-1,-1
7,7,44.0,$44k-$99k,-1.0,Australia Aus,-1,-1
8,8,35.0,$44k-$99k,5.4,"New York,Ny",-1,-1
9,9,22.0,$44k-$99k,7.7,"India,In",-1,TRUE


In [None]:
# Normalise the column labels: lower-case them and swap spaces for underscores
# (e.g. "Easy Apply" -> "easy_apply").

df.columns = df.columns.str.lower().str.replace(" ", "_")
df

Unnamed: 0,index,age,salary,rating,location,established,easy_apply
0,0,44.0,$44k-$99k,5.4,"India,In",1999,TRUE
1,1,66.0,$55k-$66k,3.5,"New York,Ny",2002,TRUE
2,2,,$77k-$89k,-1.0,"New York,Ny",-1,-1
3,3,64.0,$44k-$99k,4.4,India In,1988,-1
4,4,25.0,$44k-$99k,6.4,Australia Aus,2002,-1
5,5,44.0,$77k-$89k,1.4,"India,In",1999,TRUE
6,6,21.0,$44k-$99k,0.0,"New York,Ny",-1,-1
7,7,44.0,$44k-$99k,-1.0,Australia Aus,-1,-1
8,8,35.0,$44k-$99k,5.4,"New York,Ny",-1,-1
9,9,22.0,$44k-$99k,7.7,"India,In",-1,TRUE


In [None]:
# Count the missing (NaN) entries in each column

df.isna().sum()

index          0
age            7
salary         0
rating         1
location       0
established    0
easy_apply     0
dtype: int64

# *Age Column*

In [None]:
# The age column has 7 missing entries; they will be filled with the column
# average, so compute the mean of the ages that are present (NaN is skipped).

avg_age = df["age"].mean(skipna=True)
avg_age

39.04545454545455

In [None]:
# Impute the missing ages with the mean, round to one decimal place and then
# store the column as integers.
# Note: astype(int) drops the fractional part rather than rounding to nearest.

df['age'] = df['age'].fillna(avg_age).round(1).astype(int)
df['age']

0     44
1     66
2     39
3     64
4     25
5     44
6     21
7     44
8     35
9     22
10    55
11    44
12    39
13    25
14    66
15    44
16    19
17    39
18    35
19    32
20    39
21    35
22    19
23    39
24    13
25    55
26    39
27    52
28    39
Name: age, dtype: int64

# *Rating Column*

In [None]:
# The rating column has a single missing entry; it will be filled with the
# column average, so compute that mean here.
# NOTE(review): -1.0 entries are still in the column at this point and are
# therefore part of this mean - confirm that this is intended.

avg_rating = df['rating'].mean(skipna=True)
avg_rating

3.528571428571429

In [None]:
# Impute the missing rating with the mean, round to one decimal place, and
# finally rewrite the negative rating -1.0 as 1.0.

df['rating'] = (
    df['rating']
    .fillna(avg_rating)
    .round(1)
    .replace(-1.0, 1.0)
)
df['rating']

0     5.4
1     3.5
2     1.0
3     4.4
4     6.4
5     1.4
6     0.0
7     1.0
8     5.4
9     7.7
10    5.4
11    6.7
12    0.0
13    1.0
14    4.0
15    3.0
16    4.5
17    5.3
18    6.7
19    3.3
20    5.7
21    5.0
22    7.8
23    2.4
24    1.0
25    0.0
26    3.5
27    5.4
28    3.4
Name: rating, dtype: float64

# *Salary Column*

In [None]:
# Strip the '$' sign and expand the 'k' suffix into thousands,
# e.g. "$44k-$99k" -> "44000-99000".
#
# regex=False makes both replacements plain literal substitutions. '$' is a
# regex metacharacter (end-of-string anchor), so relying on the default value
# of `regex` is version-dependent and emits a FutureWarning on pandas 1.x.

df['salary'] = (
    df['salary']
    .str.replace('$', '', regex=False)
    .str.replace('k', '000', regex=False)
)
df['salary']

  df['salary'] = df['salary'].str.replace('$','')


0      44000-99000
1      55000-66000
2      77000-89000
3      44000-99000
4      44000-99000
5      77000-89000
6      44000-99000
7      44000-99000
8      44000-99000
9      44000-99000
10     10000-49000
11     10000-49000
12     44000-99000
13     44000-99000
14     44000-99000
15    88000-101000
16     19000-40000
17     44000-99000
18     44000-99000
19     44000-99000
20     44000-99000
21     44000-99000
22     55000-66000
23     44000-99000
24     44000-99000
25     44000-99000
26     55000-66000
27     44000-99000
28     39000-88000
Name: salary, dtype: object

In [None]:
# Split the "low-high" salary range at the '-' into two new columns.
# str.split returns strings, so cast both halves to int; otherwise the salary
# bounds stay object-dtype text and cannot be compared or aggregated as numbers.
# n=1 guarantees exactly two result columns for the two-column assignment.

df[['starting_salary', 'ending_salary']] = (
    df['salary'].str.split('-', n=1, expand=True).astype(int)
)

# The combined salary column is now redundant, so drop it
df = df.drop(columns=['salary'])

# *Easy Apply Column*

In [None]:
# Rewrite the '-1' entries as 'FALSE' so the column only holds 'TRUE'/'FALSE'.
# NOTE(review): the values remain strings (object dtype), not booleans.

df['easy_apply'] = df['easy_apply'].replace({'-1': 'FALSE'})
df['easy_apply']

0      TRUE
1      TRUE
2     FALSE
3     FALSE
4     FALSE
5      TRUE
6     FALSE
7     FALSE
8     FALSE
9      TRUE
10     TRUE
11    FALSE
12    FALSE
13     TRUE
14     TRUE
15    FALSE
16    FALSE
17     TRUE
18     TRUE
19     TRUE
20     TRUE
21    FALSE
22     TRUE
23     TRUE
24    FALSE
25     TRUE
26     TRUE
27    FALSE
28    FALSE
Name: easy_apply, dtype: object

# *Established Column*

In [None]:
# Mean founding year, used to fill in the rows where the year is recorded as -1.
# The -1 entries are placeholders, not real years, so they are masked out
# (turned into NaN, which mean() skips) before averaging. Averaging them in
# drags the result down to ~1638, which is not a plausible founding year.

avg_est = df['established'].mask(df['established'] == -1).mean()
avg_est

1638.6206896551723

In [None]:
# import library

import numpy as np

In [None]:
# Turn the -1 placeholders into NaN, fill every NaN with avg_est, and store
# the column as integers again (astype(int) drops the fractional part).

df['established'] = (
    df['established']
    .replace(-1, np.nan)
    .fillna(avg_est)
    .astype(int)
)

In [None]:
# Inspect the established column after imputation
df.loc[:, 'established']

0     1999
1     2002
2     1638
3     1988
4     2002
5     1999
6     1638
7     1638
8     1638
9     1638
10    2008
11    2009
12    1999
13    2019
14    2020
15    1999
16    1984
17    1943
18    1954
19    1955
20    1944
21    1946
22    1988
23    1999
24    1987
25    1980
26    1934
27    1935
28    1932
Name: established, dtype: int64

# *Location Column*

In [None]:
# List the distinct location spellings to see which separators are in use
pd.unique(df['location'])

array(['India,In', 'New York,Ny', 'India In', 'Australia Aus'],
      dtype=object)

In [None]:
# Some rows separate the place name from its code with a space ("India In"),
# others with a comma ("India,In"). Normalise everything to the comma form.
#
# Only the whitespace directly before a final, comma-free token is replaced,
# so values that already contain a comma before the last token (such as
# "New York,Ny") are left untouched. This avoids replacing every space and
# then special-casing "New," back, which only works for this exact data.

df['location'] = df['location'].str.replace(r'\s+(?=[^\s,]+$)', ',', regex=True)

In [None]:
# Break each "name,code" location at the comma and store the two parts in
# the new country and country_ISO_code columns.

df[['country','country_ISO_code']] = df['location'].str.split(pat=",", expand=True)
df

Unnamed: 0,index,age,rating,location,established,easy_apply,starting_salary,ending_salary,country,country_ISO_code
0,0,44,5.4,"India,In",1999,True,44000,99000,India,In
1,1,66,3.5,"New York,Ny",2002,True,55000,66000,New York,Ny
2,2,39,1.0,"New York,Ny",1638,False,77000,89000,New York,Ny
3,3,64,4.4,"India,In",1988,False,44000,99000,India,In
4,4,25,6.4,"Australia,Aus",2002,False,44000,99000,Australia,Aus
5,5,44,1.4,"India,In",1999,True,77000,89000,India,In
6,6,21,0.0,"New York,Ny",1638,False,44000,99000,New York,Ny
7,7,44,1.0,"Australia,Aus",1638,False,44000,99000,Australia,Aus
8,8,35,5.4,"New York,Ny",1638,False,44000,99000,New York,Ny
9,9,22,7.7,"India,In",1638,True,44000,99000,India,In


In [None]:
# The location column has been split into two separate columns, so the
# original combined column is no longer needed.

df = df.drop(['location'], axis="columns")

In [None]:
# The DataFrame already carries its own default row numbering, so the
# explicit index column is redundant and can be removed.

df = df.drop(['index'], axis="columns")

# **Final output**

In [None]:
# Display the DataFrame after all cleaning steps above
df

Unnamed: 0,age,rating,established,easy_apply,starting_salary,ending_salary,country,country_ISO_code
0,44,5.4,1999,True,44000,99000,India,In
1,66,3.5,2002,True,55000,66000,New York,Ny
2,39,1.0,1638,False,77000,89000,New York,Ny
3,64,4.4,1988,False,44000,99000,India,In
4,25,6.4,2002,False,44000,99000,Australia,Aus
5,44,1.4,1999,True,77000,89000,India,In
6,21,0.0,1638,False,44000,99000,New York,Ny
7,44,1.0,1638,False,44000,99000,Australia,Aus
8,35,5.4,1638,False,44000,99000,New York,Ny
9,22,7.7,1638,True,44000,99000,India,In
