In [1]:
import numpy as np
import pandas as pd


In [2]:
df = pd.read_csv("Data-cleaning-for-beginners-using-pandas.csv")

In [3]:
df.head()

Unnamed: 0,Index,Age,Salary,Rating,Location,Established,Easy Apply
0,0,44.0,$44k-$99k,5.4,"India,In",1999,TRUE
1,1,66.0,$55k-$66k,3.5,"New York,Ny",2002,TRUE
2,2,,$77k-$89k,-1.0,"New York,Ny",-1,-1
3,3,64.0,$44k-$99k,4.4,India In,1988,-1
4,4,25.0,$44k-$99k,6.4,Australia Aus,2002,-1


# 1. Missing Values:
Question: Are there any missing values in the dataset, and if so, how should they be handled for each indicator? 



In [4]:
df.isnull().sum()

Index          0
Age            7
Salary         0
Rating         1
Location       0
Established    0
Easy Apply     0
dtype: int64

In [5]:
# Age column has 7 missing values so we can replace these by their mean.

df["Age"].mean()

df["Age"] = df.Age.fillna(df["Age"].mean())

In [6]:
df.head()

Unnamed: 0,Index,Age,Salary,Rating,Location,Established,Easy Apply
0,0,44.0,$44k-$99k,5.4,"India,In",1999,TRUE
1,1,66.0,$55k-$66k,3.5,"New York,Ny",2002,TRUE
2,2,39.045455,$77k-$89k,-1.0,"New York,Ny",-1,-1
3,3,64.0,$44k-$99k,4.4,India In,1988,-1
4,4,25.0,$44k-$99k,6.4,Australia Aus,2002,-1


### 

In [7]:
# now "Ratings" Column has  1 missing value so we can replace it by 0 as 
#  this is only one numeric value
df["Rating"] = df["Rating"].fillna(0)
df.head()

Unnamed: 0,Index,Age,Salary,Rating,Location,Established,Easy Apply
0,0,44.0,$44k-$99k,5.4,"India,In",1999,TRUE
1,1,66.0,$55k-$66k,3.5,"New York,Ny",2002,TRUE
2,2,39.045455,$77k-$89k,-1.0,"New York,Ny",-1,-1
3,3,64.0,$44k-$99k,4.4,India In,1988,-1
4,4,25.0,$44k-$99k,6.4,Australia Aus,2002,-1


In [8]:
df.isnull().sum()

Index          0
Age            0
Salary         0
Rating         0
Location       0
Established    0
Easy Apply     0
dtype: int64

## 2. Data Types:
Question: What are the data types of each indicator, and do they align with their expected types (e.g., numerical, categorical)?


In [9]:
df.dtypes

Index            int64
Age            float64
Salary          object
Rating         float64
Location        object
Established      int64
Easy Apply      object
dtype: object

In [10]:
#  so we have to remove $ and k from it first.

df["Salary"] = df["Salary"].astype(str)
df["Salary"]= df["Salary"].replace("$"," ")
df["Salary"]= df["Salary"].replace("𝑘"," ")

df.head()

Unnamed: 0,Index,Age,Salary,Rating,Location,Established,Easy Apply
0,0,44.0,$44k-$99k,5.4,"India,In",1999,TRUE
1,1,66.0,$55k-$66k,3.5,"New York,Ny",2002,TRUE
2,2,39.045455,$77k-$89k,-1.0,"New York,Ny",-1,-1
3,3,64.0,$44k-$99k,4.4,India In,1988,-1
4,4,25.0,$44k-$99k,6.4,Australia Aus,2002,-1


# 3. Outliers:
Question: Identify potential outliers in numerical indicators (e.g., Age, Salary, Rating). Should outliers be removed or adjusted?


In [11]:
# as we have few negative ratings  so we replace these by positive numbers
df = df.astype({"Age": 'str', "Rating": 'str'}) 
df["Rating"] = df["Rating"].str.lstrip("-")
df.head()


Unnamed: 0,Index,Age,Salary,Rating,Location,Established,Easy Apply
0,0,44.0,$44k-$99k,5.4,"India,In",1999,TRUE
1,1,66.0,$55k-$66k,3.5,"New York,Ny",2002,TRUE
2,2,39.04545454545455,$77k-$89k,1.0,"New York,Ny",-1,-1
3,3,64.0,$44k-$99k,4.4,India In,1988,-1
4,4,25.0,$44k-$99k,6.4,Australia Aus,2002,-1


# 6. Established Column:
Question: Explore the Established column. Are there any inconsistencies or anomalies that need to be addressed?


In [12]:
#  as the established column can't haev values in ngeative so we replace it 
# by " not available"
df = df.astype({"Established": 'str'}) 

df["Established"] = df["Established"].str.lstrip("-")
df["Established"] = df["Established"].replace("1","-")
df.head()

Unnamed: 0,Index,Age,Salary,Rating,Location,Established,Easy Apply
0,0,44.0,$44k-$99k,5.4,"India,In",1999,TRUE
1,1,66.0,$55k-$66k,3.5,"New York,Ny",2002,TRUE
2,2,39.04545454545455,$77k-$89k,1.0,"New York,Ny",-,-1
3,3,64.0,$44k-$99k,4.4,India In,1988,-1
4,4,25.0,$44k-$99k,6.4,Australia Aus,2002,-1


# 7. Easy Apply Indicator:
Question: Analyze the Easy Apply column. Does it contain boolean values or need transformation for better analysis?


In [13]:
df = df.astype({"Easy Apply": 'str'}) 
df["Easy Apply"] = df["Easy Apply"].replace("-1"," False")
df.head()

Unnamed: 0,Index,Age,Salary,Rating,Location,Established,Easy Apply
0,0,44.0,$44k-$99k,5.4,"India,In",1999,True
1,1,66.0,$55k-$66k,3.5,"New York,Ny",2002,True
2,2,39.04545454545455,$77k-$89k,1.0,"New York,Ny",-,False
3,3,64.0,$44k-$99k,4.4,India In,1988,False
4,4,25.0,$44k-$99k,6.4,Australia Aus,2002,False


# 5. Location Standardization:
Question: Check the consistency of location entries. Do they need standardization, and how can this be achieved?


In [14]:
df.Location.unique()
#  so we have a outlier as 'India In', so we have to replace it

array(['India,In', 'New York,Ny', 'India In', 'Australia Aus'],
      dtype=object)

In [15]:
df["Location"]= df["Location"].str.rstrip("In")
df["Location"]= df["Location"].str.rstrip("Aus")
df["Location"]= df["Location"].str.rstrip("Ny,")
df["Location"]= df["Location"].str.rstrip(" ")

df.head()
df.Location.unique()



array(['India', 'New York', 'Australia'], dtype=object)

In [16]:
# now replacing New York with USA as we have country name only
df["Location"]= df["Location"].replace("New York","USA")
df.head()


Unnamed: 0,Index,Age,Salary,Rating,Location,Established,Easy Apply
0,0,44.0,$44k-$99k,5.4,India,1999,True
1,1,66.0,$55k-$66k,3.5,USA,2002,True
2,2,39.04545454545455,$77k-$89k,1.0,USA,-,False
3,3,64.0,$44k-$99k,4.4,India,1988,False
4,4,25.0,$44k-$99k,6.4,Australia,2002,False


# 4. Salary Formatting:
Question: Examine the format of the Salary column. Does it require any formatting or standardization for consistent analysis?


In [17]:
# we have done salary formating already

# 8. Rating Range:
Question: Investigate the range of values in the Rating column. Does it fall within expected rating scales, and how should outliers be treated?


In [18]:
df["Rating"].unique()

array(['5.4', '3.5', '1.0', '4.4', '6.4', '1.4', '0.0', '7.7', '6.7',
       '4.0', '3.0', '4.5', '5.3', '3.3', '5.7', '5.0', '7.8', '2.4',
       '3.4'], dtype=object)

In [19]:
# I think "Rating " column do not have any outliers now as i assume ratings 
# re out of 10.

# 9. Age Distribution:
Question: Check the distribution of values in the Age column. Are there any unusual entries, and how might they impact analysis?



In [20]:
df["Age"].unique()

array(['44.0', '66.0', '39.04545454545455', '64.0', '25.0', '21.0',
       '35.0', '22.0', '55.0', '19.0', '32.0', '13.0', '52.0'],
      dtype=object)

In [21]:
# we have to optimise 39.04545454545455 value in age column
df["Age"]=df.Age.astype(float).round()
df.head()

Unnamed: 0,Index,Age,Salary,Rating,Location,Established,Easy Apply
0,0,44.0,$44k-$99k,5.4,India,1999,True
1,1,66.0,$55k-$66k,3.5,USA,2002,True
2,2,39.0,$77k-$89k,1.0,USA,-,False
3,3,64.0,$44k-$99k,4.4,India,1988,False
4,4,25.0,$44k-$99k,6.4,Australia,2002,False



## 10. Handling Special Characters:
Question: Examine all text-based columns (e.g., Location). Are there special characters or inconsistencies that need cleaning?


In [22]:
df.head()

# there are no special characters in dataset

Unnamed: 0,Index,Age,Salary,Rating,Location,Established,Easy Apply
0,0,44.0,$44k-$99k,5.4,India,1999,True
1,1,66.0,$55k-$66k,3.5,USA,2002,True
2,2,39.0,$77k-$89k,1.0,USA,-,False
3,3,64.0,$44k-$99k,4.4,India,1988,False
4,4,25.0,$44k-$99k,6.4,Australia,2002,False


## 11. Data Integrity:
Question: Ensure data integrity by cross-referencing entries. For instance, does the Established column align with the Age column?


In [23]:
#  done already


## 12. Easy Apply Transformation:
Question: If the Easy Apply column contains non-boolean values, how can it be transformed into a usable format?



In [24]:
df["Easy Apply"]

0       TRUE
1       TRUE
2      False
3      False
4      False
5       TRUE
6      False
7      False
8      False
9       TRUE
10      TRUE
11     False
12     False
13      TRUE
14      TRUE
15     False
16     False
17      TRUE
18      TRUE
19      TRUE
20      TRUE
21     False
22      TRUE
23      TRUE
24     False
25      TRUE
26      TRUE
27     False
28     False
Name: Easy Apply, dtype: object

In [25]:
# There are no non-boolean values in easy apply column

## 13. Location Accuracy:
Question: Assess the accuracy of location entries. Are there misspelled or ambiguous locations that require correction?



In [26]:
df["Location"].unique()
# There are no misspelled or ambiguous locations that require correction.

array(['India', 'USA', 'Australia'], dtype=object)

 ## 14. Handling Categorical Data:
Question: For categorical indicators, consider encoding or transforming them into a format suitable for analysis.


In [27]:
df.head()
# we have already formated categoricaal data

Unnamed: 0,Index,Age,Salary,Rating,Location,Established,Easy Apply
0,0,44.0,$44k-$99k,5.4,India,1999,True
1,1,66.0,$55k-$66k,3.5,USA,2002,True
2,2,39.0,$77k-$89k,1.0,USA,-,False
3,3,64.0,$44k-$99k,4.4,India,1988,False
4,4,25.0,$44k-$99k,6.4,Australia,2002,False


## 15. Consistent Rating Scale:
Question: Ensure a consistent rating scale in the Rating column. Should it be normalized or adjusted for uniform analysis?


In [28]:
# we have already done that

## Final data set 

In [29]:
df

Unnamed: 0,Index,Age,Salary,Rating,Location,Established,Easy Apply
0,0,44.0,$44k-$99k,5.4,India,1999,True
1,1,66.0,$55k-$66k,3.5,USA,2002,True
2,2,39.0,$77k-$89k,1.0,USA,-,False
3,3,64.0,$44k-$99k,4.4,India,1988,False
4,4,25.0,$44k-$99k,6.4,Australia,2002,False
5,5,44.0,$77k-$89k,1.4,India,1999,True
6,6,21.0,$44k-$99k,0.0,USA,-,False
7,7,44.0,$44k-$99k,1.0,Australia,-,False
8,8,35.0,$44k-$99k,5.4,USA,-,False
9,9,22.0,$44k-$99k,7.7,India,-,True
