In [1]:
# Importing Libraries
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt

# Load the job-postings dataset and turn its 'train' split into a pandas DataFrame.
dataset = load_dataset('lukebarousse/data_jobs')
train_split = dataset['train']
df = train_split.to_pandas()

# Parse the posting-date column into pandas datetimes so date operations work on it.
posted_dates = pd.to_datetime(df['job_posted_date'])
df['job_posted_date'] = posted_dates

In [4]:
# Median of the yearly-salary column; missing values are skipped (pandas' default, spelled out here).
median_salary_year = df.loc[:, 'salary_year_avg'].median(skipna=True)

In [5]:
# Median of the hourly-salary column; missing values are skipped (pandas' default, spelled out here).
median_salary_hour = df.loc[:, 'salary_hour_avg'].median(skipna=True)

# **Filling NaN values with each column's median**

In [10]:
# Keep the original data preserved: build the filled frame as a NEW DataFrame.
# A plain `df_filled = df` would only bind a second name to the same object, so
# filling through `df_filled` would also overwrite the NaNs in `df`.
# `DataFrame.assign` returns a new frame and leaves `df` untouched, which also
# makes this cell safe to re-run.
df_filled = df.assign(
    salary_year_avg=df['salary_year_avg'].fillna(median_salary_year),
    salary_hour_avg=df['salary_hour_avg'].fillna(median_salary_hour),
)

In [11]:
# Preview the filled salary columns for the rows labelled 1 through 5.
# `.loc` slices by label, so both endpoints are included.
preview_rows = slice(1, 5)
salary_columns = slice('salary_year_avg', 'salary_hour_avg')
df_filled.loc[preview_rows, salary_columns]

Unnamed: 0,salary_year_avg,salary_hour_avg
1,115000.0,45.98
2,115000.0,45.98
3,115000.0,45.98
4,115000.0,45.98
5,115000.0,45.98


# **Removing Duplicates**

In [13]:
# Drop rows that are exact duplicates across every column.
# `drop_duplicates()` returns a new DataFrame and does not modify `df_filled`,
# so a single assignment is all that is needed; calling it without keeping the
# result only repeats the full scan for nothing.
df_unique = df_filled.drop_duplicates()

# **Checking Data Length after each filtering**

In [17]:
# Report the row count after each stage, plus how many rows the de-duplication removed.
row_counts = [
    ('Length of Original Data:             ', len(df)),
    ('Length of Original Filled Data:      ', len(df_filled)),
    ('Length of Dropped Duplicate Data:    ', len(df_unique)),
    ('Number of Rows Dropped:              ', len(df)-len(df_unique)),
]
for label, count in row_counts:
    print(label, count)

Length of Original Data:              785741
Length of Original Filled Data:       785741
Length of Dropped Duplicate Data:     785640
Number of Rows Dropped:               101


# Dropping more duplicates by comparing `job_title` and `company_name`. This detects postings of the same position that may have been published on multiple platforms in our data.

In [18]:
# Treat rows sharing both the job title and the company name as the same posting;
# the first occurrence of each pair is kept (drop_duplicates' default).
dedupe_keys = ['job_title', 'company_name']
df_unique = df_unique.drop_duplicates(subset=dedupe_keys)

In [22]:
# Report how many rows remain after the title/company de-duplication and how many were removed in total.
row_counts = [
    ('Length of Original Data:                     ', len(df)),
    ('Length of Further Dropped Duplicates Data:   ', len(df_unique)),
    ('Rows Dropped:                                ', len(df)-len(df_unique)),
]
for label, count in row_counts:
    print(label, count)

Length of Original Data:                      785741
Length of Further Dropped Duplicates Data:    508042
Rows Dropped:                                 277699
