# Data Cleanup

In [38]:
# prepare environment
from datasets import load_dataset
from datetime import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the 'lukebarousse/data_jobs' dataset and convert its 'train' split into
# a pandas DataFrame. `df` is the raw, unmodified frame that the cells below
# read from.
# NOTE(review): load_dataset presumably fetches/caches the data on first use,
# so this cell likely needs network access on a fresh machine - confirm.
dataset = load_dataset('lukebarousse/data_jobs')
df = dataset['train'].to_pandas()

In [8]:
# Median of the yearly and hourly salary columns, kept as notebook-level
# variables so that later cells can reuse them (e.g. for imputation).
median_yearly_salary, median_hourly_salary = (
    df[column].median() for column in ('salary_year_avg', 'salary_hour_avg')
)
print(
    f"Median Yearly Salary: {median_yearly_salary}",
    f"Median Hourly Salary: {median_hourly_salary}",
    sep="\n",
)

Median Yearly Salary: 115000.0
Median Hourly Salary: 45.97999954223633


In [19]:
# Impute missing salaries with the column medians computed earlier.
# `assign` works on a copy, so the raw `df` is left untouched.
df_filled = df.assign(
    salary_year_avg=df['salary_year_avg'].fillna(median_yearly_salary),
    salary_hour_avg=df['salary_hour_avg'].fillna(median_hourly_salary),
)
# Show only the two imputed columns as the cell output.
df_filled[['salary_year_avg', 'salary_hour_avg']]

Unnamed: 0,salary_year_avg,salary_hour_avg
0,115000.0,45.98
1,115000.0,45.98
2,115000.0,45.98
3,115000.0,45.98
4,115000.0,45.98
...,...,...
785736,115000.0,45.98
785737,115000.0,45.98
785738,115000.0,45.98
785739,115000.0,45.98


In [None]:
# Remove rows that are exact duplicates across every column, then report how
# many rows that eliminated relative to df_filled.
df_uniques = df_filled.drop_duplicates()
print(
    f"Lenght of original df: {len(df_filled)}",
    f"Lenght of drop duplicates df: {len(df_uniques)}",
    f"Number of duplicates: {len(df_filled) - len(df_uniques)}",
    sep="\n",
)


Lenght of original df: 785741
Lenght of drop duplicates df: 785640
Number of duplicates: 101


In [22]:
# Second de-duplication pass: treat rows sharing both job_title and
# company_name as the same posting (e.g. listed on several sites or at
# different times) and keep one row per pair.
# The counts below are measured against df_filled, so they also include any
# rows already removed from df_uniques before this cell ran.
posting_key = ['job_title', 'company_name']
df_uniques = df_uniques.drop_duplicates(subset=posting_key)
print(
    f"Lenght of original df: {len(df_filled)}",
    f"Lenght of drop duplicates df: {len(df_uniques)}",
    f"Number of duplicates: {len(df_filled) - len(df_uniques)}",
    sep="\n",
)

Lenght of original df: 785741
Lenght of drop duplicates df: 508042
Number of duplicates: 277699


In [32]:
# Keep only the postings that actually report a yearly salary.
# The filter runs on the raw `df`, so the "original" row count is taken from
# `df` as well: the before/after figures then describe the same frame and the
# cell no longer depends on `df_filled` having been created by another cell.
df_notna = df.dropna(subset=['salary_year_avg'])
print(f"Original rows: {len(df)}")
print(f"Rows after dropna: {len(df_notna)}")
print(f"Number of rows dropped: {len(df) - len(df_notna)}")


Original rows: 785741
Rows after dropna: 22003
Number of rows dropped: 763738


In [41]:
# Keep one row per distinct job_location value (drop_duplicates retains the
# first occurrence by default) and report how many rows that removes.
# This is a de-duplication step, not NaN removal, so the summary line is
# labelled "drop_duplicates" rather than "dropna".
unique_locations = df.drop_duplicates(subset=['job_location'])
print(f"Original rows: {len(df)}")
print(f"Rows after drop_duplicates: {len(unique_locations)}")
print(f"Number of rows dropped: {len(df) - len(unique_locations)}")

Original rows: 785741
Rows after dropna: 17218
Number of rows dropped: 768523


In [50]:
# Replace missing salary_rate values with the placeholder 'Unknown' and show
# the first ten entries before and after for comparison.
# NOTE(review): this rebinds `df_filled` to a frame derived from the raw `df`,
# so only salary_rate is filled here; the median-imputed salary columns from
# the earlier `df_filled` are not carried over. Confirm that is intended.
PREVIEW_ROWS = 10
df_filled = df.fillna({'salary_rate': 'Unknown'})
salary_rate_before = df['salary_rate'].head(PREVIEW_ROWS)
salary_rate_after = df_filled['salary_rate'].head(PREVIEW_ROWS)
(salary_rate_before, salary_rate_after)


(0    None
 1    None
 2    None
 3    None
 4    None
 5    None
 6    None
 7    None
 8    None
 9    None
 Name: salary_rate, dtype: object,
 0    Unknown
 1    Unknown
 2    Unknown
 3    Unknown
 4    Unknown
 5    Unknown
 6    Unknown
 7    Unknown
 8    Unknown
 9    Unknown
 Name: salary_rate, dtype: object)