In [1]:
import pandas as pd

# Read the CSV at ../data/raw/jobs_raw.csv into `df` and print how many
# missing values each column contains.
RAW_JOBS_CSV = '../data/raw/jobs_raw.csv'
df = pd.read_csv(RAW_JOBS_CSV)
missing_per_column = df.isna().sum()
print(missing_per_column)


job_id                             0
company_name                    1719
title                              0
description                        7
max_salary                     94056
pay_period                     87776
location                           0
company_id                      1717
views                           1689
med_salary                    117569
min_salary                     94056
formatted_work_type                0
applies                       100529
original_listed_time               0
remote_allowed                108603
job_posting_url                    0
application_url                36665
application_type                   0
expiry                             0
closed_time                   122776
formatted_experience_level     29409
skills_desc                   121410
listed_time                        0
posting_domain                 39968
sponsored                          0
work_type                          0
currency                       87776
c

In [8]:
# Median-impute each column listed in `numeric_cols` and report how many
# nulls remain in that column afterwards.
numeric_cols = ['min_salary', 'max_salary', 'views', 'applies']

for column in numeric_cols:
    # Series.median() skips NaN by default, so the fill value is the median
    # of the values currently present in the column.
    filled = df[column].fillna(df[column].median())
    # Assign the result back rather than mutating the column in place.
    df[column] = filled
    print(f"After filling, {column} nulls left:", filled.isna().sum())


After filling, min_salary nulls left: 0
After filling, max_salary nulls left: 0
After filling, views nulls left: 0
After filling, applies nulls left: 0


In [2]:
# Discard every row that has a null in any of the required columns, then
# print the per-column null counts of what is left.
required_cols = ['listed_time', 'title', 'description']
df = df.dropna(subset=required_cols, how='any')
print(df.isna().sum())


job_id                             0
company_name                    1718
title                              0
description                        0
max_salary                     94050
pay_period                     87770
location                           0
company_id                      1716
views                           1689
med_salary                    117562
min_salary                     94050
formatted_work_type                0
applies                       100524
original_listed_time               0
remote_allowed                108599
job_posting_url                    0
application_url                36658
application_type                   0
expiry                             0
closed_time                   122771
formatted_experience_level     29402
skills_desc                   121403
listed_time                        0
posting_domain                 39961
sponsored                          0
work_type                          0
currency                       87770
c

In [14]:
# Replace nulls in the columns below with the literal 'Unknown' and report
# the number of nulls remaining in each column.
unknown_fill_cols = ['posting_domain', 'compensation_type', 'currency']
for column in unknown_fill_cols:
    df[column] = df[column].fillna('Unknown')
    remaining = df[column].isna().sum()
    print(f"After filling, {column} nulls left:", remaining)
   



After filling, posting_domain nulls left: 0
After filling, compensation_type nulls left: 0
After filling, currency nulls left: 0


In [15]:
# Handle the columns that still contain nulls (placeholder, median and mode
# fills plus dropping two unused columns), then print a final per-column
# null count.

# 1. Fill pay_period with the placeholder label 'Unknown'
df['pay_period'] = df['pay_period'].fillna('Unknown')
print("pay_period nulls left:", df['pay_period'].isnull().sum())

# 2. Median fill for med_salary & normalized_salary
for col in ['med_salary', 'normalized_salary']:
    # median() ignores NaN, so this is the median of the observed values only.
    med = df[col].median()
    df[col] = df[col].fillna(med)
    print(f"{col} nulls left:", df[col].isnull().sum())

# 3. Drop columns we won't use.
# The result is assigned back instead of using inplace=True, matching the
# fillna assignments in this notebook. errors='ignore' skips columns that are
# already gone, so re-running this cell does not raise KeyError.
df = df.drop(columns=['closed_time', 'skills_desc'], errors='ignore')
print("Dropped closed_time & skills_desc")

# 4. Mode fill for experience level (mode()[0] is the first most-frequent value)
mode_exp = df['formatted_experience_level'].mode()[0]
df['formatted_experience_level'] = df['formatted_experience_level'].fillna(mode_exp)
print("formatted_experience_level nulls left:", df['formatted_experience_level'].isnull().sum())

# 5. Fill zip_code & fips with the placeholder label 'Unknown'
# NOTE(review): if these columns were parsed as numeric, filling with a string
# leaves them mixed-type - confirm the intended dtype.
df['zip_code'] = df['zip_code'].fillna('Unknown')
df['fips']     = df['fips'].fillna('Unknown')
print("zip_code nulls left:", df['zip_code'].isnull().sum())
print("fips nulls left:", df['fips'].isnull().sum())

# 6. Final null-check across every remaining column
print("\nFinal null counts:\n", df.isnull().sum())


pay_period nulls left: 0
med_salary nulls left: 0
normalized_salary nulls left: 0
Dropped closed_time & skills_desc
formatted_experience_level nulls left: 0
zip_code nulls left: 0
fips nulls left: 0

Final null counts:
 job_id                            0
company_name                      0
title                             0
description                       0
max_salary                        0
pay_period                        0
location                          0
company_id                      115
views                             0
med_salary                        0
min_salary                        0
formatted_work_type               0
applies                           0
original_listed_time              0
remote_allowed                79374
job_posting_url                   0
application_url                   0
application_type                  0
expiry                            0
formatted_experience_level        0
listed_time                       0
posting_domain          

In [16]:
# Impute missing remote_allowed values with the column's most frequent value.
# NOTE(review): mode() ignores nulls, and the final null check above reports
# 79374 nulls for this column - confirm that spreading the value seen in the
# non-null rows over all of those rows is intended.
remote_modes = df['remote_allowed'].mode()
df['remote_allowed'] = df['remote_allowed'].fillna(remote_modes[0])
print("remote_allowed nulls left:", df['remote_allowed'].isna().sum())


remote_allowed nulls left: 0


In [17]:
# Mark rows without a company_id using the sentinel value -1, then report
# how many nulls remain in the column.
MISSING_COMPANY_ID = -1
company_ids = df['company_id'].fillna(MISSING_COMPANY_ID)
df['company_id'] = company_ids
print("company_id nulls left:", company_ids.isna().sum())


company_id nulls left: 0


In [19]:
# Write the cleaned frame to CSV without the index column, then print its
# shape. The output path is relative to the notebook's working directory, like
# the '../data/raw/jobs_raw.csv' input path, instead of an absolute home
# directory path that exists on only one machine.
df.to_csv('../data/processed/jobs_clean.csv', index=False)

print("Cleaned data saved—shape:", df.shape)


Cleaned data saved—shape: (87184, 29)
