In [None]:
import numpy as np
import pandas as pd

LOADING DATASET

In [None]:
df=pd.read_csv("/content/ai_job_dataset.csv")

UNDERSTANDING DATA

In [None]:
df.shape

(15000, 19)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   job_id                  15000 non-null  object 
 1   job_title               15000 non-null  object 
 2   salary_usd              15000 non-null  int64  
 3   salary_currency         15000 non-null  object 
 4   experience_level        15000 non-null  object 
 5   employment_type         15000 non-null  object 
 6   company_location        15000 non-null  object 
 7   company_size            15000 non-null  object 
 8   employee_residence      15000 non-null  object 
 9   remote_ratio            15000 non-null  int64  
 10  required_skills         15000 non-null  object 
 11  education_required      15000 non-null  object 
 12  years_experience        15000 non-null  int64  
 13  industry                15000 non-null  object 
 14  posting_date            15000 non-null

In [None]:
df.describe(include='all')

Unnamed: 0,job_id,job_title,salary_usd,salary_currency,experience_level,employment_type,company_location,company_size,employee_residence,remote_ratio,required_skills,education_required,years_experience,industry,posting_date,application_deadline,job_description_length,benefits_score,company_name
count,15000,15000,15000.0,15000,15000,15000,15000,15000,15000,15000.0,15000,15000,15000.0,15000,15000,15000,15000.0,15000.0,15000
unique,15000,20,,3,4,4,20,3,20,,13663,4,,15,486,543,,,16
top,AI15000,Machine Learning Researcher,,USD,MI,FT,Germany,S,Sweden,,"Python, TensorFlow, PyTorch",Bachelor,,Retail,2024-07-05,2025-01-05,,,TechCorp Inc
freq,1,808,,11957,3781,3812,814,5007,790,,17,3789,,1063,51,47,,,980
mean,,,115348.965133,,,,,,,49.483333,,,6.2532,,,,1503.314733,7.504273,
std,,,60260.940438,,,,,,,40.812712,,,5.545768,,,,576.127083,1.45087,
min,,,32519.0,,,,,,,0.0,,,0.0,,,,500.0,5.0,
25%,,,70179.75,,,,,,,0.0,,,2.0,,,,1003.75,6.2,
50%,,,99705.0,,,,,,,50.0,,,5.0,,,,1512.0,7.5,
75%,,,146408.5,,,,,,,100.0,,,10.0,,,,2000.0,8.8,


In [None]:
df.isnull().sum()

Unnamed: 0,0
job_id,0
job_title,0
salary_usd,0
salary_currency,0
experience_level,0
employment_type,0
company_location,0
company_size,0
employee_residence,0
remote_ratio,0


DATA CLEANING

In [None]:
df['salary_currency'].unique()

array(['USD', 'EUR', 'GBP'], dtype=object)

In [None]:
#Convert All Salaries to USD Using Exchange Rates
exchange_rates = {
    'USD': 1.00,
    'EUR': 1.08,
    'GBP': 1.27
}
df['salary_usd_converted'] = df.apply(
    lambda x: round(x['salary_usd'] * exchange_rates.get(x['salary_currency'], 1.00)), axis=1
)

df['salary_usd'] = df['salary_usd_converted']
df.drop(columns=['salary_usd_converted', 'salary_currency'], inplace=True)

In [None]:
#converting the data types to numeric
df['salary_usd'] = pd.to_numeric(df['salary_usd'], errors='coerce')
df['years_experience'] = pd.to_numeric(df['years_experience'], errors='coerce')
df['benefits_score'] = pd.to_numeric(df['benefits_score'], errors='coerce')

In [None]:
#removing outliers
salary_cap = df['salary_usd'].quantile(0.99)
df = df[(df['salary_usd'] > 10000) & (df['salary_usd'] < salary_cap)]
df = df[df['years_experience'] <= 40]

In [None]:
#Clean Text Columns
df['job_title'] = df['job_title'].astype(str).str.strip().str.title()
df['company_name'] = df['company_name'].astype(str).str.strip().str.upper()
df['industry'] = df['industry'].astype(str).str.strip().str.title()
df['education_required'] = df['education_required'].astype(str).str.strip().str.title()

In [None]:
df['experience_level'].unique()

array(['SE', 'EN', 'MI', 'EX'], dtype=object)

In [None]:
experience_mapping = {
    'EN': 'Entry-level',
    'MI': 'Mid-level',
    'SE': 'Senior-level',
    'EX': 'Executive-level'
}

df['experience_level'] = df['experience_level'].map(experience_mapping).fillna('Unknown')

In [None]:
size_mapping = {
    'S': 'Small',
    'M': 'Medium',
    'L': 'Large'
}

df['company_size'] = df['company_size'].map(size_mapping).fillna('Unknown')

In [None]:
df['experience_level'] = df['experience_level'].str.strip().str.upper()
df['company_size'] = df['company_size'].str.strip().str.upper()

In [None]:
df['employment_type'] = df['employment_type'].astype(str).str.strip().str.upper()
# Define the mapping
employment_mapping = {
    'FT': 'Full-time',
    'PT': 'Part-time',
    'CT': 'Contract',
    'FL': 'Freelance'
}
df['employment_type'] = df['employment_type'].map(employment_mapping).fillna('Unknown')

In [None]:
df['posting_date'] = pd.to_datetime(df['posting_date'], errors='coerce')
df['application_deadline'] = pd.to_datetime(df['application_deadline'], errors='coerce')

In [None]:
df['days_to_apply'] = (df['application_deadline'] - df['posting_date']).dt.days

In [None]:
df['required_skills'] = df['required_skills'].astype(str).fillna('')
df['num_skills'] = df['required_skills'].apply(lambda x: len(x.split(',')) if x.strip() else 0)

In [None]:
df['benefits_score'] = df['benefits_score'].fillna(df['benefits_score'].mean())
df.fillna('Unknown', inplace=True)

FINAL CLEANED DATA

In [None]:
df.head()

Unnamed: 0,job_id,job_title,salary_usd,experience_level,employment_type,company_location,company_size,employee_residence,remote_ratio,required_skills,education_required,years_experience,industry,posting_date,application_deadline,job_description_length,benefits_score,company_name,days_to_apply,num_skills
0,AI00001,Ai Research Scientist,90376,SENIOR-LEVEL,Contract,China,MEDIUM,China,50,"Tableau, PyTorch, Kubernetes, Linux, NLP",Bachelor,9,Automotive,2024-10-18,2024-11-07,1076,5.9,SMART ANALYTICS,20,5
1,AI00002,Ai Software Engineer,61895,ENTRY-LEVEL,Contract,Canada,MEDIUM,Ireland,100,"Deep Learning, AWS, Mathematics, Python, Docker",Master,1,Media,2024-11-20,2025-01-11,1268,5.2,TECHCORP INC,52,5
2,AI00003,Ai Specialist,152626,MID-LEVEL,Freelance,Switzerland,LARGE,South Korea,0,"Kubernetes, Deep Learning, Java, Hadoop, NLP",Associate,2,Education,2025-03-18,2025-04-07,1974,9.4,AUTONOMOUS TECH,20,5
3,AI00004,Nlp Engineer,80215,SENIOR-LEVEL,Freelance,India,MEDIUM,India,50,"Scala, SQL, Linux, Python",Phd,7,Consulting,2024-12-23,2025-02-24,1345,8.6,FUTURE SYSTEMS,63,4
4,AI00005,Ai Consultant,58994,ENTRY-LEVEL,Part-time,France,SMALL,Singapore,100,"MLOps, Java, Tableau, Python",Master,0,Media,2025-04-15,2025-06-23,1989,6.6,ADVANCED ROBOTICS,69,4


EXPORTING CLEANED DATA

In [41]:
df.to_csv("ai_jobs_cleaned.csv", index=False)