In [None]:
import pandas as pd
import os

# Build the processed jobs table: join each posting's title/company with its
# skills string, turn that string into a list of skill names, and save the
# result under data/processed.


def _split_skills(raw):
    """Split a comma-separated skills string into a list of trimmed names.

    Blank entries (from an empty string, trailing commas or doubled commas)
    are dropped, so a posting with no skills yields ``[]`` rather than ``['']``.
    """
    return [name for name in (part.strip() for part in raw.split(',')) if name]


postings_path = os.path.join('..', 'data', 'raw', 'job_postings.csv')
skills_path = os.path.join('..', 'data', 'raw', 'job_skills.csv')

postings_df = pd.read_csv(postings_path)
skills_df = pd.read_csv(skills_path)

print("Files loaded successfully from 'data/raw'.")

# Keep only the posting columns needed downstream.
columns_to_keep = ['job_link', 'job_title', 'company']
postings_subset_df = postings_df[columns_to_keep]

# Inner join on 'job_link': only rows whose link appears in both files survive.
merged_df = pd.merge(postings_subset_df, skills_df, on='job_link', how='inner')
print(f"DataFrames merged. Total jobs: {len(merged_df)}")

# Clean the 'job_skills' column.
# Assign the result back instead of calling fillna(..., inplace=True) on the
# column selection: the chained in-place form raises a FutureWarning and will
# no longer modify merged_df in pandas 3.0.
merged_df['job_skills'] = merged_df['job_skills'].fillna('')
merged_df['skills_list'] = merged_df['job_skills'].apply(_split_skills)
print("Converted 'job_skills' string to a list.")

final_df = merged_df[['job_link', 'job_title', 'company', 'skills_list']]

print("\n--- Final Processed Data Sample ---")
display(final_df.head())

# Save to the 'processed' directory.
output_path = os.path.join('..', 'data', 'processed', 'jobs_processed.csv')
# Create the target directory first so the write does not fail on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# NOTE: CSV has no list type, so 'skills_list' is written as the text form of
# each list; code that reads this file back must parse that column itself.
final_df.to_csv(output_path, index=False, encoding='utf-8')

print(f"\nSuccessfully saved the final data to '{output_path}'")

Files loaded successfully from 'data/raw'.
DataFrames merged. Total jobs: 12217
Converted 'job_skills' string to a list.

--- Final Processed Data Sample (Lean Version) ---


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df['job_skills'].fillna('', inplace=True)


Unnamed: 0,job_link,job_title,company,skills_list
0,https://www.linkedin.com/jobs/view/senior-mach...,Senior Machine Learning Engineer,Jobs for Humanity,"[Machine Learning, Programming, Python, Scala,..."
1,https://www.linkedin.com/jobs/view/principal-s...,"Principal Software Engineer, ML Accelerators",Aurora,"[C++, Python, PyTorch, TensorFlow, MXNet, CUDA..."
2,https://www.linkedin.com/jobs/view/senior-etl-...,Senior ETL Data Warehouse Specialist,Adame Services LLC,"[ETL, Data Integration, Data Transformation, D..."
3,https://www.linkedin.com/jobs/view/senior-data...,Senior Data Warehouse Developer / Architect,Morph Enterprise,"[Data Lakes, Data Bricks, Azure Data Factory P..."
4,https://www.linkedin.com/jobs/view/lead-data-e...,Lead Data Engineer,Dice,"[Java, Scala, Python, RDBMS, NoSQL, Redshift, ..."



Successfully saved the final lean data to '..\data\processed\jobs_processed.csv'
