In [80]:
#Import required libraries
from os import getenv
from dotenv import load_dotenv
import pandas as pd
import numpy as np

In [81]:
# Load environment variables from the .env file into the process environment
load_dotenv()

# DATA_INTERIM holds the path of the interim CSV file to read
load_interim_data = getenv("DATA_INTERIM")

# Fail early with a clear message when the variable is missing *or* empty
# (an empty value would otherwise surface as an unrelated file error in read_csv)
if not load_interim_data:
    raise ValueError("The environment variable 'DATA_INTERIM' is not set or is empty.")

# Read the interim data and preview the first rows
layoffs_df = pd.read_csv(load_interim_data)
layoffs_df.head()


Unnamed: 0,company,location,total_laid_off,date,percentage_laid_off,industry,source,stage,funds_raised,country,date_added
0,CrowdStrike,SF Bay Area,500.0,5/7/2025,5%,Security,https://www.wsj.com/business/crowdstrike-to-cu...,Post-IPO,$1200,United States,5/7/2025
1,GenWise,"New Delhi,Non-U.S.",15.0,5/5/2025,20%,Other,https://entrackr.com/exclusive/exclusive-z47-b...,Seed,$3,India,5/5/2025
2,Deep Instinct,"Tel Aviv,Non-U.S.",20.0,5/4/2025,10%,Security,https://www.calcalistech.com/ctechnews/article...,Private Equity,$322,Israel,5/5/2025
3,SambaNova,SF Bay Area,77.0,4/25/2025,15%,AI,https://www.eetimes.com/sambanova-lays-off-15-...,Series D,$1100,United States,5/3/2025
4,Intel,Sacramento,22000.0,4/23/2025,20%,Hardware,https://www.bloomberg.com/news/articles/2025-0...,Post-IPO,$12,United States,4/23/2025


In [82]:

# Clean the interim layoffs table column by column and cast each field to its final dtype.
# NOTE: this cell rewrites columns of `layoffs_df` in place and relies on the string columns
# produced by the load cell; re-run the load cell first before executing it a second time.

# Strip the ',Non-U.S.' suffix from 'location' and cast 'total_laid_off' to int
layoffs_df['location'] = layoffs_df['location'].str.replace(',Non-U.S.', '', regex=False)
layoffs_df['total_laid_off'] = layoffs_df['total_laid_off'].astype(int)

# Parse 'date' explicitly as month/day/year (e.g. '4/25/2025') instead of relying on
# format inference, then extract day, month, and year into their own columns
layoffs_df['date'] = pd.to_datetime(layoffs_df['date'], format='%m/%d/%Y')
layoffs_df['LayOffDay'] = layoffs_df['date'].dt.day
layoffs_df['LayOffMonth'] = layoffs_df['date'].dt.month
layoffs_df['LayOffYear'] = layoffs_df['date'].dt.year

# Convert 'percentage_laid_off' from a percent string ('5%') to a fraction (0.05)
layoffs_df['percentage_laid_off'] = layoffs_df['percentage_laid_off'].str.replace('%', '', regex=False)
layoffs_df['percentage_laid_off'] = layoffs_df['percentage_laid_off'].astype(float) / 100

# Convert the descriptive columns to string type
# NOTE(review): astype(str) may turn missing values into the literal text 'nan'
# depending on the pandas version — confirm that is acceptable downstream.
text_columns = ['industry', 'company', 'source', 'stage', 'country']
layoffs_df[text_columns] = layoffs_df[text_columns].astype(str)

# Remove $, commas, and surrounding whitespace from 'funds_raised'
# (raw string: the same pattern as before, without the invalid '\$' escape in a normal literal)
layoffs_df['funds_raised'] = layoffs_df['funds_raised'].str.replace(r'[\$,]', '', regex=True).str.strip()

# Convert to float; pd.to_numeric uses its default errors='raise', so a value that
# cannot be parsed stops the cell instead of being silently coerced
layoffs_df['funds_raised'] = pd.to_numeric(layoffs_df['funds_raised']).astype(float)


In [83]:
# Preview the first five rows of the cleaned table
layoffs_df.head(n=5)

Unnamed: 0,company,location,total_laid_off,date,percentage_laid_off,industry,source,stage,funds_raised,country,date_added,LayOffDay,LayOffMonth,LayOffYear
0,CrowdStrike,SF Bay Area,500,2025-05-07,0.05,Security,https://www.wsj.com/business/crowdstrike-to-cu...,Post-IPO,1200.0,United States,5/7/2025,7,5,2025
1,GenWise,New Delhi,15,2025-05-05,0.2,Other,https://entrackr.com/exclusive/exclusive-z47-b...,Seed,3.0,India,5/5/2025,5,5,2025
2,Deep Instinct,Tel Aviv,20,2025-05-04,0.1,Security,https://www.calcalistech.com/ctechnews/article...,Private Equity,322.0,Israel,5/5/2025,4,5,2025
3,SambaNova,SF Bay Area,77,2025-04-25,0.15,AI,https://www.eetimes.com/sambanova-lays-off-15-...,Series D,1100.0,United States,5/3/2025,25,4,2025
4,Intel,Sacramento,22000,2025-04-23,0.2,Hardware,https://www.bloomberg.com/news/articles/2025-0...,Post-IPO,12.0,United States,4/23/2025,23,4,2025


In [84]:
# Keep only the columns needed in the processed file, in their final order
# (columns not listed here, such as 'date' and 'date_added', are dropped; nothing is renamed)
header = [
    'company', 'industry', 'location', 'country',
    'LayOffDay', 'LayOffMonth', 'LayOffYear',
    'total_laid_off', 'percentage_laid_off', 'funds_raised',
    'stage', 'source',
]
layoffs_df = layoffs_df[header]

# DATA_PROCESSED holds the path of the processed CSV file to write
output_path = getenv("DATA_PROCESSED")

# Fail early with a clear message when the variable is missing *or* empty
if not output_path:
    raise ValueError("The environment variable 'DATA_PROCESSED' is not set or is empty.")

# Write the processed table without the DataFrame index
layoffs_df.to_csv(output_path, index=False)