In [21]:
# Standard init
import pandas as pd
import numpy as np
import os

In [3]:
# Load the dataset
# The CSV location is relative to the notebook's working directory; edit it here if the file moves.
file_path = 'Excel-data/ca-st-ind-emp-2023-2025.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows so the column layout can be checked by eye
print("Initial Data Overview:")
display(data.head(n=5))

Initial Data Overview:


Unnamed: 0,Area Type,Area Name,Period,Series Code,Industry Title,Base Quarter Employment Estimate,Projected Quarter Employment Estimate,Numeric Change,Percentage Change
0,State,California,2023-2025,1,Total Employment,19920000,20412400,492400,2.5
1,State,California,2023-2025,6010,Self Employment,1319000,1344000,25000,1.9
2,State,California,2023-2025,8010,Private Household Workers,35100,32600,-2500,-7.1
3,State,California,2023-2025,11000000,Total Farm,472200,469600,-2600,-0.6
4,State,California,2023-2025,0,Total Nonfarm,18093700,18566200,472500,2.6


In [5]:
# Handle Missing Values
# Numeric columns (float64 / int64 only): every missing entry becomes 0.
# NOTE(review): 0 is also a genuine value in this data (the preview above shows
# series_code 0 for 'Total Nonfarm'), so a filled 0 cannot be told apart from a
# real one afterwards -- confirm that this is acceptable.
numeric_cols = data.select_dtypes(include=['float64', 'int64']).columns
numeric_filled = data[numeric_cols].fillna(value=0)
data[numeric_cols] = numeric_filled

# Text (object-dtype) columns: every missing entry becomes the literal 'Unknown'.
categorical_cols = data.select_dtypes(include=['object']).columns
categorical_filled = data[categorical_cols].fillna(value='Unknown')
data[categorical_cols] = categorical_filled

In [13]:
# Standardize column names: trim surrounding whitespace, lowercase,
# then swap each space for an underscore (plain-text replacement, not a regex).
trimmed_lower = data.columns.str.strip().str.lower()
data.columns = trimmed_lower.str.replace(' ', '_', regex=False)

# Ensure date/time dtype: only acts when a 'date' column exists;
# entries that cannot be parsed are coerced to NaT instead of raising.
if 'date' in data:
    data['date'] = pd.to_datetime(data['date'], errors='coerce')

# Progress check: preview the frame with its renamed columns.
# NOTE(review): the label below repeats the load cell's "Initial Data Overview:" text.
print("Initial Data Overview:")
display(data.head(n=5))

Initial Data Overview:


Unnamed: 0,area_type,area_name,period,series_code,industry_title,base_quarter_employment_estimate,projected_quarter_employment_estimate,numeric_change,percentage_change
0,State,California,2023-2025,1,Total Employment,19920000,20412400,492400,2.5
1,State,California,2023-2025,6010,Self Employment,1319000,1344000,25000,1.9
2,State,California,2023-2025,8010,Private Household Workers,35100,32600,-2500,-7.1
3,State,California,2023-2025,11000000,Total Farm,472200,469600,-2600,-0.6
4,State,California,2023-2025,0,Total Nonfarm,18093700,18566200,472500,2.6


In [19]:
# Convert specific columns to numeric (if needed)
# Example: an 'employment' column that holds mixed types
if 'employment' in data:
    # Values that cannot be parsed become NaN (errors='coerce') and are then replaced by 0
    employment_numeric = pd.to_numeric(data['employment'], errors='coerce')
    data['employment'] = employment_numeric.fillna(0)

# Remove duplicates: drop rows identical across all columns, keeping the first occurrence
data = data.drop_duplicates(keep='first')

In [23]:
# Save the Cleaned Data
output_folder = 'Excel-data'
output_filename = 'cleaned_data.csv'
output_path = os.path.join(output_folder, output_filename)
# Create the target folder when it is absent so to_csv does not raise on a missing directory
os.makedirs(output_folder, exist_ok=True)
# index=False keeps the row index out of the written file
data.to_csv(output_path, index=False)

# Display the cleaned data summary
print("Cleaned Data Overview:")
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would render a stray "None" under the summary.
data.info()
display(data.head())

Cleaned Data Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 266 entries, 0 to 265
Data columns (total 9 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   area_type                              266 non-null    object 
 1   area_name                              266 non-null    object 
 2   period                                 266 non-null    object 
 3   series_code                            266 non-null    int64  
 4   industry_title                         266 non-null    object 
 5   base_quarter_employment_estimate       266 non-null    int64  
 6   projected_quarter_employment_estimate  266 non-null    int64  
 7   numeric_change                         266 non-null    int64  
 8   percentage_change                      266 non-null    float64
dtypes: float64(1), int64(4), object(4)
memory usage: 18.8+ KB


None

Unnamed: 0,area_type,area_name,period,series_code,industry_title,base_quarter_employment_estimate,projected_quarter_employment_estimate,numeric_change,percentage_change
0,State,California,2023-2025,1,Total Employment,19920000,20412400,492400,2.5
1,State,California,2023-2025,6010,Self Employment,1319000,1344000,25000,1.9
2,State,California,2023-2025,8010,Private Household Workers,35100,32600,-2500,-7.1
3,State,California,2023-2025,11000000,Total Farm,472200,469600,-2600,-0.6
4,State,California,2023-2025,0,Total Nonfarm,18093700,18566200,472500,2.6
