In [2]:
import pandas as pd

# Load the two input CSV files.
machinelearning_df = pd.read_csv('./MachineLearning.csv')
predictions_df = pd.read_csv('./predictions_with_sentiment.csv')

# Drop the 'idML' column; nothing in the rest of this script reads it.
machinelearning_df = machinelearning_df.drop(columns=['idML'])

# Convert ref_date to datetime in both DataFrames *before* filtering, so the
# cutoff below is a true date comparison. Comparing the raw strings is
# lexicographic and would, for example, discard a value like '2019-01'
# because it sorts before '2019-01-01'.
machinelearning_df['ref_date'] = pd.to_datetime(machinelearning_df['ref_date'])
predictions_df['ref_date'] = pd.to_datetime(predictions_df['ref_date'])

# Remove all data before 2019 in machinelearning_df. Filtering last also
# means no column is assigned on a filtered slice afterwards, which avoids
# pandas' SettingWithCopyWarning.
machinelearning_df = machinelearning_df[
    machinelearning_df['ref_date'] >= pd.Timestamp('2019-01-01')
]

# Outer-join the two tables on their shared identifying columns, so rows
# that appear in only one of the inputs are still kept in the result.
merge_keys = ['ref_date', 'geo', 'noc_code', 'noc_desc', 'job_char']
merged_df = machinelearning_df.merge(predictions_df, how='outer', on=merge_keys)

# Sentiment score keyed by calendar year.
sentiment_mapping = {
    2019: 1.826354,
    2020: 1.826354,
    2021: 0.930538,
    2022: 0.0,
    2023: 0.0,
    2024: 0.568935,
}

# Carry the 2024 score forward unchanged to the future years 2025 and 2026.
future_years = range(2025, 2027)
sentiment_mapping.update(dict.fromkeys(future_years, sentiment_mapping[2024]))

# Year of each row's ref_date, computed once and reused for both columns below.
ref_year = merged_df['ref_date'].dt.year

# Map sentiment scores based on the year of the ref_date. Years that are not
# keys of sentiment_mapping yield NaN.
merged_df['sentiment_score'] = ref_year.map(sentiment_mapping)

# Keep predicted vacancies only for rows dated 2023 or later; every earlier
# row (and any row with a missing date) is blanked to NaN. Series.where is
# vectorized, replacing a row-by-row apply(axis=1) that is slow on large
# frames and does not return a Series when the frame is empty.
merged_df['predicted_total_vacancies'] = (
    merged_df['predicted_total_vacancies'].where(ref_year >= 2023)
)

# Remove the 'predicted_vacancies' column, then discard exact duplicate rows.
merged_df = merged_df.drop(columns=['predicted_vacancies']).drop_duplicates()

# Write the final table to CSV without the index column.
merged_df.to_csv('Merged_Data_Final.csv', index=False)

# Report completion to the user.
print("Merged data saved successfully to 'Merged_Data_Final.csv'")


Merged data saved successfully to 'Merged_Data_Final.csv'
