In [None]:
import pandas as pd
import numpy as np
import pickle
import os

In [None]:
from my_functions.functions_feature_selection import LoadConsumptionData, OccupationDataProcessor, PatternDataProcessor, Prioritization, MasterDFBuilder, FeatureAggregator

# Load data

## county information

In [None]:
# Load the county information object from a local pickle file.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only point this at trusted, project-generated pickles.
path = 'data/original_data/pkl/county_information.pkl'
with open(path, 'rb') as file:
    county_information = pickle.load(file)
# Sanity check: print the number of distinct values in the 'FIPS' column,
# then preview the first rows (presumably a pandas DataFrame -- confirm).
print(county_information['FIPS'].nunique())
county_information.head()

## tool consumption

In [None]:
# Build the loader for the tool-consumption results stored in the .pkl file below.
file_path = 'data/processed_data/pkl/tool_consumption_results.pkl'
consumption_data = LoadConsumptionData(file_path)  # loader object constructed from file_path

tool_consumption_industry = consumption_data.process_tool_consumption_industry()  # industry-level data, produced by a processing call
# NOTE(review): unlike the industry data above, this is a plain attribute read, not a
# processing call -- confirm the loader already populates/processes this attribute.
tool_consumption_occupation = consumption_data.tool_consumption_occupation  # occupation-level data as stored on the loader

In [None]:
# Preview the first rows of the industry-level tool consumption data.
tool_consumption_industry.head()

In [None]:
# Preview the first rows of the occupation-level tool consumption data.
tool_consumption_occupation.head()

## occupation data

In [None]:
# Build an OccupationDataProcessor from the cleaned-data pickle path and the
# occupation-level tool consumption data, then run its processing step.
#
# process_occupation_data() returns three objects that later cells depend on:
#   - occupation_data_final          -> input to Prioritization
#   - naics_emp_relevant_occupations -> passed to PatternDataProcessor
#   - original_occupation_df         -> passed to MasterDFBuilder

processor = OccupationDataProcessor("data/processed_data/pkl/df_cleaned.pickle", tool_consumption_occupation)
occupation_data_final, naics_emp_relevant_occupations, original_occupation_df  = processor.process_occupation_data()  # unpack the three results
occupation_data_final.head()  # preview the first rows of the processed occupation data

## NAICS data

In [None]:
# Build a PatternDataProcessor from the pattern-data pickle path, the industry-level
# tool consumption data, and the relevant-occupations object returned by the
# occupation processing cell, then run its processing step.
#
# NOTE(review): this rebinds `processor`, which previously referred to the
# OccupationDataProcessor instance; that instance is no longer reachable by name.

pattern_data_path = "data/processed_data/pkl/df_pattern_4d_filtered.pickle"
processor = PatternDataProcessor(pattern_data_path, tool_consumption_industry, naics_emp_relevant_occupations)
pattern_data_final, original_pattern_df = processor.process_pattern_data()  # original_pattern_df is passed to MasterDFBuilder later
pattern_data_final.head()  # preview the first rows of the processed pattern data

# Prioritization

## Occupation prioritization (SOC)

In [None]:
# Build a Prioritization object from the processed occupation and pattern data,
# rank the occupation data via rank_occupation_columns(), and preview the result.
# The weighting cell that follows refers to 'rank_*' columns (e.g. 'rank_emp_sum',
# 'rank_unique_FIPS') -- presumably these are created here; verify in Prioritization.

prioritization = Prioritization(occupation_data_final, pattern_data_final)
ranked_occupation_data = prioritization.rank_occupation_columns()
ranked_occupation_data.head()

In [None]:
# Weights applied to the occupation rank columns. The dict is passed together with the
# ranked occupation data to calculate_weighted_rank_sum(), and the first 25 rows of
# the result are displayed.
#
# Only employment (0.35) and FIPS coverage (0.3) carry weight; all five consumption
# ranks are set to 0.0, so the weights sum to 0.65.
# NOTE(review): confirm whether calculate_weighted_rank_sum normalizes the weights and
# whether its output is sorted -- head(25) only shows the first 25 rows as returned.

weight_dict = {
    'rank_emp_sum': 0.35,
    'rank_unique_FIPS': 0.3,
    'rank_consumption product 1': 0.0,
    'rank_consumption product 2': 0.0,
    'rank_consumption product 3': 0.0,
    'rank_consumption product 4': 0.0,
    'rank_consumption product 5': 0.0
}

ranked_occupation_data_weighted = prioritization.calculate_weighted_rank_sum(ranked_occupation_data, weight_dict)
ranked_occupation_data_weighted.head(25)

## Industry prioritization (NAICS)

In [None]:
# Rank the pattern (industry) data via rank_pattern_columns() and preview the result.
# A second Prioritization object is built here from the same two inputs as in the
# occupation section; it rebinds `prioritization`.
# NOTE(review): the re-instantiation looks redundant unless Prioritization keeps
# state between ranking calls -- confirm before removing.

prioritization = Prioritization(occupation_data_final, pattern_data_final)
ranked_pattern_data  = prioritization.rank_pattern_columns()
ranked_pattern_data.head()

In [None]:
# Weights applied to the pattern (industry) rank columns. The dict is passed together
# with the ranked pattern data to calculate_weighted_rank_sum(), and the first 25 rows
# of the result are displayed.
#
# Compared with the occupation weights this adds 'rank_est_sum' (0.1); the consumption
# ranks are again 0.0, so the weights sum to 0.75.
# NOTE(review): this overwrites the `weight_dict` defined for the occupation ranking.
# Re-running the occupation weighting call after this cell would use these weights
# instead -- keep execution order top-to-bottom or give the two dicts distinct names.
# NOTE(review): confirm whether the result is sorted; head(25) shows the first 25 rows as returned.

weight_dict = {
    'rank_emp_sum': 0.35,
    'rank_est_sum': 0.1,
    'rank_unique_FIPS': 0.3,
    'rank_consumption product 1': 0.0,
    'rank_consumption product 2': 0.0,
    'rank_consumption product 3': 0.0,
    'rank_consumption product 4': 0.0,
    'rank_consumption product 5': 0.0
}

ranked_pattern_data_weighted = prioritization.calculate_weighted_rank_sum(ranked_pattern_data, weight_dict)
ranked_pattern_data_weighted.head(25)

# Build master_df

In [None]:
# Create the master-dataframe builder from the unprocessed occupation and pattern
# frames (as returned by the two processors) plus the county information.
builder = MasterDFBuilder(original_occupation_df, original_pattern_df, county_information)

In [None]:
# Selection lists used to build the master dataframe.
#
# occ_top10 / occ_top10_20 are HARD-CODED lists of 10 OCC_CODE values each. They are
# not recomputed from the ranking in this notebook; the commented-out lines show how
# they would be derived from ranked_occupation_data_weighted (rows 0-9 and 10-19).
# NOTE(review): if the weights or input data change, these literals will not follow
# the new ranking -- confirm they are meant to be a frozen snapshot.
#
# naics_top6 (6 codes) and naics_top_metall (10 codes, described by the original author
# as metal-industry NAICS codes) are likewise specified by hand.
# All four lists are printed for inspection.

#occ_top10 = ranked_occupation_data_weighted['OCC_CODE'][:10].tolist()
occ_top10 = ['51-4121', '47-2221', '51-2041', '49-3021', '51-4041', '49-9041', '49-9071', '51-4081', '47-2211', '49-3031']
#occ_top10_20 = ranked_occupation_data_weighted['OCC_CODE'][10:20].tolist()
occ_top10_20 = ['51-4033', '49-3023', '47-2011', '51-4122', '51-9021', '51-4031', '49-3011', '51-4111', '51-9032', '49-9043']
naics_top6 = ['2382','8111','3320A2','3330A1','3327','3363']
naics_top_metall = ['3320A1','3335','3364','3362','3315','3366','2379','3336','3311','3314']
print(occ_top10)
print(occ_top10_20)
print(naics_top6)
print(naics_top_metall)

In [None]:
# Build the master dataframe from the four selection lists (occ_top10, occ_top10_20,
# naics_top6, naics_top_metall) and keep the returned object in `master_df`.
# `save_path` is the output location 'data/processed_data/pkl/master_df.pkl'
# (presumably build_master_df writes the result there as a pickle -- confirm in MasterDFBuilder).

master_df = builder.build_master_df(
    occ_top10=occ_top10,
    occ_top10_20=occ_top10_20,
    naics_top6=naics_top6,
    naics_top_metall=naics_top_metall,
    save_path='data/processed_data/pkl/master_df.pkl'  # output file path; change to write elsewhere
)
master_df

In [None]:
# Aggregation spec: maps each new feature name to a tuple of
# (list of master_df column names to combine, aggregation method); every entry uses 'sum'.
#
# Source columns are picked by POSITION in master_df.columns:
#   [11:21] -> 10 columns, matching the length of occ_top10_20
#   [21:33] -> 12 columns, split by name into 'total_emp_naics' / 'total_est_naics'
#              (matches 2 x 6 entries of naics_top6)
#   [33:53] -> 20 columns, split the same way (matches 2 x 10 entries of naics_top_metall)
# NOTE(review): these offsets assume the column order produced by MasterDFBuilder and
# the current list lengths; changing either silently shifts the slices. Selecting by
# column name would be more robust -- verify the layout before editing.
#
# FeatureAggregator is constructed with master_df and the path
# 'data/processed_data/pkl/final_df.pickle' (presumably the save location -- confirm),
# and aggregate_columns() returns the aggregated frame stored in `result_df`.

aggregate_columns = {
    'emp_top10_to_20': (master_df.columns[11:21].to_list(), 'sum'),
    'total_emp_naics_top6': ([col for col in master_df.columns[21:33] if 'total_emp_naics' in col], 'sum'),
    'total_est_naics_top6': ([col for col in master_df.columns[21:33] if 'total_est_naics' in col], 'sum'),
    'total_emp_naics_metal': ([col for col in master_df.columns[33:53] if 'total_emp_naics' in col], 'sum'),
    'total_est_naics_metal': ([col for col in master_df.columns[33:53] if 'total_est_naics' in col], 'sum'),
}

aggregator = FeatureAggregator(master_df, 'data/processed_data/pkl/final_df.pickle')
result_df = aggregator.aggregate_columns(aggregate_columns)
result_df