# GDP Data

In [None]:
import pandas as pd
from IPython.display import display
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Load the pickled county and GDP tables and the industry-relevance ranking
# from the 'Pattern Industries' sheet of the Excel workbook.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files
# that come from a trusted source.
county = pd.read_pickle('../data/raw/county.pickle')
gdp = pd.read_pickle('../data/raw/gdp.pickle')
industry_relevance = pd.read_excel('../data/external/Ranking_Werkzeugverbrauch.xlsx', sheet_name='Pattern Industries')


In [None]:
# Calculate the average GDP over the years: row-wise mean of every column
# except the first one.
# NOTE(review): the positional slice assumes that all columns after the first
# are numeric year columns — confirm against the actual schema of `gdp`.
gdp['avg_gdp'] = gdp.iloc[:, 1:].mean(axis=1)

In [None]:
# Show the distinct NAICS codes of the relevance ranking ...
display(industry_relevance['naics'].unique())
# ... and the distinct industry classification codes of the GDP table,
# so both code sets can be compared by eye.
display(gdp['IndustryClassification'].unique())

In [None]:
# Drop rows whose industry classification is the '...' placeholder (it cannot be
# parsed into integer codes). The explicit copy makes `gdp` an independent frame,
# so the column assignments in later cells do not operate on a filtered slice
# and do not trigger pandas' SettingWithCopyWarning.
gdp = gdp[gdp['IndustryClassification'] != '...'].copy()


In [None]:
def parse_code_string(code_string):
    """
    Expand a comma-separated code string into a flat list of integers.

    Every comma-separated token is either a single number or an inclusive
    range written as 'start-end'.
    Example: '42,44-46' -> [42, 44, 45, 46]
    """
    codes = []
    for token in code_string.split(','):
        if '-' not in token:
            # Plain single code
            codes.append(int(token))
            continue
        # Inclusive range such as '44-46'
        low, high = map(int, token.split('-'))
        codes.extend(range(low, high + 1))
    return codes

In [None]:
# Expand every classification string (e.g. '42,44-46') into its list of integer codes.
gdp['IndustryClassificationList'] = gdp['IndustryClassification'].map(parse_code_string)

In [None]:
# Goal: for every row with top-level industries, add a new column holding the list of sub-industries that belong to them.
# What is needed: the top-level industries of the row and the list of all sub-categories.
def generate_sub_industries_from_top_industries(top_industries_list, sub_industries):
    """
    Collect the sub-industry codes that belong to the given top-level industries.

    A sub-industry belongs to a top-level industry when its code starts with
    the string form of the top-level code. The result is grouped by top-level
    industry in input order; a sub-industry matching several top-level codes
    appears once per match.
    """
    return [
        candidate
        for prefix in map(str, top_industries_list)
        for candidate in sub_industries
        if candidate.startswith(prefix)
    ]
    

In [None]:
# For every industry, take the mean of its sub-industry relevance, so that the
# final figure is an average over sub-industries rather than over top-level
# industries. First step: attach to every GDP row the sub-industry codes
# (missing codes dropped) that fall under its top-level industries.
sub_industries = industry_relevance['naics'].dropna()
gdp['subIndustryClassificationList'] = gdp['IndustryClassificationList'].apply(
    generate_sub_industries_from_top_industries, args=(sub_industries,)
)


In [None]:
def calc_relevance_out_of_sub_industries(sub_industries, sub_industry_info):
    """
    Return the mean 'Mittelwert' of the given sub-industries.

    Rows of `sub_industry_info` are selected by matching their 'naics' value
    against `sub_industries`. Missing 'Mittelwert' values count as 0. If no
    row matches, the result is 0.
    """
    is_match = sub_industry_info['naics'].isin(sub_industries)
    scores = sub_industry_info.loc[is_match, 'Mittelwert']
    if scores.empty:
        return 0
    return scores.fillna(0).mean()
    
    

In [None]:
# Score every GDP row with the mean relevance of its sub-industries.
gdp['IndustryRelevance'] = gdp['subIndustryClassificationList'].apply(
    calc_relevance_out_of_sub_industries, args=(industry_relevance,)
)

In [None]:
# fillna returns a new Series instead of modifying the column in place, so the
# result has to be assigned back; otherwise the statement has no effect.
gdp['IndustryRelevance'] = gdp['IndustryRelevance'].fillna(0)
gdp.head()


In [None]:
# Share of rows whose industry relevance is at or below the 0.5 threshold.
# The share is computed up front and the f-string is delimited with double
# quotes: reusing the outer quote character inside an f-string expression is a
# SyntaxError before Python 3.12.
irrelevant_share = (gdp['IndustryRelevance'] <= 0.5).mean()
print(f"{irrelevant_share:.2%} of rows have industries that are not relevant")

# Mean relevance per industry description, sorted ascending.
relevance_of_industries = gdp.groupby('Description')['IndustryRelevance'].mean().sort_values()

# -> 'Manufacturing and information' is left out because it is only Manufacturing
gdp_relevant_industries = relevance_of_industries[relevance_of_industries > 0.5].index
display(gdp_relevant_industries)


# Looking at the industries and their sub-categories, it seems most suitable to
# select the industries above 0.5 relevance and rate them as equal, because we do
# not know the relevance of the other sub-industries or how important they are.

# Keep only the rows of the relevant industries.
gdp = gdp[gdp['Description'].isin(gdp_relevant_industries)]
display(gdp['Description'].unique())

In [None]:
# Remove industries that are already captured by a different industry.
# Overview table: one row per industry classification with its sub-industries.
gdp_industries_and_their_sub = (
    gdp[['IndustryClassification', 'Description', 'subIndustryClassificationList']]
    .drop_duplicates('IndustryClassification')
)
gdp_industries_and_their_sub['nSubIndustries'] = (
    gdp_industries_and_their_sub['subIndustryClassificationList'].map(len)
)
display(gdp_industries_and_their_sub)

# The descriptions are matched exactly, including their leading/trailing spaces.
redundant_industries = ['   Durable goods manufacturing ', 'Manufacturing and information ', 'Natural resources and mining ']

gdp_industries_and_their_sub = gdp_industries_and_their_sub.loc[
    lambda frame: ~frame['Description'].isin(redundant_industries)
]
display(gdp_industries_and_their_sub)

# Apply the same filter to the GDP table itself.
gdp = gdp.loc[~gdp['Description'].isin(redundant_industries)]
display(gdp['Description'].unique())




In [None]:
# Inspect the available columns and the distinct values of the 'Unit' column.
print(gdp.columns)
print(gdp['Unit'].unique())

# Plan:
# calculate the development of GDP for each FIPS (with log)
# calculate the average development of GDP for each FIPS
# average GDP per FIPS

In [None]:
# Group the FIPS column by industry description.
# NOTE(review): this displays the GroupBy object itself, not an aggregate;
# presumably a count such as .nunique() was intended — confirm.
industires_per_fips = gdp.groupby(['Description'])['FIPS']
display(industires_per_fips)

In [None]:
years = ['2017', '2018', '2019', '2020', '2021', '2022']
columns_without_years = [item for item in gdp.columns if item not in years]

# Total GDP per county (FIPS) and industry for every year.
gdp_per_industry = gdp.groupby(['FIPS', 'Description'])[years].sum()
display(gdp_per_industry.describe())

# Treat a GDP of 0 as missing.
gdp_per_industry = gdp_per_industry.replace(0, np.nan)
# Share of (FIPS, industry) rows with a value for every year ...
display(gdp_per_industry.notna().all(axis=1).mean())
# ... and share of missing values per year.
display(gdp_per_industry.isna().mean())


# Year-over-year growth along the year columns. The first year has no
# predecessor, so its rate is NaN and it is left out of the growth average.
# NOTE(review): depending on the pandas version, pct_change may forward-fill
# missing years before computing the change — confirm this is intended.
rates = gdp_per_industry.pct_change(axis=1)
# Use a separate list instead of years.remove('2017'): mutating `years` made the
# GDP average below silently skip 2017.
growth_years = years[1:]
rates['averageGrowthPerYear'] = rates[growth_years].mean(axis=1)
gdp_per_industry['averageGrowthPerYear'] = rates['averageGrowthPerYear']
gdp_per_industry['averageGdp'] = gdp_per_industry[years].mean(axis=1)

# OK, in some FIPS none of these industries exist, so those industries are ignored there.
# What is still needed: a metric that tells how much of the total market of an
# industry is located in a given FIPS.
# display() is used because a bare expression that is not the last statement of
# a notebook cell is not shown.
display(gdp_per_industry)

def add_zeros(code):
    """
    Return `code` as a string, left-padded with zeros to five characters.

    Codes that already have five or more characters are returned unchanged.
    Unlike a per-length case distinction, this also pads two-character codes
    (e.g. 12 -> '00012').
    """
    return str(code).zfill(5)


In [None]:
# Turn the (FIPS, Description) index into regular columns and drop the year
# columns listed in `years`.
gdp_per_industry = gdp_per_industry.reset_index().drop(columns=years)


def _pivot_metric(long_df, metric):
    """Pivot `metric` to one row per FIPS and one '<Description>_<metric>' column per industry."""
    wide = long_df.pivot(index='FIPS', columns='Description', values=metric)
    wide.columns = [f"{col}_{metric}" for col in wide.columns]
    return wide


# One wide table per metric ...
pivoted_growth = _pivot_metric(gdp_per_industry, 'averageGrowthPerYear')
pivoted_gdp = _pivot_metric(gdp_per_industry, 'averageGdp')

# ... joined on FIPS; zeros are treated as missing values.
feature_df = pivoted_growth.join(pivoted_gdp, how='outer').replace(0, np.nan)


In [None]:
# Show the assembled feature table.
display(feature_df)

In [None]:
# Move FIPS from the index back into a column, zero-pad the codes with
# add_zeros, show the result and persist it as a pickle.
feature_df = feature_df.reset_index()
feature_df['FIPS'] = feature_df['FIPS'].map(add_zeros)
display(feature_df)
feature_df.to_pickle('../data/processed/gdp.pickle')

# Scaling

In [None]:
# Drop 'FIPS' column so that only the feature columns remain
feature_df_no_fips = feature_df.drop(columns='FIPS')

plt.figure(figsize=(12, 4))

# One vertical strip of points per feature; missing values are left out
for col in feature_df_no_fips.columns:
    non_nan_data = feature_df_no_fips[col].dropna()
    # The feature name is used as a categorical x position; fixed marker size
    plt.scatter([col] * len(non_nan_data), non_nan_data, s=50, alpha=0.6, label=col)

plt.xlabel("Features")
# The plotted columns are GDP averages and growth rates, not establishment
# counts, so the axis gets a neutral label.
plt.ylabel("Values")
plt.title("Scatterplot of Features")
plt.legend(title="Feature Legend")
# The feature names are shown in the legend; hide the crowded x axis
plt.gca().get_xaxis().set_visible(False)
plt.show()


In [None]:
# Drop the 'FIPS' column
feature_df_no_fips = feature_df.drop(columns='FIPS')

# Number of columns (features)
n_features = len(feature_df_no_fips.columns)

# One subplot row per feature. squeeze=False makes subplots always return a
# 2-D array of Axes, so the code also works when there is only one feature
# (otherwise a bare Axes object is returned and indexing it fails).
fig, axes = plt.subplots(n_features, 1, figsize=(7, 4 * n_features), squeeze=False)
axes = axes.ravel()

# Create a histogram for each feature (missing values are left out)
for ax, col in zip(axes, feature_df_no_fips.columns):
    ax.hist(feature_df_no_fips[col].dropna(), bins=40, alpha=0.7, color='skyblue')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    ax.set_title(f'Histogram of {col}')
    ax.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Drop the 'FIPS' column
feature_df_no_fips = feature_df.drop(columns='FIPS')

# Linear Standardization (Min-Max Scaling)
# The scaled result is later indexed like a NumPy array (data[:, i]).
scaler_minmax = MinMaxScaler()
feature_df_minmax = scaler_minmax.fit_transform(feature_df_no_fips)

# Z-transformation (Standardization)
scaler_standard = StandardScaler()
feature_df_zscore = scaler_standard.fit_transform(feature_df_no_fips)

# Log10 Scaling (with a small constant to avoid log(0))
# NOTE(review): the growth-rate columns can presumably be negative; log10 of a
# negative value is NaN, so those points disappear from the plots — confirm
# that this is acceptable.
feature_df_log10 = np.log10(feature_df_no_fips + 1e-9)

# Scatter helper shared by all panels of the comparison grid
def plot_features(data, title, ax):
    """
    Draw one vertical strip of points per feature on `ax`.

    `data` is indexed as a 2-D array whose columns are read in the order of
    `feature_df_no_fips.columns` (a module-level name); NaN entries are
    skipped. The axes get fixed x/y labels and the given `title`.
    """
    for position, feature in enumerate(feature_df_no_fips.columns):
        column = data[:, position]
        present = column[~np.isnan(column)]  # keep only non-NaN values
        ax.scatter([feature] * len(present), present, s=20, alpha=0.6, label=feature)
    ax.set(xlabel="Features", ylabel="Values", title=title)
       

# Create a 3x2 grid of subplots: one row per transformation, with the original
# data on the left and the transformed data on the right.
fig, axes = plt.subplots(3, 2, figsize=(30, 24), sharey=False)

original_values = feature_df_no_fips.to_numpy()

# List of data pairs and titles for each subplot row
data_pairs = [
    (original_values, "Original Data", feature_df_minmax, "Linear Standardization (Min-Max Scaling)"),
    (original_values, "Original Data", feature_df_zscore, "Z-Transformation (Standardization)"),
    (original_values, "Original Data", feature_df_log10.to_numpy(), "Log10 Scaling"),
]

# Plot each pair of datasets in the grid
for row, (original_data, original_title, transformed_data, transformed_title) in enumerate(data_pairs):
    # Original data in the left column
    plot_features(original_data, original_title, axes[row, 0])
    # Transformed data in the right column
    plot_features(transformed_data, transformed_title, axes[row, 1])

# Hide the x axis and show the legend on every subplot that has data.
# The axis is hidden through `ax` itself: plt.gca() always refers to the
# current axes only, so calling it repeatedly hid just one subplot's x axis.
for ax in axes.flat:
    if ax.has_data():
        ax.get_xaxis().set_visible(False)
        ax.legend(title="Feature Legend", loc='upper right')
plt.tight_layout()
plt.show()


In [None]:
# Drop the 'FIPS' column
feature_df_no_fips = feature_df.drop(columns='FIPS')

# Log10 Scaling (with a small constant to avoid log(0))
# NOTE(review): negative inputs yield NaN under log10 — confirm this is
# acceptable for the growth-rate columns.
feature_df_log10 = np.log10(feature_df_no_fips + 1e-9)

# Plotting function for histograms (only for log10 scaling)
def plot_histograms_log10(data):
    """
    Plot one histogram per column of the log10-scaled feature array.

    `data` is a 2-D array whose columns are labelled in the order of
    `feature_df_no_fips.columns` (a module-level name). Non-finite entries
    (NaN/inf) are removed before binning, so missing values cannot break or
    distort the histograms.
    """
    n_features = data.shape[1]
    # squeeze=False always returns a 2-D array of Axes, so a single feature
    # works as well (a bare Axes object could not be indexed).
    fig, axes = plt.subplots(n_features, 1, figsize=(10, 5 * n_features), squeeze=False)

    # Loop through each feature and plot its histogram
    for i, (ax, col) in enumerate(zip(axes.ravel(), feature_df_no_fips.columns)):
        column_values = data[:, i]
        ax.hist(column_values[np.isfinite(column_values)], bins=20, alpha=0.7, color='skyblue')
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')
        ax.set_title(f'Histogram of {col} (Log10 Scaling)')
        ax.grid(True)

    plt.tight_layout()
    plt.show()

# Plot histograms for Log10 scaling; the function indexes its argument as a
# 2-D array, hence the conversion with to_numpy().
plot_histograms_log10(feature_df_log10.to_numpy())
