In [3]:
import pandas as pd
import numpy as np

In [4]:
# Read the education-attainment workbook into a DataFrame and preview its first rows
df = pd.read_excel(io='entropy_edu.xlsx')
df.head(n=5)

Unnamed: 0,State,Population 25 years and over,Less than 9th grade,9th to 12th grade,High school graduate,"Some college, no degree",Associate's degree,Bachelor's degree,Graduate or professional degree
0,Alabama,3474924,0.033967,0.078063,0.304224,0.206089,0.089653,0.175347,0.112657
1,Alaska,489218,0.022383,0.044395,0.291189,0.237273,0.098872,0.192487,0.113401
2,Arizona,5053656,0.047781,0.060705,0.238035,0.229722,0.094068,0.204219,0.125471
3,Arkansas,2057624,0.040211,0.068378,0.342954,0.212904,0.08147,0.157444,0.096639
4,California,26866773,0.088466,0.064722,0.204969,0.193282,0.078738,0.225415,0.144408


In [5]:
def normalize_data(df):
    """
    Min-max normalize each education-level column across states.

    The first two columns (State and total population) are left untouched.
    Every remaining column is rescaled with ``(x - min) / (max - min)`` so
    that its smallest value becomes 0 and its largest becomes 1.

    A column whose values are all identical has a range of zero. Dividing by
    that range would fill the column with NaN, so such a column is mapped to
    0.0 instead (missing entries in it stay missing).

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns from position 2 onward are numeric indicators.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the indicator columns rescaled; ``df`` itself
        is not modified.
    """
    def _min_max(column):
        # Rescale one indicator column, guarding against a zero range.
        low = column.min()
        span = column.max() - low
        if span == 0:
            # Constant column: (x - min) is already 0 everywhere; cast to
            # float so the dtype matches the other rescaled columns.
            return (column - low).astype(float)
        return (column - low) / span

    # Exclude the first two columns (State and total population) from normalization
    education_columns = df.columns[2:]

    # Work on a copy so the caller's frame is left as it was
    df_normalized = df.copy()
    df_normalized[education_columns] = df[education_columns].apply(_min_max, axis=0)

    return df_normalized

# Rescale the indicator columns of the loaded table
data_normalized = df.pipe(normalize_data)

# Preview the first few rows of the rescaled table
data_normalized.head(n=5)


Unnamed: 0,State,Population 25 years and over,Less than 9th grade,9th to 12th grade,High school graduate,"Some college, no degree",Associate's degree,Bachelor's degree,Graduate or professional degree
0,Alabama,3474924,0.19786,0.859566,0.681452,0.679005,0.517815,0.185877,0.078483
1,Alaska,489218,0.086841,0.217475,0.629061,0.911314,0.602577,0.30929,0.080962
2,Arizona,5053656,0.330237,0.528519,0.415416,0.855064,0.558408,0.393764,0.121157
3,Arkansas,2057624,0.257699,0.674866,0.837121,0.729773,0.442578,0.056976,0.025137
4,California,26866773,0.720138,0.605139,0.282513,0.583592,0.417464,0.546379,0.184222


In [6]:
def calculate_entropy(df_normalized):
    """
    Calculate the normalized Shannon entropy of each education-level column.

    Each indicator column (position 2 onward) is turned into proportions by
    dividing by its column total. The entropy of those proportions is then
    divided by ``ln(number of rows)``.

    Degenerate cases are reported as entropy 1.0, so that ``1 - entropy``
    is 0 for an indicator that cannot distinguish between rows:

    * a column whose total is 0 (all zeros or all NaN), for which the
      proportions are undefined;
    * a frame with one row or fewer, for which ``ln(rows)`` is 0 or
      undefined.

    Parameters
    ----------
    df_normalized : pandas.DataFrame
        Table whose columns from position 2 onward are non-negative
        indicator values.

    Returns
    -------
    pandas.Series
        One entropy value per indicator column, indexed by column name.
    """
    # Exclude the first two columns (State and total population) for entropy calculation
    education_columns = df_normalized.columns[2:]
    values = df_normalized[education_columns]
    n_rows = len(df_normalized)

    if n_rows <= 1:
        # ln(1) == 0 would make the normalization below divide by zero
        return pd.Series(1.0, index=education_columns)

    # Columns with a zero total have undefined proportions (0 / 0).
    # An all-NaN column also sums to 0 because the sum skips NaN.
    degenerate = values.sum(axis=0) == 0

    # Proportion of each state's value relative to the column total
    proportion_matrix = values.apply(lambda x: x / x.sum(), axis=0)

    # The machine epsilon keeps log() finite where a proportion is exactly 0
    eps = np.finfo(float).eps
    entropy = -np.sum(proportion_matrix * np.log(proportion_matrix + eps), axis=0) / np.log(n_rows)

    # Without this, the NaN-skipping sum would report entropy ~0 for these columns
    return entropy.mask(degenerate, 1.0)

# Entropy of every education-level column of the rescaled table
entropy = data_normalized.pipe(calculate_entropy)

# Show the per-column entropy values
entropy


Less than 9th grade                0.945653
9th to 12th grade                  0.940634
High school graduate               0.986068
Some college, no degree            0.978443
Associate's degree                 0.986811
Bachelor's degree                  0.967013
Graduate or professional degree    0.917009
dtype: float64

In [7]:
def calculate_weights(entropy):
    """
    Turn per-indicator entropy values into weights.

    Each indicator contributes ``1 - entropy``; the weight is that
    contribution divided by the total contribution of all indicators.

    Parameters
    ----------
    entropy : pandas.Series
        Entropy value per indicator.

    Returns
    -------
    pandas.Series
        Weight per indicator, carrying the same index as ``entropy``.
    """
    # Contribution of each indicator: the complement of its entropy
    divergence = 1 - entropy
    # Scale the contributions by their total
    total_divergence = divergence.sum()
    return divergence / total_divergence

# Derive one weight per education level from the entropy values
weights = entropy.pipe(calculate_weights)

# Show the resulting weights
weights

Less than 9th grade                0.195234
9th to 12th grade                  0.213265
High school graduate               0.050048
Some college, no degree            0.077440
Associate's degree                 0.047381
Bachelor's degree                  0.118500
Graduate or professional degree    0.298133
dtype: float64

In [8]:
def calculate_comprehensive_scores(df_normalized, weights):
    """
    Compute one weighted score per row of the normalized table.

    The indicator columns (position 2 onward) are combined with ``weights``
    through a matrix product, i.e. each row's score is the sum of its
    indicator values multiplied by the matching weights.

    Parameters
    ----------
    df_normalized : pandas.DataFrame
        Table whose columns from position 2 onward are the indicators.
    weights : pandas.Series
        Weight per indicator.

    Returns
    -------
    pandas.Series
        Score per row, indexed like ``df_normalized``.
    """
    # Skip the first two columns (State and total population)
    indicator_names = df_normalized.columns[2:]
    indicator_values = df_normalized[indicator_names]
    # Matrix product of the indicator table with the weight vector
    return indicator_values @ weights

# Calculate the comprehensive evaluation scores for each state
comprehensive_scores = calculate_comprehensive_scores(data_normalized, weights)

# Attach the scores to a copy of the original (un-normalized) data;
# the assignment aligns on the row index shared by both frames
data_with_scores = df.copy()
data_with_scores['Comprehensive Score'] = comprehensive_scores

# Save the data with comprehensive scores to a new Excel file (rows kept in their original order)
data_with_scores.to_excel('edu_EWM_scores.xlsx', index=False)

# Rank the states once, highest score first. The earlier version sorted twice,
# and the first sorted frame was neither assigned nor displayed.
ranked_scores = data_with_scores.sort_values(by='Comprehensive Score', ascending=False).reset_index(drop=True)

# Display the states with their comprehensive scores, ranked
ranked_scores[['State', 'Comprehensive Score']]

Unnamed: 0,State,Comprehensive Score
0,California,0.468431
1,Puerto Rico,0.448173
2,District of Columbia,0.447841
3,Texas,0.430217
4,New York,0.41493
5,Louisiana,0.402803
6,New Mexico,0.395972
7,Mississippi,0.384362
8,Nevada,0.382845
9,Alabama,0.378591


In [12]:
# Score a new region with the weights derived above.
# The user types one raw value per education-level column, on the same scale
# as the loaded table. Those raw values are then min-max rescaled with the
# minimum and range of the existing data, because the weights were derived
# from (and the state scores computed on) min-max normalized columns.
# NOTE(review): this cell blocks on input(), so it cannot run unattended.

education_columns = data_normalized.columns[2:]

# Ask the user to input the education data for a new region
new_region_data = []
print("Please enter the education data for the new region:")
for column in education_columns:
    while True:
        try:
            value = float(input(f"{column}: "))
            break
        except ValueError:
            # Re-prompt instead of discarding everything entered so far
            print("Invalid number, please try again.")
    new_region_data.append(value)

new_region_raw = pd.Series(new_region_data, index=education_columns, dtype=float)

# Rescale with the existing data's per-column minimum and range.
# Values outside the observed range land outside [0, 1]; they are not clipped.
column_min = df[education_columns].min()
column_range = df[education_columns].max() - column_min
new_region_normalized = (new_region_raw - column_min) / column_range.where(column_range != 0)
# A column with zero range carries no information; treat it as 0
new_region_normalized[column_range == 0] = 0.0

# Calculate the comprehensive score for the new region using the existing weights
comprehensive_score_new_region = float(new_region_normalized.dot(weights))
print(f"The comprehensive score for the new region is: {comprehensive_score_new_region}")

Please enter the education data for the new region:
The comprehensive score for the new region is: 1.0
