In [1]:
import pandas as pd

# Read the Combined Dataset Combined_Dataset.Elsx file
Final_Data = pd.read_excel('/content/drive/My Drive/Combined_Dataset.xlsx')

In [2]:
from scipy.stats import f_oneway, chi2_contingency
from sklearn.preprocessing import OneHotEncoder

Final_Data['adjectives'] = Final_Data[['adjective1', 'adjective2', 'adjective3']].apply(lambda x: ' '.join(x.dropna().astype(str)).lower(), axis=1)

# Salary Analysis

# Calculate average salaries by race and gender
avg_salary_race_gender = Final_Data.groupby(['Race', 'Gender'])[['predictedSalary', 'deservedSalary']].mean().reset_index()

# Calculate average salaries by race, gender, and job category
avg_salary_race_gender_category = Final_Data.groupby(['Race', 'Gender', 'Category'])[['predictedSalary', 'deservedSalary']].mean().reset_index()

# Display the average salary tables
print("Average Salary by Race and Gender:")
print(avg_salary_race_gender)
print("\nAverage Salary by Race, Gender, and Category:")
print(avg_salary_race_gender_category)

# Perform separate ANOVA tests for predictedSalary and deservedSalary by Race and Gender

# ANOVA test for predictedSalary by Race
anova_predicted_race = f_oneway(*[Final_Data[Final_Data['Race'] == race]['predictedSalary'].dropna() for race in Final_Data['Race'].unique()])
print("\nANOVA test for Predicted Salary by Race:", anova_predicted_race)

# ANOVA test for deservedSalary by Race
anova_deserved_race = f_oneway(*[Final_Data[Final_Data['Race'] == race]['deservedSalary'].dropna() for race in Final_Data['Race'].unique()])
print("ANOVA test for Deserved Salary by Race:", anova_deserved_race)

# ANOVA test for predictedSalary by Gender
anova_predicted_gender = f_oneway(*[Final_Data[Final_Data['Gender'] == gender]['predictedSalary'].dropna() for gender in Final_Data['Gender'].unique()])
print("\nANOVA test for Predicted Salary by Gender:", anova_predicted_gender)

# ANOVA test for deservedSalary by Gender
anova_deserved_gender = f_oneway(*[Final_Data[Final_Data['Gender'] == gender]['deservedSalary'].dropna() for gender in Final_Data['Gender'].unique()])
print("ANOVA test for Deserved Salary by Gender:", anova_deserved_gender)


Average Salary by Race and Gender:
                     Race  Gender  predictedSalary  deservedSalary
0  Asian/Pacific Islander  Female     82732.565848    83739.822248
1  Asian/Pacific Islander    Male     80240.453484    82088.642941
2                   Black  Female     80431.675795    82440.153587
3                   Black    Male     80280.600483    81860.125235
4                Hispanic  Female     82330.657672    84663.068240
5                Hispanic    Male     82660.231172    83878.687041
6         Native American  Female     80162.868304    82278.344668
7         Native American    Male     81498.175997    82888.479762
8                   White  Female     80455.117600    82922.474379
9                   White    Male     80071.727749    81097.315949

Average Salary by Race, Gender, and Category:
                       Race  Gender                Category  predictedSalary  \
0    Asian/Pacific Islander  Female              ACCOUNTANT     83825.305085   
1    Asian/Pacific Is

In [3]:
import statsmodels.api as sm
import statsmodels.formula.api as ols
from statsmodels.stats.anova import anova_lm


# Two-Way ANOVA for predictedSalary (Race and Gender)
model_predicted = ols.ols('predictedSalary ~ C(Race) + C(Gender) + C(Race):C(Gender)', data=Final_Data).fit()
anova_predicted = anova_lm(model_predicted)
print("\nTwo-Way ANOVA for Predicted Salary (Race and Gender):")
print(anova_predicted)

# Two-Way ANOVA for deservedSalary (Race and Gender)
model_deserved = ols.ols('deservedSalary ~ C(Race) + C(Gender) + C(Race):C(Gender)', data=Final_Data).fit()
anova_deserved = anova_lm(model_deserved)
print("\nTwo-Way ANOVA for Deserved Salary (Race and Gender):")
print(anova_deserved)



Two-Way ANOVA for Predicted Salary (Race and Gender):
                        df        sum_sq       mean_sq         F    PR(>F)
C(Race)                4.0  1.699137e+10  4.247844e+09  1.355799  0.246597
C(Gender)              1.0  4.604021e+08  4.604021e+08  0.146948  0.701472
C(Race):C(Gender)      4.0  9.809411e+09  2.452353e+09  0.782726  0.536175
Residual           24820.0  7.776335e+13  3.133092e+09       NaN       NaN

Two-Way ANOVA for Deserved Salary (Race and Gender):
                        df        sum_sq       mean_sq         F    PR(>F)
C(Race)                4.0  1.623387e+10  4.058468e+09  1.207070  0.305405
C(Gender)              1.0  4.444093e+09  4.444093e+09  1.321763  0.250287
C(Race):C(Gender)      4.0  4.720099e+09  1.180025e+09  0.350963  0.843522
Residual           24820.0  8.345098e+13  3.362247e+09       NaN       NaN


In [4]:
# Adjective Analysis

# Split adjectives into a list and convert them to lowercase
Final_Data['adjective_list'] = Final_Data['adjectives'].str.lower().str.split()

# Flatten the list of adjectives across all rows
all_adjectives = [adj for sublist in Final_Data['adjective_list'].dropna() for adj in sublist]

# Explode the adjective list to prepare for one-hot encoding
filtered_data = Final_Data.explode('adjective_list')

# One-hot encode all adjectives
one_hot_encoded = pd.get_dummies(filtered_data['adjective_list'])

# Append race and gender for group-by counts
adjective_counts = filtered_data[['Race', 'Gender']].join(one_hot_encoded).groupby(['Race', 'Gender']).sum()

# Perform chi-squared test for race
chi2_race = chi2_contingency(adjective_counts.groupby(level=0).sum().T)
print("\nChi-squared test for adjectives by Race:", chi2_race)

# Perform chi-squared test for gender
chi2_gender = chi2_contingency(adjective_counts.groupby(level=1).sum().T)
print("Chi-squared test for adjectives by Gender:", chi2_gender)



Chi-squared test for adjectives by Race: Chi2ContingencyResult(statistic=425.38059508146387, pvalue=1.5134228574935192e-30, dof=140, expected_freq=array([[1269.6, 1269.6, 1269.6, 1269.6, 1269.6],
       [ 910.2,  910.2,  910.2,  910.2,  910.2],
       [1154.4, 1154.4, 1154.4, 1154.4, 1154.4],
       [  57.6,   57.6,   57.6,   57.6,   57.6],
       [2529. , 2529. , 2529. , 2529. , 2529. ],
       [5762.4, 5762.4, 5762.4, 5762.4, 5762.4],
       [ 596.4,  596.4,  596.4,  596.4,  596.4],
       [ 205.2,  205.2,  205.2,  205.2,  205.2],
       [ 684.6,  684.6,  684.6,  684.6,  684.6],
       [1034.4, 1034.4, 1034.4, 1034.4, 1034.4],
       [1175.4, 1175.4, 1175.4, 1175.4, 1175.4],
       [  70.8,   70.8,   70.8,   70.8,   70.8],
       [  57.6,   57.6,   57.6,   57.6,   57.6],
       [1930.8, 1930.8, 1930.8, 1930.8, 1930.8],
       [  57.6,   57.6,   57.6,   57.6,   57.6],
       [ 134.4,  134.4,  134.4,  134.4,  134.4],
       [3667.2, 3667.2, 3667.2, 3667.2, 3667.2],
       [2562.6, 256

In [5]:
from scipy.stats import chi2_contingency

# Group by Gender and Race, summing over the one-hot encoded adjectives
grouped_counts = adjective_counts.groupby(['Gender', 'Race']).sum()

# Perform a chi-squared test for race and gender
chi2_race_gender = chi2_contingency(grouped_counts)
print("Chi-squared test for adjectives by Race and Gender:", chi2_race_gender)

Chi-squared test for adjectives by Race and Gender: Chi2ContingencyResult(statistic=895.615193807821, pvalue=4.269785031385703e-57, dof=315, expected_freq=array([[ 634.8,  455.1,  577.2,   28.8, 1264.5, 2881.2,  298.2,  102.6,
         342.3,  517.2,  587.7,   35.4,   28.8,  965.4,   28.8,   67.2,
        1833.6, 1281.3,   69.9,  105.3,  388.5,  349.8,  326.4,  328.5,
         177. ,  511.5,  106.2, 2308.8, 2380.8,  284.7,   69. , 2018.4,
         223.5,  318.3,  141.6,  308.7],
       [ 634.8,  455.1,  577.2,   28.8, 1264.5, 2881.2,  298.2,  102.6,
         342.3,  517.2,  587.7,   35.4,   28.8,  965.4,   28.8,   67.2,
        1833.6, 1281.3,   69.9,  105.3,  388.5,  349.8,  326.4,  328.5,
         177. ,  511.5,  106.2, 2308.8, 2380.8,  284.7,   69. , 2018.4,
         223.5,  318.3,  141.6,  308.7],
       [ 634.8,  455.1,  577.2,   28.8, 1264.5, 2881.2,  298.2,  102.6,
         342.3,  517.2,  587.7,   35.4,   28.8,  965.4,   28.8,   67.2,
        1833.6, 1281.3,   69.9,  105.3,  38

In [6]:
import pandas as pd
from collections import Counter

# Combine the adjectives into a single column
Final_Data['adjectives'] = Final_Data[['adjective1', 'adjective2', 'adjective3']].apply(lambda x: ' '.join(x.dropna().astype(str)).lower(), axis=1)

# Helper function to get the top 10 adjectives
def get_top_adjectives(df, group_by_col):
    top_adjectives = []
    grouped_data = df.groupby(group_by_col, observed=False)['adjectives'].apply(lambda x: ' '.join(x)).reset_index()

    for _, row in grouped_data.iterrows():
        group = row[group_by_col]
        adjectives = row['adjectives'].split()
        most_common = Counter(adjectives).most_common(5)
        top_adjectives.append([group] + [f"{adj} ({count})" for adj, count in most_common])

    # Create a DataFrame for display
    columns = [group_by_col] + [f"Top {i+1}" for i in range(5)]
    return pd.DataFrame(top_adjectives, columns=columns)

# Top adjectives by Race
top_adjectives_race = get_top_adjectives(Final_Data, 'Race')

# Top adjectives by Gender
top_adjectives_gender = get_top_adjectives(Final_Data, 'Gender')

# Top adjectives by Race and Gender combination
Final_Data['Race_Gender'] = Final_Data['Race'].astype(str) + ' / ' + Final_Data['Gender'].astype(str)
top_adjectives_race_gender = get_top_adjectives(Final_Data, 'Race_Gender')

# Styling function for Excel-like table
def style_table(df):
    return df.style.set_table_styles(
        [
            {'selector': 'th', 'props': [('font-weight', 'bold'), ('background-color', '#D9EAD3')]},
            {'selector': 'td', 'props': [('text-align', 'center')]},
        ]
    ).set_properties(**{'text-align': 'center'})

# Style the tables
styled_race = style_table(top_adjectives_race)
styled_gender = style_table(top_adjectives_gender)
styled_race_gender = style_table(top_adjectives_race_gender)

# Display the styled tables
styled_race
styled_gender
styled_race_gender


Unnamed: 0,Race_Gender,Top 1,Top 2,Top 3,Top 4,Top 5
0,Asian/Pacific Islander / Female,dedicated (951),reliable (785),professional (760),skilled (684),hardworking (603)
1,Asian/Pacific Islander / Male,dedicated (958),reliable (821),professional (768),skilled (665),hardworking (613)
2,Black / Female,dedicated (975),reliable (786),professional (784),skilled (685),hardworking (622)
3,Black / Male,dedicated (974),professional (786),reliable (783),skilled (659),hardworking (618)
4,Hispanic / Female,dedicated (957),reliable (782),professional (762),skilled (697),hardworking (601)
5,Hispanic / Male,dedicated (978),reliable (809),professional (742),skilled (673),hardworking (618)
6,Native American / Female,dedicated (983),reliable (812),professional (807),skilled (731),hardworking (611)
7,Native American / Male,dedicated (963),reliable (772),professional (744),skilled (676),hardworking (606)
8,White / Female,dedicated (954),reliable (779),professional (760),skilled (640),hardworking (624)
9,White / Male,dedicated (911),reliable (807),professional (783),skilled (618),hardworking (596)


In [7]:
styled_gender


Unnamed: 0,Gender,Top 1,Top 2,Top 3,Top 4,Top 5
0,Female,dedicated (4820),reliable (3944),professional (3873),skilled (3437),hardworking (3061)
1,Male,dedicated (4784),reliable (3992),professional (3823),skilled (3291),hardworking (3051)


In [8]:
styled_race

Unnamed: 0,Race,Top 1,Top 2,Top 3,Top 4,Top 5
0,Asian/Pacific Islander,dedicated (1909),reliable (1606),professional (1528),skilled (1349),hardworking (1216)
1,Black,dedicated (1949),professional (1570),reliable (1569),skilled (1344),hardworking (1240)
2,Hispanic,dedicated (1935),reliable (1591),professional (1504),skilled (1370),hardworking (1219)
3,Native American,dedicated (1946),reliable (1584),professional (1551),skilled (1407),hardworking (1217)
4,White,dedicated (1865),reliable (1586),professional (1543),skilled (1258),hardworking (1220)


In [9]:
import pandas as pd
from collections import Counter

# Combine the adjectives into a single column
Final_Data['adjectives'] = Final_Data[['adjective1', 'adjective2', 'adjective3']].apply(lambda x: ' '.join(x.dropna().astype(str)).lower(), axis=1)

# Helper function to calculate adjective frequencies for each demographic group
def calculate_adjective_frequencies(df, group_by_col):
    freq_data = {}
    grouped_data = df.groupby(group_by_col)['adjectives'].apply(lambda x: ' '.join(x)).reset_index()

    for _, row in grouped_data.iterrows():
        group = row[group_by_col]
        adjectives = row['adjectives'].split()
        most_common = Counter(adjectives)
        total_count = sum(most_common.values())
        # Calculate term frequency (TF)
        freq_data[group] = {adj: count / total_count for adj, count in most_common.items()}

    return freq_data

# Calculate frequencies by Race and Gender
freq_by_race = calculate_adjective_frequencies(Final_Data, 'Race')
freq_by_gender = calculate_adjective_frequencies(Final_Data, 'Gender')
Final_Data['Race_Gender'] = Final_Data['Race'].astype(str) + ' / ' + Final_Data['Gender'].astype(str)
freq_by_race_gender = calculate_adjective_frequencies(Final_Data, 'Race_Gender')

# Helper function to find the most disproportionate adjectives
def find_disproportionate_adjectives(freq_dict):
    disproportionate_adjectives = {}

    groups = list(freq_dict.keys())
    for group in groups:
        adjectives = freq_dict[group]
        other_groups = [g for g in groups if g != group]
        group_results = []

        for adj, freq in adjectives.items():
            # Calculate average frequency of the adjective in other groups
            avg_other_freq = sum(freq_dict[other_group].get(adj, 0) for other_group in other_groups) / len(other_groups)
            if avg_other_freq > 0:
                # Calculate the disproportionate ratio
                ratio = freq / avg_other_freq
                if ratio > 1:  # Only consider if it is disproportionately higher
                    group_results.append((adj, round(ratio, 2)))

        # Sort by the ratio and take the top 5
        top_5 = sorted(group_results, key=lambda x: x[1], reverse=True)[:5]
        disproportionate_adjectives[group] = top_5

    return disproportionate_adjectives

# Find the most disproportionate adjectives by Race
disproportionate_adjectives_race = find_disproportionate_adjectives(freq_by_race)

# Find the most disproportionate adjectives by Gender
disproportionate_adjectives_gender = find_disproportionate_adjectives(freq_by_gender)

# Find the most disproportionate adjectives by Race and Gender combination
disproportionate_adjectives_race_gender = find_disproportionate_adjectives(freq_by_race_gender)

# Display the results
print("\nMost Disproportionate Adjectives by Race:")
for race, adjectives in disproportionate_adjectives_race.items():
    print(f"{race}: {', '.join([f'{adj} ({ratio}x)' for adj, ratio in adjectives])}")

print("\nMost Disproportionate Adjectives by Gender:")
for gender, adjectives in disproportionate_adjectives_gender.items():
    print(f"{gender}: {', '.join([f'{adj} ({ratio}x)' for adj, ratio in adjectives])}")

print("\nMost Disproportionate Adjectives by Race and Gender:")
for race_gender, adjectives in disproportionate_adjectives_race_gender.items():
    print(f"{race_gender}: {', '.join([f'{adj} ({ratio}x)' for adj, ratio in adjectives])}")



Most Disproportionate Adjectives by Race:
Asian/Pacific Islander: engaging (1.26x), technical (1.13x), precise (1.12x), proactive (1.08x), meticulous (1.07x)
Black: skillful (1.21x), diligent (1.16x), responsible (1.14x), dynamic (1.12x), motivated (1.1x)
Hispanic: expressive (1.33x), energetic (1.3x), communicative (1.19x), educated (1.13x), experienced (1.09x)
Native American: problem-solving (1.3x), expressive (1.19x), communicative (1.19x), methodical (1.18x), adaptable (1.08x)
White: communicative (1.49x), energetic (1.36x), engaging (1.33x), practical (1.18x), methodical (1.18x)

Most Disproportionate Adjectives by Gender:
Female: expressive (1.23x), hands-on (1.17x), versatile (1.13x), practical (1.11x), diligent (1.09x)
Male: engaging (1.53x), skillful (1.18x), communicative (1.13x), meticulous (1.09x), dynamic (1.09x)

Most Disproportionate Adjectives by Race and Gender:
Asian/Pacific Islander / Female: practical (1.23x), technical (1.16x), engaging (1.16x), precise (1.15x), 