# Data Analysis: Software Engineer Earnings

This notebook performs inferential statistical analysis to address the research question:

**"What demographic factors influence median weekly earnings for software engineers in the U.S. tech industry?"**

We build on our data exploration by using statistical tests (a Welch t-test for gender and a one-way ANOVA for education level) and a linear regression model to assess how gender, education, race, and age contribute to earnings differences.


In [None]:
# Load libraries and dataset
from pathlib import Path

import pandas as pd
from scipy.stats import ttest_ind, f_oneway
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Location of the cleaned dataset relative to the repository root. Using a
# repo-relative path (instead of a user-specific absolute path) lets the
# notebook run on any checkout of the repository.
DATASET_RELATIVE_PATH = (
    Path("1_datasets") / "software_engineers_employment_dataset_cleaned.csv"
)


def _find_dataset(relative_path, start=None):
    """Locate ``relative_path`` under ``start`` or the nearest parent directory.

    Parameters
    ----------
    relative_path : str or Path
        Path of the file relative to the directory being searched for.
    start : str or Path, optional
        Directory where the upward search begins. Defaults to the current
        working directory.

    Returns
    -------
    Path
        Absolute path of the first match found while walking upwards.

    Raises
    ------
    FileNotFoundError
        If the file exists under neither ``start`` nor any of its parents.
    """
    origin = (Path.cwd() if start is None else Path(start)).resolve()
    for base in (origin, *origin.parents):
        candidate = base / relative_path
        if candidate.is_file():
            return candidate
    raise FileNotFoundError(
        f"Could not find '{relative_path}' in '{origin}' or any parent directory; "
        "run this notebook from inside the project repository."
    )


df = pd.read_csv(_find_dataset(DATASET_RELATIVE_PATH))

In [4]:
# Gender-based earnings: two-sample t-test on weekly earnings,
# comparing rows with SEX == 1 against rows with SEX == 2.
is_male = df["SEX"] == 1
is_female = df["SEX"] == 2
male = df.loc[is_male, "weekly_earnings"]
female = df.loc[is_female, "weekly_earnings"]

# equal_var=False selects Welch's variant, which does not assume that the
# two groups share a common variance.
t_stat, p_val = ttest_ind(male, female, equal_var=False)
print(f"T-statistic: {t_stat:.2f}, p-value: {p_val:.4f}")

T-statistic: 20.08, p-value: 0.0000


In [5]:
# Education-level earnings: one-way ANOVA across the EDUC groups.
# Each element of edu_groups holds the weekly earnings of one EDUC value.
earnings_by_educ = df.groupby("EDUC")["weekly_earnings"]
edu_groups = [earnings.values for _, earnings in earnings_by_educ]

f_stat, p_val = f_oneway(*edu_groups)
print(f"F-statistic: {f_stat:.2f}, p-value: {p_val:.4f}")

F-statistic: 100.50, p-value: 0.0000


In [6]:
# Regression: predict weekly earnings from AGE, SEX, EDUC and RACE with an
# ordinary least-squares model, and report R² on a held-out test split.
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

X = df[["AGE", "SEX", "EDUC", "RACE"]]
y = df["weekly_earnings"]

# SEX, EDUC and RACE are one-hot encoded as categorical variables; AGE is
# passed through unchanged as a numeric predictor.
categorical_features = ["SEX", "EDUC", "RACE"]
numeric_features = ["AGE"]

preprocessor = ColumnTransformer(
    transformers=[
        (
            "cat",
            # drop="first" removes one level per feature to avoid perfectly
            # collinear dummy columns. handle_unknown="ignore" keeps
            # predict() from raising when a rare level ends up only in the
            # test split: such rows are encoded as all zeros (i.e. like the
            # dropped reference level). Combining it with `drop` requires
            # scikit-learn >= 1.0.
            OneHotEncoder(drop="first", handle_unknown="ignore"),
            categorical_features,
        ),
        ("num", "passthrough", numeric_features),
    ]
)

# Build pipeline: preprocessing followed by linear regression, so the encoder
# is fitted on the training rows only.
pipeline = Pipeline(
    steps=[("preprocessor", preprocessor), ("regressor", LinearRegression())]
)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print(f"R² Score: {r2_score(y_test, y_pred):.4f}")

R² Score: 0.1107
