# This section contains all code and answers for our second analysis question

## Question: If S&P is increasing or decreasing, which demographic is most closely related to the S&P performance?

Below are all the imports needed to run our code.

In [None]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import random
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import classification_report, mean_squared_error, r2_score, roc_curve, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from scipy.stats import chi2_contingency

Below is our web scraping process.

In [None]:
# Scrape the BLS civilian unemployment-rate table into `unemployment_df`:
# one row per month, with the overall rate plus seven demographic breakdowns.
browser = webdriver.Chrome()
try:
    browser.get("https://www.bls.gov/charts/employment-situation/civilian-unemployment-rate.htm")
    browser.maximize_window()
    wait = WebDriverWait(browser, 15)

    # The table is only reachable after the "Show table" link is clicked.
    show_table_button = wait.until(
        EC.element_to_be_clickable((By.LINK_TEXT, "Show table"))
    )
    show_table_button.click()

    # Fixed pause so the page can render the table after the click.
    time.sleep(2)

    table = browser.find_element(By.TAG_NAME, "table")
    rows = table.find_elements(By.TAG_NAME, "tr")

    month_years, total_rates, men_rates, women_rates, teen_rates, white_rates, black_rates, asian_rates, latino_rates = ([] for _ in range(9))

    # rows[0] is skipped as the header row; data rows are recognised by
    # having exactly eight <td> cells (the month label lives in the <th>).
    for row in rows[1:]:
        cols = row.find_elements(By.TAG_NAME, "td")
        if len(cols) != 8:
            continue
        th = row.find_element(By.TAG_NAME, "th")
        month_year = th.find_element(By.CLASS_NAME, "sub0").text
        month_years.append(month_year)
        total_rates.append(cols[0].text)
        men_rates.append(cols[1].text)
        women_rates.append(cols[2].text)
        teen_rates.append(cols[3].text)
        white_rates.append(cols[4].text)
        black_rates.append(cols[5].text)
        asian_rates.append(cols[6].text)
        latino_rates.append(cols[7].text)
finally:
    # Always release the browser/driver process, even when scraping fails.
    browser.quit()

# Build the frame once, after the loop. This also guarantees that
# `unemployment_df` exists (possibly empty) when no row matched.
unemployment_df = pd.DataFrame({
    "Date": month_years,
    "Total Rate": total_rates,
    "Male Rate": men_rates,
    "Female Rate": women_rates,
    "Teen Rate": teen_rates,
    "White Rate": white_rates,
    "Black Rate": black_rates,
    "Asian Rate": asian_rates,
    "Hispanic Rate": latino_rates
})

print(unemployment_df.head(10))

Below, we read our S&P 500 data and merge it with the unemployment data.

In [None]:
# Load the S&P 500 price history from the local CSV file, then preview the
# first ten rows (the trailing bare expression is what the cell displays).
sp500_df = pd.read_csv(filepath_or_buffer="spy.csv")
sp500_df.head(n=10)

In [None]:
# Parse both date columns; values that cannot be parsed become NaT.
unemployment_df['Date'] = pd.to_datetime(unemployment_df['Date'], errors='coerce')
sp500_df['Date'] = pd.to_datetime(sp500_df['Date'], errors='coerce')

rate_cols = [col for col in unemployment_df.columns if col != 'Date']

# The scraped rates are strings: strip any percent sign and convert to
# numbers (unparseable cells become NaN).
for col in rate_cols:
    unemployment_df[col] = pd.to_numeric(unemployment_df[col].astype(str).str.replace('%', ''), errors='coerce')

# Merging data
# pandas joins null keys to each other, so rows whose date failed to parse
# are excluded from both sides before the (inner) join.
# NOTE(review): the join needs identical timestamps on both sides. If spy.csv
# is daily while the unemployment data is monthly, only months whose parsed
# date is also present in spy.csv survive - confirm the CSV's frequency.
merged_df = pd.merge(
    unemployment_df.dropna(subset=['Date']),
    sp500_df.dropna(subset=['Date'])[['Date', 'Close']],
    on='Date',
)

merged_df.sort_values('Date', inplace=True)

# Period-over-period change in the close. The first row has no previous
# close (diff is NaN), so it is dropped rather than mislabelled 'Decrease'.
close_change = merged_df['Close'].diff()
merged_df = merged_df[close_change.notna()].copy()
# An unchanged close (diff == 0) is still labelled 'Decrease'.
merged_df['SP_Trend'] = (close_change.dropna() > 0).map({True: 'Increase', False: 'Decrease'})

# Check for any issues in the Date column
print(merged_df['Date'].head())


This code creates a multi-line time series plot that visualizes the S&P 500 closing price alongside various unemployment rates (by demographic group) over time. Each line represents a different variable, allowing for visual comparison of market performance and unemployment trends across groups, all plotted against the same time axis.

In [None]:
# Time-series comparison of the S&P 500 close and the demographic
# unemployment rates. The close is a price while the rates are percentages,
# so each gets its own y-axis; on one shared axis the rate lines are
# compressed by the much larger price scale.
fig, price_ax = plt.subplots(figsize=(12, 6))
rate_ax = price_ax.twinx()

# legend=False on every call: a single combined legend is built below.
sns.lineplot(data=merged_df, x='Date', y='Close', label='S&P 500',
             color='black', legend=False, ax=price_ax)

unemployment_series = {
    'Female Rate': 'Female Unemployment',
    'Male Rate': 'Male Unemployment',
    'Teen Rate': 'Teen Unemployment',
    'White Rate': 'White Unemployment',
    'Black Rate': 'Black Unemployment',
    'Asian Rate': 'Asian Unemployment',
    'Hispanic Rate': 'Hispanic Unemployment',
}
for rate_column, rate_label in unemployment_series.items():
    sns.lineplot(data=merged_df, x='Date', y=rate_column, label=rate_label,
                 legend=False, ax=rate_ax)

price_ax.set_ylabel('S&P 500 Close')
rate_ax.set_ylabel('Unemployment Rate (%)')
price_ax.set_title('S&P 500 vs Unemployment Rates Over Time')

# Merge the handles of both axes so every line appears in one legend.
price_handles, price_labels = price_ax.get_legend_handles_labels()
rate_handles, rate_legend_labels = rate_ax.get_legend_handles_labels()
rate_ax.legend(price_handles + rate_handles, price_labels + rate_legend_labels)
plt.show()

This code trains a Random Forest classifier to predict SP_Trend (i.e., an indicator of S&P 500 market direction) using various demographic unemployment rates as features. It splits the data into training and test sets, fits the model, evaluates its performance using a classification report, and displays the importance of each feature based on how much it contributes to the model’s predictions.

In [None]:
# Predictors: the seven demographic unemployment rates (the overall
# 'Total Rate' is not used). Target: the Increase/Decrease trend label.
features = [
    'Male Rate',
    'Female Rate',
    'Teen Rate',
    'White Rate',
    'Black Rate',
    'Asian Rate',
    'Hispanic Rate',
]
X = merged_df.loc[:, features]
y = merged_df.loc[:, 'SP_Trend']

# 80/20 random split with a fixed seed for reproducibility.
(X_train, X_test, y_train, y_test) = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so construction and fitting can chain.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# Feature importances, most important first; the bare expression at the end
# is what the cell displays.
importances = pd.Series(
    data=model.feature_importances_, index=features
).sort_values(ascending=False)
importances

This code calculates the correlation between each unemployment rate (excluding the 'Date' column) and the S&P 500 closing price, then visualizes the results as a bar plot. Each bar represents how strongly a specific unemployment rate is linearly related to the S&P 500 close, making it easy to compare the strength and direction of these relationships across demographic groups.

In [None]:
# Every unemployment column except the date.
rate_cols = [name for name in unemployment_df.columns if name != 'Date']

# Correlation of each rate with the S&P 500 close: take the 'Close' column
# of the full correlation matrix and remove its self-correlation entry.
corr_matrix = merged_df[[*rate_cols, 'Close']].corr()
correlations = corr_matrix['Close'].drop(labels='Close')

# One bar per rate column.
plt.figure(figsize=(10, 6))
sns.barplot(
    x=correlations.index,
    y=correlations.values,
    palette='viridis',
)
plt.title('Correlation of Unemployment Rates with S&P 500 Close')
plt.ylabel('Correlation Coefficient')
plt.xticks(rotation=45)
plt.grid(True, linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()

This code creates binary indicators for each demographic group based on whether their unemployment rate is above the median, then conducts a chi-square test of independence to assess whether there is a statistically significant association between high unemployment (by group) and S&P 500 trend (SP_Trend). For each demographic group, it prints the chi-square statistic, p-value, degrees of freedom, and expected frequencies, helping determine whether stock market movements are linked to elevated unemployment in specific populations.

In [None]:
# (group name, indicator column) pairs; each group's rate column is
# '<group> Rate' and its indicator column is 'High_<group>_Unemployment'.
demographic_columns = [
    (group, f'High_{group}_Unemployment')
    for group in ('Teen', 'Male', 'Female', 'White', 'Black', 'Asian', 'Hispanic')
]

merged_df['SP_Trend'] = merged_df['SP_Trend'].astype('category')

for rate, high_column in demographic_columns:
    # Flag the periods in which this group's rate is above its own median.
    source_column = f'{rate} Rate'
    merged_df[high_column] = merged_df[source_column].gt(merged_df[source_column].median())

    # Chi-square test of independence between the S&P trend and the flag.
    contingency = pd.crosstab(merged_df['SP_Trend'], merged_df[high_column])
    chi2, p, dof, expected = chi2_contingency(contingency)

    print(f"Results for {rate} Unemployment:")
    print(f"Chi-square statistic: {chi2}")
    print(f"P-value: {p}")
    print(f"Degrees of freedom: {dof}")
    print(f"Expected frequencies:\n{expected}\n")

In [None]:
# Add a one-row-lagged copy of every feature ('<feature>_lag1'), then keep
# only the rows where all lagged values are present.
lag_column_names = {source: f'{source}_lag1' for source in features}
for source, target in lag_column_names.items():
    merged_df[target] = merged_df[source].shift(periods=1)

merged_df_lagged = merged_df.dropna(subset=list(lag_column_names.values()))

This code uses lagged versions of unemployment rate features (i.e., values from the previous time period) to train a Random Forest classifier that predicts SP_Trend. It splits the data into training and testing sets, fits the model, evaluates prediction performance with a classification report, and prints the importance of each lagged feature, helping identify which past unemployment rates are most influential in forecasting S&P 500 movements.

In [None]:
# Random Forest on the previous-row unemployment rates.
lagged_features = [name + '_lag1' for name in features]
X_lag = merged_df_lagged.loc[:, lagged_features]
y_lag = merged_df_lagged.loc[:, 'SP_Trend']

# Same 80/20 seeded split as used for the non-lagged model.
(X_train_lag, X_test_lag, y_train_lag, y_test_lag) = train_test_split(
    X_lag, y_lag, test_size=0.2, random_state=42
)

model_lag = RandomForestClassifier(random_state=42).fit(X_train_lag, y_train_lag)

y_pred_lag = model_lag.predict(X_test_lag)
lag_report = classification_report(y_test_lag, y_pred_lag)
print(lag_report)

# Importances of the lagged features, largest first.
importances_lag = pd.Series(
    data=model_lag.feature_importances_, index=lagged_features
).sort_values(ascending=False)
print(importances_lag)

In [None]:
# Fractional change of the close versus the previous row; the first row has
# no predecessor, so its NaN change is dropped.
sp_change = merged_df['Close'].pct_change()
merged_df = merged_df.assign(SP_Change=sp_change).dropna(subset=['SP_Change'])

This code fits a logistic regression model using the training data (X_train, y_train) to predict SP_Trend, then prints the model's learned coefficients for each unemployment rate feature. These coefficients indicate the direction and strength of each variable's influence on the likelihood of the S&P 500 trending upward.

In [None]:
# Fit a default logistic regression on the existing training split and show
# one coefficient per feature.
# NOTE(review): assumes X_train still has the columns listed in `features`
# in that order - confirm the cell execution order.
log_model = LogisticRegression().fit(X_train, y_train)
coefficient_table = pd.Series(log_model.coef_[0], index=features)
print("Logistic Coefficients:\n", coefficient_table)


This code encodes the categorical target SP_Trend into binary form, trains a Random Forest classifier, and evaluates its probabilistic predictions using an ROC curve. It plots the True Positive Rate against the False Positive Rate at various thresholds and displays the AUC score, which summarizes the model's overall ability to distinguish between the two classes.

In [None]:
# Integer-encode the trend labels so the ROC utilities receive a numeric target.
le = LabelEncoder()
y_bin = le.fit(y).transform(y)

(X_train_bin, X_test_bin, y_train_bin, y_test_bin) = train_test_split(
    X, y_bin, test_size=0.2, random_state=42
)
model_bin = RandomForestClassifier(random_state=42).fit(X_train_bin, y_train_bin)

# Column 1 of predict_proba is the probability of the class encoded as 1.
y_proba = model_bin.predict_proba(X_test_bin)[:, 1]

fpr, tpr, thresholds = roc_curve(y_test_bin, y_proba)
auc_label = f"AUC = {roc_auc_score(y_test_bin, y_proba):.2f}"

plt.figure(figsize=(6, 4))
plt.plot(fpr, tpr, label=auc_label)
plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - Predicting S&P Trend")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Class balance of the trend label.
trend_counts = merged_df['SP_Trend'].value_counts()
print(trend_counts)

This code calculates the predicted probabilities for the positive class using a trained model, then flips those probabilities to represent the negative class. It computes and prints the ROC AUC score based on the flipped probabilities, which evaluates how well the model distinguishes the negative class (0) instead of the positive class (1).

In [None]:
# Probability of the model's second class for each test row, and its complement.
class_probabilities = model.predict_proba(X_test)
y_proba = class_probabilities[:, 1]
y_proba_flipped = 1 - y_proba

# AUC obtained when the scores are reversed.
roc_auc_flipped = roc_auc_score(y_test, y_proba_flipped)
print(f"Flipped AUC: {roc_auc_flipped:.2f}")

This code preprocesses the data by mapping SP_Trend to binary values, then creates lagged versions of various unemployment rate features to predict S&P 500 trend (SP_Trend). It trains a Random Forest classifier on these lagged features, evaluates the model's performance using a classification report and AUC (Area Under the Curve) score, and computes an alternative AUC score after flipping the predicted probabilities. Finally, it visualizes the ROC curve to assess how well the model discriminates between the two trends (Increase vs. Decrease) of the S&P 500.

In [None]:
# Encode the trend as 1 (Increase) / 0 (Decrease).
# The encoding only runs while the text labels are still present, so
# re-running this cell leaves an already-encoded column untouched instead of
# mapping every value to NaN. Casting to object first avoids carrying the
# categorical dtype over, and the result is a plain integer column.
trend_codes = {'Increase': 1, 'Decrease': 0}
if merged_df['SP_Trend'].isin(list(trend_codes)).any():
    merged_df['SP_Trend'] = merged_df['SP_Trend'].astype(object).map(trend_codes).astype(int)

features = ['Male Rate', 'Female Rate', 'Teen Rate', 'White Rate', 'Black Rate', 'Asian Rate', 'Hispanic Rate']

# Previous-row value of every feature; rows without a complete set of lagged
# values (at least the first row) are dropped.
for col in features:
    merged_df[f'{col}_lag1'] = merged_df[col].shift(1)

merged_df_lagged = merged_df.dropna(subset=[f'{col}_lag1' for col in features])

# NOTE: this cell rebinds X, y, the train/test split and `model` to the
# lagged data, replacing the non-lagged versions created in earlier cells.
lagged_features = [f'{col}_lag1' for col in features]
X = merged_df_lagged[lagged_features]
y = merged_df_lagged['SP_Trend']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
# Column 1 is the predicted probability of class 1 (Increase).
y_proba = model.predict_proba(X_test)[:, 1]

print("Classification Report:")
print(classification_report(y_test, y_pred))

roc_auc = roc_auc_score(y_test, y_proba)
print(f"AUC: {roc_auc:.2f}")

# Reversing the scores reverses the ranking, so this value is the complement
# of the AUC above (1 - AUC); it is not an independent measurement.
roc_auc_flipped = roc_auc_score(y_test, 1 - y_proba)
print(f"Flipped AUC: {roc_auc_flipped:.2f}")
fpr, tpr, _ = roc_curve(y_test, y_proba)
plt.figure(figsize=(6, 4))
plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')  # chance-level diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - S&P Trend Prediction')
plt.legend(loc='lower right')
plt.grid(True)
plt.tight_layout()
plt.show()

| Metric | Value | Meaning |
| --- | --- | --- |
| AUC | 0.41 | Poor ranking of true class 1 |
| Flipped AUC | 0.59 | Slightly better ranking when 1 and 0 are reversed |
| Precision/Recall for class 1 | 0.74 | Model is biased toward class 1 and gets most right |
| Precision/Recall for class 0 | 0.25 | Model struggles to identify minority class (0) |

This code generates a confusion matrix to evaluate the performance of the lagged version of the Random Forest classifier. It computes the confusion matrix for the predicted (y_pred_lag) and actual (y_test_lag) SP_Trend values, then displays the matrix with labels for "Increase" and "Decrease." The plot uses a blue color map to visualize the matrix, which shows how well the model classified the S&P 500 trend.

In [None]:
# Confusion matrix for the lagged Random Forest on its held-out split.
y_pred_lag = model_lag.predict(X_test_lag)

# Take the label order from the fitted model and pass it to BOTH the matrix
# and the display. A hard-coded display order can disagree with the sorted
# label order confusion_matrix uses by default, which would swap the
# "Increase" / "Decrease" captions.
class_labels = list(model_lag.classes_)

disp = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(y_test_lag, y_pred_lag, labels=class_labels),
    display_labels=class_labels,
)

# Draw on an explicit axes: without `ax`, disp.plot() opens its own figure
# and a separately created plt.figure() is left empty.
fig, ax = plt.subplots(figsize=(5, 4))
disp.plot(cmap="Blues", values_format='d', ax=ax)
ax.set_title("Confusion Matrix - S&P Trend Prediction (with Lags)")
plt.tight_layout()
plt.show()

This code visualizes the feature importances from the lagged Random Forest model by plotting a horizontal bar chart. The bar chart shows the relative importance of each lagged feature in predicting the S&P 500 trend, with the features sorted by importance. The chart helps identify which lagged unemployment rate features contribute most to the model's predictions.

In [None]:
# Importances of the lagged-feature Random Forest, sorted ascending so the
# most important feature ends up at the top of the horizontal bar chart.
importances = pd.Series(model_lag.feature_importances_, index=lagged_features).sort_values()

plt.figure(figsize=(10, 6))
importances.plot(kind='barh', color='skyblue')
# model_lag is indexed by (and was fitted on) the lagged columns only, so the
# title names lagged features rather than "Original + Lagged".
plt.title("Feature Importances - Random Forest (Lagged Features)")
plt.xlabel("Importance")
plt.grid(True)
plt.tight_layout()
plt.show()

This code scales the lagged feature data using StandardScaler, then fits a logistic regression model to predict the S&P 500 trend (y_lagged). It extracts the model's coefficients for each feature and visualizes them in a horizontal bar plot, where the length of each bar indicates the strength and direction of the feature's effect on the outcome. The plot helps interpret which lagged features are most influential in predicting the S&P 500 trend.