# This is where we will specifically put all code and answers for our second analysis question

## Question: If S&P is increasing or decreasing, which demographic is most closely related to the S&P performance?

In [4]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import random
import matplotlib as plt

ModuleNotFoundError: No module named 'pandas'

In [None]:
browser = webdriver.Chrome()
browser.get("https://www.bls.gov/charts/employment-situation/civilian-unemployment-rate.htm")
browser.maximize_window()
wait = WebDriverWait(browser, 15)
show_table_button = wait.until(
    EC.element_to_be_clickable((By.LINK_TEXT, "Show table"))
)

show_table_button.click()

time.sleep(2)

table = browser.find_element(By.TAG_NAME, "table")
rows = table.find_elements(By.TAG_NAME, "tr")

month_years, total_rates, men_rates, women_rates, teen_rates, white_rates, black_rates, asian_rates, latino_rates = ([] for _ in range(9))

for row in rows[1:]:
    cols = row.find_elements(By.TAG_NAME, "td")
    if len(cols) == 8:
        th = row.find_element(By.TAG_NAME, "th")
        month_year = th.find_element(By.CLASS_NAME, "sub0").text
        month_years.append(month_year)
            total_rates.append(cols[0].text)
        men_rates.append(cols[1].text)
        women_rates.append(cols[2].text)
        teen_rates.append(cols[3].text)
        white_rates.append(cols[4].text)
        black_rates.append(cols[5].text)
        asian_rates.append(cols[6].text)
        latino_rates.append(cols[7].text)

unemployment_df = pd.DataFrame({
    "Date": month_years,
    "Total Rate": total_rates,
    "Male Rate": men_rates,
    "Female Rate": women_rates,
    "Teen Rate": teen_rates,
    "White Rate": white_rates,
    "Black Rate": black_rates,
    "Asian Rate": asian_rates,
    "Hispanic Rate": latino_rates
})

browser.quit()

print(unemployment_df.head(10))

In [None]:
sp500_df = pd.read_csv("spy.csv")
sp500_df.head(10)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# --- Load your data (if not already loaded) ---
# unemployment_df = pd.read_csv('your_unemployment_file.csv')
# sp500_df = pd.read_csv('your_sp500_file.csv')

# --- 1. Convert 'Date' columns to datetime ---
unemployment_df['Date'] = pd.to_datetime(unemployment_df['Date'], format='%Y-%m-%d')
sp500_df['Date'] = pd.to_datetime(sp500_df['Date'], format='%Y-%m-%d')

# --- 2. Clean unemployment rate columns (remove % and convert to float) ---
rate_cols = [col for col in unemployment_df.columns if col != 'Date']

for col in rate_cols:
    unemployment_df[col] = pd.to_numeric(unemployment_df[col].astype(str).str.replace('%', ''), errors='coerce')

# --- 3. Merge unemployment and S&P 500 data on 'Date' ---
merged_df = pd.merge(unemployment_df, sp500_df[['Date', 'Close']], on='Date')

# --- 4. Sort by date and create S&P trend direction ---
merged_df.sort_values('Date', inplace=True)
merged_df['SP_Trend'] = merged_df['Close'].diff().apply(lambda x: 'Increase' if x > 0 else 'Decrease')

In [None]:

plt.figure(figsize=(12, 6))
sns.lineplot(data=merged_df, x='Date', y='Close', label='S&P 500')
sns.lineplot(data=merged_df, x='Date', y='Female Rate', label='Female Unemployment')
sns.lineplot(data=merged_df, x='Date', y='Male Rate', label='Male Unemployment')
sns.lineplot(data=merged_df, x='Date', y='Teen Rate', label='Teen Unemployment')
sns.lineplot(data=merged_df, x='Date', y='White Rate', label='White Unemployment')
sns.lineplot(data=merged_df, x='Date', y='Black Rate', label='Black Unemployment')
sns.lineplot(data=merged_df, x='Date', y='Asian Rate', label='Asian Unemployment')
sns.lineplot(data=merged_df, x='Date', y='Hispanic Rate', label='Hispanic Unemployment')
plt.title('S&P 500 vs Unemployment Rates Over Time')
plt.legend()
plt.show()

In [None]:
features = ['Male Rate', 'Female Rate', 'Teen Rate', 'White Rate', 'Black Rate', 'Asian Rate', 'Hispanic Rate']
X = merged_df[features]
y = merged_df['SP_Trend']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# Feature importance
importances = pd.Series(model.feature_importances_, index=features).sort_values(ascending=False)
importances

In [None]:
rate_cols = [col for col in unemployment_df.columns if col != 'Date']
correlations = merged_df[rate_cols + ['Close']].corr()['Close'].drop('Close')

# Create bar plot
plt.figure(figsize=(10, 6))
sns.barplot(x=correlations.index, y=correlations.values, palette='viridis')
plt.title('Correlation of Unemployment Rates with S&P 500 Close')
plt.ylabel('Correlation Coefficient')
plt.xticks(rotation=45)
plt.grid(True, linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
from scipy.stats import chi2_contingency

# Create new columns based on whether the rate exceeds the median for each demographic group
merged_df['High_Teen_Unemployment'] = merged_df['Teen Rate'] > merged_df['Teen Rate'].median()
merged_df['High_Male_Unemployment'] = merged_df['Male Rate'] > merged_df['Male Rate'].median()
merged_df['High_Female_Unemployment'] = merged_df['Female Rate'] > merged_df['Female Rate'].median()
merged_df['High_White_Unemployment'] = merged_df['White Rate'] > merged_df['White Rate'].median()
merged_df['High_Black_Unemployment'] = merged_df['Black Rate'] > merged_df['Black Rate'].median()
merged_df['High_Asian_Unemployment'] = merged_df['Asian Rate'] > merged_df['Asian Rate'].median()
merged_df['High_Hispanic_Unemployment'] = merged_df['Hispanic Rate'] > merged_df['Hispanic Rate'].median()

# Ensure that 'SP_Trend' is categorical
merged_df['SP_Trend'] = merged_df['SP_Trend'].astype('category')

# List of demographic columns and corresponding "High" Unemployment columns
demographic_columns = [
    ('Teen', 'High_Teen_Unemployment'),
    ('Male', 'High_Male_Unemployment'),
    ('Female', 'High_Female_Unemployment'),
    ('White', 'High_White_Unemployment'),
    ('Black', 'High_Black_Unemployment'),
    ('Asian', 'High_Asian_Unemployment'),
    ('Hispanic', 'High_Hispanic_Unemployment')
]
# Perform the chi-square test for each rate and print the results
for rate, high_column in demographic_columns:
    contingency = pd.crosstab(merged_df['SP_Trend'], merged_df[high_column])
    chi2, p, dof, expected = chi2_contingency(contingency)
    
    print(f"Results for {rate} Unemployment:")
    print(f"Chi-square statistic: {chi2}")
    print(f"P-value: {p}")
    print(f"Degrees of freedom: {dof}")
    print(f"Expected frequencies:\n{expected}\n")

In [None]:
for col in features:
    merged_df[f'{col}_lag1'] = merged_df[col].shift(1)

# Drop rows with NaNs introduced by lagging
merged_df_lagged = merged_df.dropna(subset=[f'{col}_lag1' for col in features])


In [None]:
# run new classificaiton 

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
lagged_features = [f'{col}_lag1' for col in features]
X_lag = merged_df_lagged[lagged_features]
y_lag = merged_df_lagged['SP_Trend']

#set test at 20%
X_train_lag, X_test_lag, y_train_lag, y_test_lag = train_test_split(X_lag, y_lag, test_size=0.2, random_state=42)

#NEED TO CHECK!!
#to predict continuous values, may need to use regression model (like RandomForestRegressor) instead of a classifier
#not confident tho
model_lag = RandomForestClassifier(random_state=42)
model_lag.fit(X_train_lag, y_train_lag)

y_pred_lag = model_lag.predict(X_test_lag)
print(classification_report(y_test_lag, y_pred_lag))

# Feature importances for lagged features
importances_lag = pd.Series(model_lag.feature_importances_, index=lagged_features).sort_values(ascending=False)
print(importances_lag)


In [None]:
# create a continuous target 

# Calculate S&P percent change and drop missing values
merged_df['SP_Change'] = merged_df['Close'].pct_change()
merged_df = merged_df.dropna(subset=['SP_Change'])


In [None]:
from sklearn.linear_model import LogisticRegression

log_model = LogisticRegression()
log_model.fit(X_train, y_train)
print("Logistic Coefficients:\n", pd.Series(log_model.coef_[0], index=features))


In [None]:
# ROC Curve (for binary classificaiton) since we are predicting an increase or decrease

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay

# Encode target to binary
le = LabelEncoder()
y_bin = le.fit_transform(y)  # 'Decrease'=0, 'Increase'=1

# Split and fit model again with encoded target
X_train_bin, X_test_bin, y_train_bin, y_test_bin = train_test_split(X, y_bin, test_size=0.2, random_state=42)
model_bin = RandomForestClassifier(random_state=42)
model_bin.fit(X_train_bin, y_train_bin)
y_proba = model_bin.predict_proba(X_test_bin)[:, 1]  # Probability of 'Increase'

# ROC curve
fpr, tpr, thresholds = roc_curve(y_test_bin, y_proba)
plt.figure(figsize=(6, 4))
plt.plot(fpr, tpr, label=f"AUC = {roc_auc_score(y_test_bin, y_proba):.2f}")
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - Predicting S&P Trend")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
print(merged_df['SP_Trend'].value_counts())

In [None]:
from sklearn.metrics import roc_auc_score

# Original predicted probabilities
y_proba = model.predict_proba(X_test)[:, 1]

# Flipped probabilities
y_proba_flipped = 1 - y_proba

# Recompute AUC for flipped predictions
roc_auc_flipped = roc_auc_score(y_test, y_proba_flipped)
print(f"Flipped AUC: {roc_auc_flipped:.2f}")


In [None]:
# print new AUC gra
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# --- Step 1: Encode the target variable as binary ---
# 'Increase' = 1 (positive class), 'Decrease' = 0
merged_df['SP_Trend'] = merged_df['SP_Trend'].map({'Increase': 1, 'Decrease': 0})

# --- Step 2: Select features and optionally create lagged features ---
features = ['Male Rate', 'Female Rate', 'Teen Rate', 'White Rate', 'Black Rate', 'Asian Rate', 'Hispanic Rate']

# Create lagged features
for col in features:
    merged_df[f'{col}_lag1'] = merged_df[col].shift(1)

# Drop rows with NaNs introduced by lagging
merged_df_lagged = merged_df.dropna(subset=[f'{col}_lag1' for col in features])

# Define feature set and target
lagged_features = [f'{col}_lag1' for col in features]
X = merged_df_lagged[lagged_features]
y = merged_df_lagged['SP_Trend']
# --- Step 3: Train/test split ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Step 4: Train Random Forest Classifier ---
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# --- Step 5: Predict and Evaluate ---
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

print("Classification Report:")
print(classification_report(y_test, y_pred))

# --- Step 6: AUC and ROC Curve ---
roc_auc = roc_auc_score(y_test, y_proba)
print(f"AUC: {roc_auc:.2f}")

# Check flipped AUC
roc_auc_flipped = roc_auc_score(y_test, 1 - y_proba)
print(f"Flipped AUC: {roc_auc_flipped:.2f}")

# Plot ROC curve
fpr, tpr, _ = roc_curve(y_test, y_proba)
plt.figure(figsize=(6, 4))
plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - S&P Trend Prediction')
plt.legend(loc='lower right')
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Explanation of the Change 

Metric:                         Value:	Meaning:
AUC  	                        0.41	Poor ranking of true class 1
Flipped AUC	                    0.59	Slightly better ranking when 1 and 0 are reversed
Precision/Recall for class 1	0.74	Model is biased toward class 1 and gets most right
Precision/Recall for class 0	0.25	Model struggles to identify minority class (0)

In [None]:
# confusion matrix

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

y_pred_lag = model_lag.predict(X_test_lag)

disp = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(y_test_lag, y_pred_lag),
    display_labels=['Increase', 'Decrease']  
)


plt.figure(figsize=(5, 4))
disp.plot(cmap="Blues", values_format='d')
plt.title("Confusion Matrix - S&P Trend Prediction (with Lags)")
plt.tight_layout()
plt.show()



In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Plot feature importances
importances = pd.Series(model_lag.feature_importances_, index=lagged_features).sort_values()

plt.figure(figsize=(10, 6))
importances.plot(kind='barh', color='skyblue')
plt.title("Feature Importances - Random Forest (Original + Lagged)")
plt.xlabel("Importance")
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_lagged)

# Fit logistic regression
log_reg = LogisticRegression()
log_reg.fit(X_scaled, y_lagged)

# Coefficients and direction of influence
coef_df = pd.DataFrame({
    'Feature': lagged_features,
    'Coefficient': log_reg.coef_[0]
}).sort_values(by='Coefficient')

plt.figure(figsize=(10, 6))
sns.barplot(x='Coefficient', y='Feature', data=coef_df, palette='coolwarm')
plt.title("Logistic Regression Coefficients (S&P Trend Prediction)")
plt.grid(True)
plt.tight_layout()
plt.show()