# Code and answers for our first analysis question

## Question: How do fluctuations in unemployment rates across various demographic groups correlate with the performance of the S&P 500 index?

In [None]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import random
import matplotlib as plt

In [None]:
# Scrape the BLS civilian unemployment-rate chart page and collect the rows of
# its data table into `unemployment_df` (one row per table row, one column per
# rate series).
browser = webdriver.Chrome()
try:
    browser.get("https://www.bls.gov/charts/employment-situation/civilian-unemployment-rate.htm")
    browser.maximize_window()

    # The data table only becomes usable after the "Show table" link is clicked.
    wait = WebDriverWait(browser, 15)
    show_table_button = wait.until(
        EC.element_to_be_clickable((By.LINK_TEXT, "Show table"))
    )
    show_table_button.click()

    # Give the page a moment to render the table after the click.
    time.sleep(2)

    table = browser.find_element(By.TAG_NAME, "table")
    rows = table.find_elements(By.TAG_NAME, "tr")

    month_years, total_rates, men_rates, women_rates, teen_rates, white_rates, black_rates, asian_rates, latino_rates = ([] for _ in range(9))

    # rows[0] is skipped (treated as the header row); only rows with exactly
    # eight data cells are kept.
    # NOTE(review): the cell order below (total, men, women, teen, white,
    # black, asian, hispanic) is assumed to match the page - confirm.
    for row in rows[1:]:
        cols = row.find_elements(By.TAG_NAME, "td")
        if len(cols) == 8:
            # The row label is read from the "sub0" element inside the <th>.
            th = row.find_element(By.TAG_NAME, "th")
            month_year = th.find_element(By.CLASS_NAME, "sub0").text
            month_years.append(month_year)
            total_rates.append(cols[0].text)
            men_rates.append(cols[1].text)
            women_rates.append(cols[2].text)
            teen_rates.append(cols[3].text)
            white_rates.append(cols[4].text)
            black_rates.append(cols[5].text)
            asian_rates.append(cols[6].text)
            latino_rates.append(cols[7].text)
finally:
    # Always shut the browser down, even if a wait times out or an element is
    # missing, so a failed scrape does not leave a Chrome process behind.
    browser.quit()

# All values are still strings at this point; they are converted later.
unemployment_df = pd.DataFrame({
    "Date": month_years,
    "Total Rate": total_rates,
    "Male Rate": men_rates,
    "Female Rate": women_rates,
    "Teen Rate": teen_rates,
    "White Rate": white_rates,
    "Black Rate": black_rates,
    "Asian Rate": asian_rates,
    "Hispanic Rate": latino_rates
})

print(unemployment_df.head(10))

In [None]:
# Load the S&P 500 price data from the local file "spy.csv" (path is relative
# to the notebook's working directory).
sp500_df = pd.read_csv("spy.csv")
# Preview the first 10 rows.
sp500_df.head(10)

In [None]:
# Clean the scraped unemployment table: normalise the date label to
# "Mon YYYY", turn every rate column from text into numbers, and add the
# relative change of the total rate versus the previous row.
unemployment_df["Date"] = pd.to_datetime(unemployment_df["Date"]).dt.strftime('%b %Y')

rate_columns = [
    "Total Rate",
    "Male Rate",
    "Female Rate",
    "Teen Rate",
    "White Rate",
    "Black Rate",
    "Asian Rate",
    "Hispanic Rate",
]
for rate_column in rate_columns:
    # Unparseable cells become NaN instead of raising.
    unemployment_df[rate_column] = pd.to_numeric(unemployment_df[rate_column], errors='coerce')

unemployment_df.dtypes

total_rate_change = unemployment_df['Total Rate'].pct_change().round(4)
unemployment_df['Unemployment Change'] = total_rate_change
# The first row has no predecessor, so its change is defined as 0.
unemployment_df.loc[0, 'Unemployment Change'] = 0

unemployment_df.head(10)

In [None]:
# Reduce the S&P 500 data to one row per month so it can be joined with the
# monthly unemployment table on a "Mon YYYY" date label.
sp500_df.dtypes
sp500_df["Date"] = pd.to_datetime(sp500_df["Date"])
sp500_df.dtypes

# Keep the first available row of every calendar month from 2005-03 onward.
# Filtering on `Day == 1` (assuming `Day` is the day of the month) drops every
# month whose 1st is not a trading day - weekends and holidays, including
# every January - which leaves gaps in the merged data and makes
# "Close Change" span several months at a time.
sp500_df = sp500_df[sp500_df["Date"] >= "2005-03-01"].sort_values("Date")
sp500_df = sp500_df.groupby(sp500_df["Date"].dt.to_period("M")).head(1)
sp500_df = sp500_df.reset_index(drop=True)

# Use the same "Mon YYYY" label as the unemployment table.
sp500_df["Date"] = sp500_df["Date"].dt.strftime('%b %Y')
sp500_df = sp500_df.drop(columns=['Day', 'Weekday', 'Week', 'Month', 'Year'])

# Relative change of the close versus the previous retained row; the first
# row has no predecessor, so its change is defined as 0.
sp500_df["Close Change"] = sp500_df["Close"].pct_change().round(4)
sp500_df.loc[0, "Close Change"] = 0
sp500_df.head(10)

In [None]:
import pandas as pd

# Join the S&P 500 close with the unemployment table on the shared "Date" label.
merged_data = pd.merge(sp500_df[['Date', 'Close']], unemployment_df, on='Date')


def _univariate_summary(series):
    """Return mean, median, spread, range, and shape statistics of a Series."""
    return {
        'mean': series.mean(),
        'median': series.median(),
        'std_dev': series.std(),
        'min': series.min(),
        'max': series.max(),
        'skewness': series.skew(),
        'kurtosis': series.kurt(),
    }


# Calculate univariate statistics for each demographic group and S&P 500 Closing Price
demographics = ['Total Rate', 'Male Rate', 'Female Rate', 'White Rate', 'Black Rate', 'Asian Rate', 'Hispanic Rate']
univariate_stats = {
    demographic: _univariate_summary(merged_data[demographic])
    for demographic in demographics
}

# Univariate statistics for S&P 500 Closing Price
# (the original dict literal here was never closed, which made the whole cell
# a SyntaxError).
sp500_stats = _univariate_summary(merged_data['Close'])

univariate_stats['S&P 500'] = sp500_stats
univariate_stats


In [None]:
# Correlate the S&P 500 closing price with every demographic unemployment
# rate (Series.corr, default method) and collect the coefficients by group.
close_prices = merged_data['Close']
correlations = {
    group: close_prices.corr(merged_data[group])
    for group in demographics
}

# Report one coefficient per demographic group.
print("Correlation with S&P 500 Closing Price:")
for group_name, coefficient in correlations.items():
    print(f"{group_name}: {coefficient}")


In [None]:
import statsmodels.api as sm

# Fit one simple OLS model per demographic group: S&P 500 close regressed on
# that group's unemployment rate plus an intercept.
response = merged_data['Close']
regression_results = {}
for demographic in demographics:
    # add_constant supplies the intercept column.
    design_matrix = sm.add_constant(merged_data[demographic])
    fitted = sm.OLS(response, design_matrix).fit()

    regression_results[demographic] = {
        'coefficients': fitted.params,
        'p_values': fitted.pvalues,
        'r_squared': fitted.rsquared,
    }

# Show coefficients, p-values and R-squared for every fitted model.
print("Regression Results:")
for demographic, fit_summary in regression_results.items():
    print(f"\n{demographic}:")
    print(f"  Coefficients: {fit_summary['coefficients']}")
    print(f"  p-values: {fit_summary['p_values']}")
    print(f"  R-squared: {fit_summary['r_squared']}")


In [None]:
# Hypothesis test for unemployment in Men

import pandas as pd
from scipy.stats import pearsonr

# Merging unemployment and SP500 data on Date/Month
merged_df = pd.merge(unemployment_df, sp500_df, on='Date')

# H0: there is no linear correlation between the male unemployment rate and
# the S&P 500 High.
# A two-sample t-test is the wrong tool here: it compares the *means* of two
# series measured in different units (percent vs. index price), so it is
# trivially "significant" and says nothing about how the series move
# together. A correlation test answers the question actually being asked.
# Rows with missing values are dropped first because the earlier numeric
# conversion may have produced NaN, which pearsonr cannot handle.
paired = merged_df[["Male Rate", "High"]].dropna()
unemployment = paired["Male Rate"]
sp500_high = paired["High"]

# Pearson correlation test
corr_coeff, p_value = pearsonr(unemployment, sp500_high)

# Displaying the results
print(f"Correlation Coefficient: {corr_coeff:.4f}")
print(f"P-Value: {p_value:.4f}")

# Interpretation
alpha = 0.05
if p_value < alpha:
    print("Reject H0: Significant correlation exists between male unemployment and SP500 Highs.")
else:
    print("Fail to reject H0: No significant correlation found.")


In [None]:
# First Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Scatter of male unemployment rate against S&P 500 High with a fitted
# regression line (no confidence band).
point_style = {"color": "pink"}
trend_style = {"color": "red"}

plt.figure(figsize=(8, 6))
sns.regplot(
    data=merged_df,
    x="Male Rate",
    y="High",
    ci=None,
    scatter_kws=point_style,
    line_kws=trend_style,
)
plt.title("Relationship between Male Unemployment Rate and S&P 500 High")
plt.xlabel("Male Unemployment Rate (%)")
plt.ylabel("S&P 500 High Price")
plt.grid(True)
plt.show()

In [None]:
# Hypothesis test for unemployment in women
from scipy.stats import pearsonr

# Test for a linear correlation between the female unemployment rate and the
# S&P 500 High.
female_rate = merged_df["Female Rate"]
high_price = merged_df["High"]
corr_coeff, p_value = pearsonr(female_rate, high_price)

print(f"Correlation Coefficient: {corr_coeff:.4f}")
print(f"P-Value: {p_value:.4f}")

# Decide at the 5% significance level.
alpha = 0.05
verdict = (
    "Reject H0: Significant correlation exists."
    if p_value < alpha
    else "Fail to reject H0: No significant correlation."
)
print(verdict)

In [None]:
# Second visualization
# Female unemployment rate and S&P 500 High over time, drawn on two y-axes
# that share the same x-axis.
fig, ax1 = plt.subplots(figsize=(12,6))

# Plot Unemployment (Female Rate)
color = 'tab:blue'
ax1.set_xlabel('Date')
# The axis label must name the series actually plotted (Female Rate, not Male).
ax1.set_ylabel('Female Unemployment Rate (%)', color=color)
female_line = ax1.plot(merged_df["Date"], merged_df["Female Rate"], color=color, label="Female Unemployment Rate")
ax1.tick_params(axis='y', labelcolor=color)

# Create second y-axis for SP500
ax2 = ax1.twinx()
color = 'tab:red'
ax2.set_ylabel('S&P 500 High Price', color=color)
sp500_line = ax2.plot(merged_df["Date"], merged_df["High"], color=color, label="S&P 500 High")
ax2.tick_params(axis='y', labelcolor=color)

# The two lines live on different axes, so build one combined legend;
# without this the `label=` values above are never shown.
legend_lines = female_line + sp500_line
ax1.legend(legend_lines, [line.get_label() for line in legend_lines], loc='upper left')

plt.title('Female Unemployment Rate and S&P 500 High Over Time')
fig.tight_layout()
plt.show()

In [None]:
# Binary target for classification: 1 when "Close Change" is strictly
# positive, 0 otherwise (flat or down).
close_went_up = merged_df["Close Change"].gt(0)
merged_df["SP500_Up"] = close_went_up.astype(int)

In [None]:
import statsmodels.api as sm

# Logistic regression of the up/down flag on the demographic unemployment
# rates, with an intercept.
predictor_columns = [
    "Male Rate",
    "Female Rate",
    "Teen Rate",
    "White Rate",
    "Black Rate",
    "Asian Rate",
    "Hispanic Rate",
]
target = merged_df["SP500_Up"]
# add_constant supplies the intercept column.
features = sm.add_constant(merged_df[predictor_columns])

# Fit the model and print the full statsmodels summary table.
logit_model = sm.Logit(target, features)
result = logit_model.fit()

print(result.summary())