In [None]:
import pandas as pd
from scipy.stats import skew
import matplotlib.pyplot as plt
import seaborn as sns

# ---- Load the two raw inputs ------------------------------------------------
stock_data = pd.read_csv('data/stock_data.csv')
unemployment = pd.read_csv('data/SeriesReport.csv')
#print('stock_data (head):')
#print(stock_data.head())

# ---- Stock side: parse the observation date and derive the join keys --------
stock_data['dt'] = pd.to_datetime(stock_data['dt'])
stock_data['Year'] = stock_data['dt'].dt.year
stock_data['Month'] = stock_data['dt'].dt.month

# ---- Unemployment side: reshape from wide to long ---------------------------
# Every column other than 'Year' becomes a (Month, Unemployment Percent) row.
unemployment_unpivot = pd.melt(
    unemployment,
    id_vars='Year',
    var_name='Month',
    value_name='Unemployment Percent',
)

# Month abbreviation -> month number (1-12), so that 'Month' lines up with the
# numeric month extracted from the stock dates. Labels that are not in this
# lookup are turned into NaN by Series.map.
month_replacement = {
    abbreviation: number
    for number, abbreviation in enumerate(
        ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
         'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
        start=1,
    )
}
unemployment_unpivot['Month'] = unemployment_unpivot['Month'].map(month_replacement)

# ---- Combine ----------------------------------------------------------------
# Left join: every stock row is kept; stock rows with no matching
# (Year, Month) in the unemployment table get NaN for 'Unemployment Percent'.
stock_data_final = stock_data.merge(
    unemployment_unpivot,
    how='left',
    on=['Year', 'Month'],
)

# Peek at the first five merged rows. As a bare expression this is only
# displayed when it is the last statement of a notebook cell.
stock_data_final.head()

# Split the merged columns by dtype: object columns are treated as
# categorical, 64-bit integer/float columns as numerical.
# NOTE(review): numeric columns stored with a different width (e.g. int32 or
# float32) match neither filter -- confirm that this is intended.
categorical_cols = list(stock_data_final.select_dtypes(include=['object']).columns)
numerical_cols = list(stock_data_final.select_dtypes(include=['int64', 'float64']).columns)

#print("Categorical columns:", categorical_cols)
#print("Numerical columns:", numerical_cols)

# Skewness of every numerical column, computed column-wise (axis=0) with the
# bias correction enabled (bias=False) and missing values left out
# (nan_policy='omit').
numeric_data = stock_data_final.loc[:, numerical_cols]
skew_array = skew(numeric_data, axis=0, nan_policy='omit', bias=False)

# Re-attach the column names so the report is readable.
skew_value = pd.Series(skew_array, index=numerical_cols)

print("\nSkewness (scipy) for numeric columns:", skew_value, sep="\n")

# Draw one 30-bin histogram per numerical column (missing values dropped),
# each on its own 6x4-inch figure titled and x-labelled with the column name.
for col in numerical_cols:
    fig = plt.figure(figsize=(6, 4))
    plt.hist(stock_data_final[col].dropna(), bins=30)
    plt.title(col)
    plt.xlabel(col)
    plt.ylabel("Frequency")
    plt.tight_layout()
    plt.show()
    # pyplot keeps every figure it creates registered until it is closed.
    # Not every backend destroys the figure in show(), so close it explicitly
    # to avoid accumulating one open figure per column. This is a no-op when
    # the backend has already closed it.
    plt.close(fig)

# Correlation
# Pairwise correlation between all numerical columns (DataFrame.corr with its
# default settings), printed under a header line.
corr_matrix = stock_data_final.loc[:, numerical_cols].corr()
print("\nCorrelation Matrix:", corr_matrix, sep="\n")

# Observation: sp500 and djia, as well as sp500_volume and djia_volume, are highly correlated (correlation > 0.9).