# Statistical Analysis and Hypothesis Testing

In [1]:
import pandas as pd
from prophet import Prophet
import matplotlib.pyplot as plt 

# Read the merged price table from disk; the preview printed below shows the
# columns Date, OIL, GLD and SLV.
file_path = '/Users/mishka/Documents/GitHub/Final-Project/Clean Data/merged_data.csv'  # Update the path to where your file is located
merged_data = pd.read_csv(filepath_or_buffer=file_path)

# Print the top five rows as a quick check that the file was parsed as expected
print(merged_data.head(n=5))

  from .autonotebook import tqdm as notebook_tqdm


         Date    OIL    GLD     SLV
0  2008-01-02  97.01  84.86  15.180
1  2008-01-03  98.45  85.57  15.285
2  2008-01-04  96.87  85.13  15.167
3  2008-01-07  94.19  84.77  15.053
4  2008-01-08  96.37  86.78  15.590


In [15]:
from scipy.stats import pearsonr

# Pearson correlation (coefficient and p-value) of the gold price level with
# the oil price level, reported as soon as it has been computed.
corr_gold_oil, p_value_gold_oil = pearsonr(merged_data['GLD'], merged_data['OIL'])
print(f"Correlation between Gold and Oil: {corr_gold_oil} (p-value: {p_value_gold_oil})")

# Same statistic for the gold price level against the silver price level.
corr_gold_silver, p_value_gold_silver = pearsonr(merged_data['GLD'], merged_data['SLV'])
print(f"Correlation between Gold and Silver: {corr_gold_silver} (p-value: {p_value_gold_silver})")

Correlation between Gold and Oil: 0.528654262621428 (p-value: 9.96652676612479e-162)
Correlation between Gold and Silver: 0.8679038361872119 (p-value: 0.0)


### 1. Hypothesis on the volatility of gold and oil:

***Hypothesis:*** Gold prices tend to rise when oil price volatility is high, making gold an effective hedge against uncertainty in commodity markets.


***Explanation:*** In times of high volatility on the oil markets, investors may seek safety in gold, which increases the price in such times.

In [11]:
import pandas as pd
from scipy.stats import pearsonr

# Volatility proxy: absolute day-over-day percentage change of the oil price.
# The result is stored as a new column on merged_data itself.
# NOTE(review): the proxy is correlated below with the gold price *level*, not
# with gold returns -- confirm this matches the hypothesis being tested.
merged_data['OIL_volatility'] = abs(merged_data['OIL'].pct_change())

# Keep only the rows where both relevant columns are present (the percentage
# change has no value for the first row).
valid_data = merged_data.loc[:, ['OIL_volatility', 'GLD']].dropna(how='any')

# Pearson correlation between the oil volatility proxy and the gold price
corr_oil_volatility_gold, p_value_oil_volatility_gold = pearsonr(
    valid_data['OIL_volatility'],
    valid_data['GLD'],
)

print(f"Correlation between the volatility of oil prices and the gold price: {corr_oil_volatility_gold}, P-Wert: {p_value_oil_volatility_gold}")


Correlation between the volatility of oil prices and the gold price: -0.2641192956683762, P-Wert: 4.259460422972695e-37


***Results:*** The correlation between oil price volatility (the absolute daily percentage change of the oil price) and the gold price is -0.2641 with a very low P-value (4.259460422972695e-37).

This indicates a statistically significant but weak negative correlation: gold prices tended to be lower, not higher, on days with large oil price moves, so gold does not necessarily increase in value during periods of high oil price volatility.

***Conclusion:*** The hypothesis is not supported by this test, as the negative correlation suggests that gold may not be suitable as a hedge against market volatility in all cases.

### 2. Hypothesis on the relationship between commodity prices:

***Hypothesis:*** Sharp rises in oil prices have a negative impact on the prices of other commodities such as gold and silver.


***Explanation:*** If oil prices rise, the prices of other commodities such as gold and silver could also be affected, either through direct market dynamics or through shifted investments.

In [12]:
from scipy.stats import linregress

# Simple linear regression with the oil price as regressor and the gold price
# as response; the fit is reported as soon as it is available.
slope_gold, intercept_gold, r_value_gold, p_value_gold, std_err_gold = linregress(merged_data['OIL'], merged_data['GLD'])
print(f"Oil and Gold - Slope: {slope_gold}, P-Value: {p_value_gold}, R-squared: {r_value_gold**2}")

# Same regression with the silver price as response.
slope_silver, intercept_silver, r_value_silver, p_value_silver, std_err_silver = linregress(merged_data['OIL'], merged_data['SLV'])
print(f"Oil and Silver - Slope: {slope_silver}, P-Value: {p_value_silver}, R-squared: {r_value_silver**2}")

Oil and Gold - Slope: 0.4412514652246467, P-Value: 9.966526766126306e-162, R-squared: 0.2794753293878058
Oil and Silver - Slope: 0.18069815409612916, P-Value: 0.0, R-squared: 0.5083095274982847


***Results:*** The slope between oil and gold is 0.4413 with a P-value of 9.966526766126306e-162 and an R-squared of 0.2795. 

The slope between oil and silver is 0.1807 with a P-value of 0.0 and an R-squared of 0.5083. Both relationships are significant, with the relationship with silver being stronger.


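***Conclusion:*** These results do not support the hypothesis of a negative impact: both slopes are positive, so higher oil prices are associated with higher, not lower, gold and silver prices. The association is stronger for silver (R-squared 0.5083) than for gold (R-squared 0.2795).

It is possible that rising oil prices also have an impact on other markets, although a regression on price levels alone does not establish causation.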

### 3. Hypothesis for predicting future price movements:

***Hypothesis:*** Increases in the prices of commodities such as oil and gold are leading indicators of future increases in other commodities, particularly silver.


***Explanation:*** Rising oil and gold prices could predict future price changes in silver, as these commodities are often traded in similar market cycles.

In [14]:
import pandas as pd
from scipy.stats import pearsonr

# Time lag analysis: shift the oil and gold prices down by LAG_ROWS rows so
# that each row pairs an earlier oil/gold price with the current silver price.
# shift() moves values by row position, not by calendar time. The data preview
# skips a weekend (2008-01-04 -> 2008-01-07), so rows appear to be trading
# days: 30 rows is then roughly six calendar weeks rather than one month.
LAG_ROWS = 30
merged_data['OIL_shifted'] = merged_data['OIL'].shift(LAG_ROWS)
merged_data['GLD_shifted'] = merged_data['GLD'].shift(LAG_ROWS)

# The first LAG_ROWS rows have no lagged value; drop every row that is missing
# any of the three columns so the compared series have the same length.
aligned_data = merged_data.dropna(subset=['OIL_shifted', 'GLD_shifted', 'SLV'])

# Pearson correlation of the lagged oil and gold prices with current silver prices
corr_oil_silver_lagged, p_value_oil_silver_lagged = pearsonr(aligned_data['OIL_shifted'], aligned_data['SLV'])
corr_gold_silver_lagged, p_value_gold_silver_lagged = pearsonr(aligned_data['GLD_shifted'], aligned_data['SLV'])

print(f"Correlation between lagged oil prices and silver prices: {corr_oil_silver_lagged}, P-Value: {p_value_oil_silver_lagged}")
print(f"Correlation between lagged gold prices and silver prices: {corr_gold_silver_lagged}, P-Value: {p_value_gold_silver_lagged}")


Correlation between lagged oil prices and silver prices: 0.6375447906479285, P-Value: 9.355098691545608e-253
Correlation between lagged gold prices and silver prices: 0.8004808712104129, P-Value: 0.0


***Results:*** The correlation between lagged oil prices and silver prices is 0.6375 with a very low P-value (9.355098691545608e-253). The correlation between lagged gold prices and silver prices is 0.8005 with a P-value of 0.0. These strong positive correlations suggest that price changes in gold and oil may well serve as leading indicators.


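***Conclusion:*** These results are consistent with the hypothesis, but they do not by themselves show that oil and gold lead silver: the lagged correlations (0.64 and 0.80) are lower than the same-day correlations (about 0.71 for oil and silver, 0.87 for gold and silver), as would be expected when price levels are highly persistent.

In particular, the high correlation between lagged gold prices and current silver prices suggests that changes in commodity prices may be usable as indicators of future market developments; this should be confirmed on price changes (returns) rather than price levels.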

# Multivariate Regression:

In [4]:
import statsmodels.api as sm
import pandas as pd

# Re-read the merged price table from disk; this rebinds merged_data, so any
# columns added to it by earlier cells are no longer present.
file_path = '/Users/mishka/Documents/GitHub/Final-Project/Clean Data/merged_data.csv'
merged_data = pd.read_csv(file_path)

# Response variable: silver price.
y = merged_data['SLV']

# Regressors: oil and gold prices, with a constant column added so the model
# has an intercept.
X = sm.add_constant(merged_data[['OIL', 'GLD']])

# Ordinary least squares fit of silver on oil and gold
model = sm.OLS(endog=y, exog=X).fit()

# Show the full regression summary table
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:                    SLV   R-squared:                       0.843
Model:                            OLS   Adj. R-squared:                  0.843
Method:                 Least Squares   F-statistic:                     6009.
Date:                Thu, 22 Aug 2024   Prob (F-statistic):               0.00
Time:                        22:46:52   Log-Likelihood:                -5492.2
No. Observations:                2243   AIC:                         1.099e+04
Df Residuals:                    2240   BIC:                         1.101e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        -12.5509      0.318    -39.517      0.0