# What is the relationship between the opening price and volume concerning the closing price of a stock on a daily basis?

## Dataset- Big Tech Stock Prices

By- Likith Kumar Dundigalla

#### To ignore warnings 

In [263]:
import warnings
# Suppress every warning for the rest of the session to keep cell output clean.
# NOTE(review): this also hides deprecation notices from pandas / scikit-learn,
# so API changes in newer library versions will not be announced.
warnings.filterwarnings("ignore")  

#### Import all required libraries

In [264]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.model_selection import KFold
import statsmodels.api as sm
from sklearn.utils import resample
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

### Data selection and exploration & Data preprocessing

#### Loading dataset into dataframe through Url

In [None]:
# Location of the CSV file in the TidyTuesday GitHub repository.
data_url = (
    'https://raw.githubusercontent.com/rfordatascience/tidytuesday/'
    'master/data/2023/2023-02-07/big_tech_stock_prices.csv'
)

# Download the file and parse it into a DataFrame.
data = pd.read_csv(data_url)

In [None]:
# Concise summary of the DataFrame: prints each column's dtype and non-null count.
data.info()

In [None]:
# Descriptive statistics (count, mean, standard deviation, minimum, maximum and
# quartile values) for each numerical column.
data.describe()

In [None]:
# Count of missing (null / NaN) values in each column of the DataFrame.
data.isnull().sum()

In [None]:
# First 5 records of the data (head() returns five rows by default).
data.head()

In [None]:
# Box plot of the dataset, used to eyeball the spread and outliers of its columns.
# NOTE(review): all columns are drawn against one shared axis, so a column with a
# much larger scale (presumably `volume`) may dwarf the others — confirm.
sns.boxplot(data)

In [None]:
# Pairwise scatter plots between the numeric columns in the dataset.
sns.pairplot(data,kind='scatter')

In [None]:
# Correlation between each pair of numeric columns in the dataset.
# The numeric columns are selected explicitly: since pandas 2.0, DataFrame.corr()
# no longer drops non-numeric columns on its own and raises if the frame contains
# any (for example string-typed identifier or date columns). On older pandas
# versions the result is the same as the plain data.corr() call.
correlation_matrix = data.select_dtypes(include='number').corr()
correlation_matrix

#### Declaring the features and the target values and separating them into train and test data

In [None]:
# Predictors: the opening price and the traded volume.
feature_columns = ['open', 'volume']
# Response: the closing price.
target_column = 'close'

X = data[feature_columns]
y = data[target_column]

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Bootstrap resampling of the training split: rows are drawn with replacement
# (replace=True) under a fixed seed, and X and y are resampled together so the
# feature rows stay aligned with their targets.
# NOTE(review): the target is continuous, so this is not a fix for class
# imbalance; the resampled set contains duplicated rows — confirm it is intended.
X_train_resampled, y_train_resampled = resample(X_train, y_train, replace=True, random_state=42)

### Ordinary Least Squares(OLS) Regression

In [None]:
# Model building with statsmodels.
# statsmodels' OLS does not add an intercept by itself, so a constant column is
# added to the (resampled) training features first.
X_train_sm = sm.add_constant(X_train_resampled)  

# Fit ordinary least squares of the closing price on [const, open, volume]
ols_model = sm.OLS(y_train_resampled, X_train_sm).fit()

# Evaluate model performance
# The test features need the same constant column as the training design matrix
X_test_sm = sm.add_constant(X_test) 

# Predict closing prices for the held-out test set
y_pred_sm = ols_model.predict(X_test_sm)

# Model diagnostics
# Display the model summary with regression diagnostics (last expression of the cell)
ols_model.summary()  

### Inferences:

- The model seems to fit the data exceptionally well, with a very high R-squared value (close to 1), suggesting that the independent variables 'open' and 'volume' explain a significant portion of the variance in the 'close' variable.
- Both 'open' and 'volume' have coefficients suggesting a positive relationship with the 'close' variable. However, 'volume' has a p-value of 0.038: it is significant at the 5% level but not at the 1% level, so the evidence for its effect is much weaker than for 'open'.
- The diagnostic metrics provide insights into potential issues like multicollinearity or deviation from normality assumptions in the model's residuals.

Overall, it seems like a highly predictive model with strong explanatory power, but it's essential to consider the significance of each variable and the model's assumptions.

### Assumption Checks

In [None]:
# Residuals of the statsmodels OLS model on the held-out test set
residuals = y_test - y_pred_sm
# Independence: Durbin-Watson test for autocorrelation in the residuals.
# NOTE(review): the statistic depends on the order of the rows; the test rows
# come from train_test_split, which shuffles by default, so they are presumably
# not in time order — confirm this is the intended check.
dw_test = sm.stats.stattools.durbin_watson(residuals)
print(f"Durbin-Watson test statistic: {dw_test}")
# Values around 2 indicate no autocorrelation

# Homoscedasticity: check for constant variance of the residuals
# using a scatter plot of the residuals against the fitted (predicted) values.
# A random band of roughly constant width supports the assumption.
plt.scatter(y_pred_sm, residuals)
plt.xlabel('Fitted values')
plt.ylabel('Residuals')
plt.title('Residuals vs Fitted values')
plt.show()

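The scatter plot shows the residuals against the fitted values of the model and is used to check whether the residuals have constant variance (homoscedasticity); autocorrelation is assessed with the Durbin-Watson statistic printed above.

Based on the Durbin-Watson statistic, there is no autocorrelation present in the model's residuals.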

In [None]:
# Normality: Q-Q plot of the test-set residuals against normal quantiles.
# line='s' adds a standardized reference line; points that stray from it
# indicate departures from normality.
sm.qqplot(residuals, line='s')
plt.title('Q-Q plot of residuals')
plt.show()

The Q-Q plot indicates that the residuals are not normally distributed: they deviate from the reference line. This violates the normality assumption behind the OLS p-values and confidence intervals, and it suggests that the statistical model may not be capturing all of the important relationships in the data.

In [None]:
# Hold-out metrics of the statsmodels OLS model on the test set.
OLS_model_R_squared = r2_score(y_test, y_pred_sm)
# RMSE is taken as the square root of the MSE. The former
# mean_squared_error(..., squared=False) shortcut is deprecated since
# scikit-learn 1.4 and removed in 1.6, where it raises a TypeError.
OLS_model_RMSE = np.sqrt(mean_squared_error(y_test, y_pred_sm))
OLS_model_MAE = mean_absolute_error(y_test, y_pred_sm)

#### Ordinary Least Squares(OLS) Regression with Cross Validation

In [None]:
# Number of folds for cross-validation
num_folds = 5  # You can adjust this as needed

# Shuffled K-fold splitter; the fixed seed keeps the folds reproducible
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Per-fold evaluation metrics
r_squared_values = []
mae_values = []

# NOTE(review): the folds are built from the bootstrap-resampled training set,
# which contains duplicated rows, so copies of the same observation can land in
# both the training and the validation part of a fold. The scores below may
# therefore be slightly optimistic.
for train_index, val_index in kf.split(X_train_resampled):
    # Split data into train and validation sets for this fold
    X_train_fold, X_val_fold = X_train_resampled.iloc[train_index], X_train_resampled.iloc[val_index]
    y_train_fold, y_val_fold = y_train_resampled.iloc[train_index], y_train_resampled.iloc[val_index]

    # Fit the OLS model. has_constant='add' always adds the intercept column,
    # so the train and validation design matrices have the same columns even
    # if a feature happens to be constant within one fold.
    ols_model_fold = sm.OLS(y_train_fold, sm.add_constant(X_train_fold, has_constant='add')).fit()

    # Make predictions on the validation set
    y_pred_val = ols_model_fold.predict(sm.add_constant(X_val_fold, has_constant='add'))

    # R-squared for this fold (same definition as the metrics used for the
    # hold-out evaluation: 1 - SS_res / SS_tot)
    r_squared_fold = r2_score(y_val_fold, y_pred_val)
    r_squared_values.append(r_squared_fold)

    # Mean absolute error for this fold
    mae_fold = mean_absolute_error(y_val_fold, y_pred_val)
    mae_values.append(mae_fold)

# Calculate average metrics across all folds
avg_r_squared = np.mean(r_squared_values)
avg_mae = np.mean(mae_values)

# Print average metrics
print("Average R-squared across folds:", avg_r_squared)
print("Average MAE across folds:", avg_mae)

In [None]:
# Regression summary of `ols_model_fold`, i.e. the model fitted in the last
# iteration of the cross-validation loop (the earlier folds' models were overwritten).
ols_model_fold.summary()

### Comparison

- Both models have extremely high R-squared values, indicating a very good fit to the data.
- The F-statistics are very high in both cases, indicating that the overall model is statistically significant.
- The 'open' variable appears to be highly significant in predicting 'close' in both models (p-value = 0.000), indicating its strong impact.
- The 'volume' variable, however, shows a different picture:
  - In Summary 1, 'volume' seems to be statistically significant (p-value = 0.038).
  - In Summary 2, 'volume' is not statistically significant (p-value = 0.134).

The change in the significance of the 'volume' variable between the two models might be due to the difference in the datasets (different number of observations): Summary 2 comes from the model fitted in the last cross-validation fold, which uses only four of the five folds of the resampled training data. It might also reflect changes in the data's characteristics over time, leading to variations in its impact on predicting the 'close' variable.

### Linear Regression
Linear regression is a statistical method used to model the relationship between a dependent variable (often denoted as 'y') and one or more independent variables (often denoted as 'x'). The objective is to find a linear relationship that best describes the data.

In a simple linear regression, there's one independent variable ('x') used to predict the dependent variable ('y'). The relationship between the two variables is represented by a straight line:

$$y = mx + c$$

where $m$ is the slope of the line and $c$ is its intercept.

The linear regression model estimates the values of m and c based on the given data in such a way that the line best fits the data points. This fitting is often done by minimizing the sum of squared differences between the observed and predicted values (Ordinary Least Squares method).


In [None]:
# Create the scikit-learn linear regression model and fit it on the resampled
# training data
lr_model = LinearRegression()
lr_model.fit(X_train_resampled, y_train_resampled)

X_test_lr = X_test  # No need to add constant for scikit-learn model

# Predict closing prices for the held-out test set
y_pred_lr = lr_model.predict(X_test_lr)

# Hold-out metrics on the test set.
Linear_model_R_squared = r2_score(y_test, y_pred_lr)
# RMSE is taken as the square root of the MSE. The former
# mean_squared_error(..., squared=False) shortcut is deprecated since
# scikit-learn 1.4 and removed in 1.6, where it raises a TypeError.
Linear_model_RMSE = np.sqrt(mean_squared_error(y_test, y_pred_lr))
Linear_model_MAE = mean_absolute_error(y_test, y_pred_lr)


### Random Forest Regressor

A Random Forest Regressor is a machine learning algorithm used for regression tasks. It's an ensemble learning method that operates by constructing multiple decision trees during training and outputting the average prediction of the individual trees for regression problems.

In [None]:
# Creating and fitting the Random Forest Regression model
# NOTE(review): this model is trained on the original training split
# (X_train / y_train), whereas the OLS and linear models above are trained on
# the bootstrap-resampled split — confirm the difference is intended.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)  # You can adjust hyperparameters
rf_model.fit(X_train, y_train)

# Predicting on the test set
y_pred_rf = rf_model.predict(X_test)

# Hold-out metrics on the test set.
Random_Forest_model_R_squared = r2_score(y_test, y_pred_rf)
# RMSE is taken as the square root of the MSE. The former
# mean_squared_error(..., squared=False) shortcut is deprecated since
# scikit-learn 1.4 and removed in 1.6, where it raises a TypeError.
Random_Forest_model_RMSE = np.sqrt(mean_squared_error(y_test, y_pred_rf))
Random_Forest_modelMAE = mean_absolute_error(y_test, y_pred_rf)

### Prediction Plots of each model

In [None]:
# One "actual vs predicted" scatter figure per model, drawn in this order:
# OLS, Linear Regression, Random Forest. Each entry is (title, predictions, colour).
prediction_plots = [
    ('OLS model: Actual vs Predicted', y_pred_sm, 'blue'),
    ('Linear Regression Model: Actual vs Predicted', y_pred_lr, 'red'),
    ('Random Forest Model: Actual vs Predicted', y_pred_rf, 'red'),
]

for plot_title, predicted_values, point_color in prediction_plots:
    # Test-set targets on the x-axis, the model's predictions on the y-axis
    plt.figure(figsize=(8, 6))
    plt.scatter(y_test, predicted_values, color=point_color)
    plt.title(plot_title)
    plt.xlabel('Actual values')
    plt.ylabel('Predicted values')
    plt.show()


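The graphs plot the predicted values against the actual values for each of the 3 models.
In all three, the points seem to follow the actual values closely, so the closing prices seem to be correctly predicted.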

### Performance Metrics

In [None]:
# One row per model: (name, R-squared, RMSE, MAE), all measured on the test set.
metric_rows = [
    ("OLS", OLS_model_R_squared, OLS_model_RMSE, OLS_model_MAE),
    ("Linear", Linear_model_R_squared, Linear_model_RMSE, Linear_model_MAE),
    ("Random Forest", Random_Forest_model_R_squared, Random_Forest_model_RMSE, Random_Forest_modelMAE),
]
df = pd.DataFrame(metric_rows, columns=["Model", "R-squared", "RMSE", "MAE"])

# Print the comparison table as plain text
print(df.to_string())


### Conclusion:
Both the OLS and Linear models perform very similarly — as expected, since statsmodels OLS and scikit-learn's `LinearRegression` fit the same least-squares model on the same training data — and they seem to outperform the Random Forest model based on these metrics.

It's challenging to say decisively which model is better without further context or specific requirements.
However, based solely on these evaluation metrics (R-squared, RMSE, and MAE), the OLS and Linear models appear to have slightly better predictive performance compared to the Random Forest model.