## Import Packages and Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import datetime as dt
from matplotlib import pyplot as plt
from statsmodels.formula.api import ols
import statsmodels.api as sm
import scipy.stats as stats
from scipy.stats import pearsonr
from scipy.stats import ttest_rel
from sklearn.model_selection import train_test_split

## (P)ACE - Plan Phase
### Part 1 - Data
#### How were the observations in the dataset collected?
###### (What type of data collection method was used? Cluster, random, or stratified sampling?)

### Part 2 - Research Question
#### Project Scope
###### (If the scope isn't already known, insights into the data will be discovered once the EDA phase has begun.)

## P(A)CE - Analyze Phase
### Part 3 - EDA (Exploratory Data Analysis)
#### Investigate the Dataset

In [None]:
# Import the dataset
# TODO: the empty string is a placeholder; replace it with the path to the CSV file.
df = pd.read_csv('')

##### Descriptive Statistics

In [None]:
# Summary Statistics
# Displays pandas' default descriptive summary of `df` as the cell output.
df.describe()

In [None]:
# Check column data types
# Displays the dtype pandas assigned to each column of `df`.
df.dtypes

In [None]:
# Check for duplicates
# Compare the shape of the frame before and after dropping duplicate rows.
# Both shapes are printed explicitly because a notebook cell only displays the
# value of its last expression; a bare `df.shape` on an earlier line is discarded.
print("Original shape:", df.shape)
print("Shape without duplicate rows:", df.drop_duplicates().shape)

In [None]:
# Count the missing entries in every column and print the per-column totals
df_missing_values = df.isna().sum()
print(df_missing_values)

In [None]:
# Remove, impute or acquire missing data
# Here every row containing at least one missing value is dropped. The result
# is stored under a new name, so `df` itself is left unchanged.
df_filtered = df.dropna()

# Print the new shape explicitly: only the last expression of a cell is
# displayed, so a bare `df_filtered.shape` here would be silently discarded.
print("Shape after dropping rows with missing values:", df_filtered.shape)

# Confirm that no missing values remain (shown as the cell output).
df_filtered.isnull().sum()

##### Pairplot to visualize relationships between pairs of variables


In [None]:
# Draw a grid of pairwise plots for the columns of `df`.
# NOTE(review): this uses the original `df`, not `df_filtered` — confirm that is intended.
sns.pairplot(df)

##### What Probability Distributions could apply to this dataset? (If Applicable)
###### (Binomial, Poisson, Normal Distribution)

#### Data Manipulation and Cleaning

In [None]:
# Rename columns for easier analysis if necessary
# TODO: fill in the mapping, e.g. {"old_name": "new_name"}; an empty mapping
# renames nothing.
#
# The index is then reset so it is contiguous again. `reset_index` must be
# called and its result assigned: referencing the bare attribute does nothing.
# drop=True discards the old index instead of inserting it as a new column.
df_filtered = df_filtered.rename(columns={}).reset_index(drop=True)
df_filtered.head()

In [None]:
# Convert date columns if necessary
# TODO: list the names of the columns that hold dates,
# e.g. ["start_date", "end_date"].
date_columns = []

# Each listed column is parsed with pd.to_datetime and written back under the
# same name. With an empty list the loop body never runs, so the cell is
# runnable as-is and leaves the frame unchanged.
for date_column in date_columns:
    df_filtered[date_column] = pd.to_datetime(df_filtered[date_column])

df_filtered.head()

In [None]:
# Keep relevant columns
# Name the columns that are not needed for the analysis; an empty list drops nothing.
columns_to_drop = []

# Remove the listed columns and preview the remaining frame.
df_filtered = df_filtered.drop(columns=columns_to_drop)
df_filtered.head()

In [None]:
# Plot the Dependent Variable
# TODO: set the column name of the dependent variable, and fill in the axis
# labels and title below. The empty strings are placeholders that keep the
# cell syntactically valid until they are filled in.
dependent_variable = ""

plt.figure(figsize=(8, 5))
plt.hist(df_filtered[dependent_variable], bins=10, edgecolor="black")
plt.xlabel("")
plt.ylabel("")
plt.title("")
plt.grid(True)

# Render the figure and suppress the textual repr of the last call.
plt.show()


##### Initial Insights

#### Check Statistical Test Conditions/Assumptions (If Applicable)

In [None]:
## Boxplots for Averages

In [None]:
## Histograms for Averages

In [None]:
## Normal Probability Plots

## PA(C)E - Construct Phase of the PACE Framework
### Part 4 - Modeling/Inference 

H0 (nothing going on):

HA (something going on):

#### Create a Hold-Out Sample (If Applicable)

In [None]:
# Bring in scikit-learn's train/test splitting helper
from sklearn.model_selection import train_test_split

# Select the predictor columns (X) and the outcome column (y)
# TODO: put the column names inside the inner lists
df_filtered_X = df_filtered[[]]
df_filtered_y = df_filtered[[]]

# Split into training sets and holdout (testing) sets: 30% of the rows are
# held out, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_filtered_X,
    df_filtered_y,
    test_size=0.3,
    random_state=42,
)

#### Build and Fit The Model (If Applicable)

In [None]:
# Combine the training predictors and outcome into a single frame, because
# the formula interface looks variables up by column name in `data`.
ols_data = pd.concat([X_train, y_train], axis = 1)

# Define the OLS formula.
# TODO: write the formula, e.g. "outcome ~ predictor_1 + predictor_2".
# The empty string is a placeholder that keeps the cell syntactically valid.
ols_formula = ""

# Create an OLS model.
OLS = ols(formula = ols_formula, data = ols_data)

# Fit the model and keep the fitted results.
model = OLS.fit()

# Display the model results.
model.summary()

#### Run Statistical Tests (If Applicable)

#### Check Model Conditions/Assumptions Post Model Construction (If Applicable)

##### Linearity

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Scatter plot of the two variables with a fitted regression line
# TODO: fill in the x and y column names
sns.regplot(data=ols_data, x="", y="")

# The scatter markers are the first collection on the current axes;
# give them a black outline of width 1
scatter_points = plt.gca().collections[0]
scatter_points.set_edgecolor("black")
scatter_points.set_linewidth(1)

# Optional: axis labels and a title for clarity
plt.xlabel("")
plt.ylabel("")
plt.title("")

plt.show()


##### Independent Observations

##### Normality

In [None]:
# Calculate the residuals.
residuals = model.resid

# Two side-by-side panels: histogram on the left, Q-Q plot on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(8, 4))
hist_ax, qq_ax = axes

# Histogram of the residuals.
sns.histplot(residuals, ax=hist_ax)
hist_ax.set_xlabel("Residual Value")
hist_ax.set_title("Histogram of Residuals")

# Normal Q-Q plot of the same residuals, with a standardized reference line.
sm.qqplot(residuals, line='s', ax=qq_ax)
qq_ax.set_title("Normal Q-Q Plot")

plt.tight_layout()

##### Homoscedasticity

In [None]:
# Scatter the model's residuals against its fitted values.
# Note: seaborn returns the Axes it drew on, so `fig` holds an Axes here.
fig = sns.scatterplot(x=model.fittedvalues, y=model.resid)

# Axis labels and title, set in a single call.
fig.set(
    xlabel="Fitted Values",
    ylabel="Residuals",
    title="Fitted Values vs Residuals",
)

# Horizontal reference line at y = 0 to compare the spread of residuals
# above and below zero.
fig.axhline(0)

##### No Multicollinearity (If Applicable)

In [None]:
# Test multicollinearity of predictor variables with Variance Inflation Factor (VIF)

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools import add_constant

# TODO: list the predictor columns used in the model.
X = df_filtered[[]]

# variance_inflation_factor regresses each column on the other columns of the
# matrix it is given and does not add an intercept itself. The formula-based
# OLS model includes an intercept, so a constant column is added here to make
# the VIFs correspond to that model. has_constant="add" forces the constant
# into position 0 even if the data already contains a constant column.
X_with_const = add_constant(X, has_constant="add")

# Pair each predictor with its VIF; position 0 is the added constant, whose
# VIF is not meaningful, so it is skipped.
vif = [
    (column, variance_inflation_factor(X_with_const.values, position))
    for position, column in enumerate(X_with_const.columns)
    if position > 0
]
print(vif)

Features with a VIF > 5 may have multicollinearity, which means they are highly correlated with each other and can inflate the standard errors of the regression coefficients. This can make it difficult to interpret the results of the model and can also lead to unstable coefficient estimates.

Consideration should be given to removing these features from the model or finding ways to reduce the multicollinearity between them.

### Part 5 - Prediction/Conclusion

In [None]:
# Display the fitted model's summary table again as the basis for the conclusions.
model.summary()

## PAC(E) - Execute Phase of the PACE Framework
### What Story does the Data Tell?