In [1]:
# Data manipulation packages
import pandas as pd  # Pandas provides data structures for efficiently storing large datasets and tools for data analysis.
import numpy as np   # NumPy is used for numerical operations and array manipulations.

# Data Visualization packages
import matplotlib.pyplot as plt  # Matplotlib is a 2D plotting library for creating static, animated, and interactive visualizations in Python.
import seaborn as sns  # Seaborn is a statistical data visualization library based on Matplotlib. It provides a high-level interface for drawing attractive and informative statistical graphics.

# Machine learning Packages
from sklearn.pipeline import Pipeline  
from sklearn.compose import ColumnTransformer  # ColumnTransformer allows applying different transformers to different columns in a dataset.
from sklearn.impute import SimpleImputer  # SimpleImputer is used for handling missing data by imputing missing values with specified strategies.
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler  # These classes provide different methods for scaling/normalizing numerical features.
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder  # These classes handle categorical feature encoding (one-hot, label, and ordinal encoding).
from sklearn.preprocessing import FunctionTransformer  # FunctionTransformer allows applying custom functions to transform data.
from sklearn.tree import DecisionTreeClassifier  # DecisionTreeClassifier is an implementation of a decision tree classifier.
from sklearn.metrics import accuracy_score, classification_report  # These metrics are used for evaluating classification model performance.
from sklearn.model_selection import train_test_split  # train_test_split is used to split a dataset into training and testing sets.
from sklearn import set_config  # set_config allows configuring global scikit-learn behavior.
from sklearn.ensemble import RandomForestClassifier  # RandomForestClassifier is an ensemble learning method based on decision trees.
from sklearn.svm import SVC  # Support Vector Classifier (SVC) is a classifier that uses support vector machines for classification.
from sklearn.preprocessing import PowerTransformer  # PowerTransformer applies power transformations to make data more Gaussian-like.
from sklearn.naive_bayes import GaussianNB  # Gaussian Naive Bayes is a probabilistic classifier based on the Gaussian distribution.
from sklearn.ensemble import GradientBoostingClassifier  # GradientBoostingClassifier is an ensemble method that builds a sequence of weak learners (trees).
from scipy.stats import pearsonr  # Pearson correlation coefficient measures the linear relationship between two variables.
from sklearn.model_selection import cross_val_score  # cross_val_score is used for cross-validated model performance evaluation.
from imblearn.under_sampling import RandomUnderSampler  # RandomUnderSampler is used for under-sampling to address class imbalance.
from imblearn.over_sampling import RandomOverSampler, SMOTE  # RandomOverSampler and SMOTE are used for over-sampling to address class imbalance.
from sklearn.feature_selection import SelectKBest, mutual_info_classif  # SelectKBest performs feature selection based on scoring functions like mutual information.
from imblearn.over_sampling import SMOTE  # SMOTE is a technique for generating synthetic samples to address class imbalance.
from sklearn.datasets import make_classification  # make_classification generates a synthetic dataset for classification.
from sklearn.metrics import roc_curve, roc_auc_score  # roc_curve and roc_auc_score are used for Receiver Operating Characteristic (ROC) curve analysis.
from sklearn.metrics import confusion_matrix  # confusion_matrix calculates the confusion matrix for classification models.
from sklearn.model_selection import GridSearchCV  # GridSearchCV performs hyperparameter tuning using grid search.

# Database connection package
import pyodbc  # PyODBC is a Python module that makes accessing ODBC databases simple.
from dotenv import dotenv_values  # dotenv loads environment variables from a .env file.

# Ignore warnings (optional)
import warnings  # The warnings module provides a way to handle warnings in Python.
warnings.filterwarnings("ignore")

In [2]:
# Parse the local '.env' file into a dictionary of key/value pairs.
environment_variables = dotenv_values('.env')

# Database credentials, looked up by the key names used in the '.env' file.
server = environment_variables.get("server_name")
database = environment_variables.get("database_name")
username = environment_variables.get("username")
password = environment_variables.get("password")

# Fail fast when a credential is absent: `.get` returns None for a missing
# key, and None would otherwise be formatted into the connection string as
# the literal text "None", producing a confusing driver-level error later.
_missing_keys = [
    key
    for key in ("server_name", "database_name", "username", "password")
    if environment_variables.get(key) is None
]
if _missing_keys:
    raise ValueError(
        f"Missing key(s) in the '.env' file: {', '.join(_missing_keys)}"
    )

In [3]:
# Build the ODBC connection string from the credentials loaded from '.env'.
# Every attribute must be a complete `KEY=value;` pair; the doubled braces
# render as a literal `{SQL Server}` driver name inside the f-string.
# NOTE(review): this string previously ended in a truncated `MinP` fragment
# with no value. If an additional setting was intended there, add it back as
# a full `KEY=value;` pair.
connection_string = (
    f"DRIVER={{SQL Server}};SERVER={server};DATABASE={database};"
    f"UID={username};PWD={password};MARS_Connection=yes;"
)

In [4]:
# Open a connection to the database server by passing the connection string to pyodbc.connect.
# NOTE(review): nothing in this cell closes the connection; call connection.close() once all queries have run.


connection = pyodbc.connect(connection_string)

In [5]:
# Pull three tables from the database into pandas DataFrames.
# Note: the database account is read-only, so INSERT, UPDATE and DELETE
# statements against these tables will not be permitted.

# Oil table
query = "SELECT * FROM dbo.oil"
oil_table = pd.read_sql(sql=query, con=connection)

# Holidays/events table
query = "SELECT * FROM dbo.holidays_events"
holiday_events = pd.read_sql(sql=query, con=connection)

# Stores table
query = "SELECT * FROM dbo.stores"
stores_table = pd.read_sql(sql=query, con=connection)

In [6]:
# Load the remaining datasets, which ship as CSV files in the working
# directory: the test set, the transactions, and the sample submission.
# The files are read in the order listed.
test_data, transaction_data, sample_data = (
    pd.read_csv(csv_name)
    for csv_name in ('test.csv', 'transactions.csv', 'sample_submission.csv')
)