# Introduction

**Overview:** Brief description of the problem, the dataset, and the main objectives of the project.

# Setup 

## Imports

In [None]:
import pandas as pd
import numpy as np

## Environment Variables 
**Note**: This step is optional. It is recommended when the project needs sensitive values (such as API keys or database credentials): store them in a `.env` file, load them as environment variables at runtime, and keep the `.env` file out of version control so that secrets stay separate from your codebase.

In [None]:
# Standard library
import os

# Third-party: reads key/value pairs from a .env file
from dotenv import load_dotenv

# Populate the process environment from the .env file
load_dotenv()

# Look up the API key; the result is None when "api_key" is not set
api_key = os.environ.get("api_key")

# Data Loading

In [None]:
# Read data from csv file to pandas DataFrame
# TODO: the path below is an empty placeholder; replace it with the location
# of the project's CSV file before running this cell.
df = pd.read_csv("")

# Data Quality Checks

In [None]:
# Show DataFrame info: a printed summary of the columns, their dtypes and
# non-null counts, useful for spotting missing values and wrongly typed columns
df.info()

In [None]:
# Preview the first five rows to sanity-check the loaded data
df.head(n=5)

# Data Preprocessing

## Handling Duplicates

In [None]:
# Count fully duplicated rows: True marks a row that repeats an earlier one
# (the first occurrence of each row is counted under False)
df.duplicated(keep="first").value_counts()

In [None]:
# Drop repeated rows, keeping the first occurrence of each; the explicit
# copy gives an independent DataFrame for the in-place edits that follow
df = df.drop_duplicates(keep="first").copy()

## Data Type Conversion

In [None]:
# Cast the string column to pandas' nullable 32-bit integer dtype
# (capital-I "Int32"), which can hold missing values alongside integers
df["int_column"] = df["str_column"].astype(pd.Int32Dtype())

## Handling Missing Values

### Continuous Column

In [None]:
# Descriptive statistics of continuous column
# (inspect the distribution before choosing how to impute missing values)
df["continuous_column"].describe()

In [None]:
# Fill gaps in the continuous column with its median.
# The median is computed first, from the values that are present.
median = df["continuous_column"].median()
df["continuous_column"] = df["continuous_column"].fillna(value=median)

### Categorical Column

In [None]:
# Frequencies of categorical column, including missing values.
# value_counts() drops NaN by default, which would hide exactly the entries
# this section is about to impute; dropna=False lists them as their own row.
df["categorical_column"].value_counts(dropna=False)

In [None]:
# Fill gaps in the categorical column with its most frequent value.
# mode() returns a Series (several entries when values tie); take the entry
# labelled 0, i.e. the first one returned.
mode = df["categorical_column"].mode().loc[0]
df["categorical_column"] = df["categorical_column"].fillna(value=mode)

# Exploratory Data Analysis (EDA)

# Train-Validation-Test Split

# Model Training