# <b>CSV to Database ETL</b>

In [35]:
import pandas as pd
import sqlite3
from sklearn.preprocessing import StandardScaler
import matplotlib as plt

## <b>Extract Data</b>

In [36]:
# Function to read data from the CSV file
def extract_data(file_path):
    """Load a CSV source into a pandas DataFrame.

    Args:
        file_path: Path (or any other source accepted by ``pd.read_csv``)
            of the CSV data to read.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with its default options.
    """
    return pd.read_csv(file_path)

## <b>Transform Data</b>

In [37]:
# Function to clean and preprocess the data
def transform_data(df):
    """Clean and preprocess the movie data before loading.

    The work is done on a copy, so the caller's DataFrame is not modified.
    Every step is applied only to the columns that are actually present:

    * ``release_date`` is parsed to datetime; missing or unparseable values
      become 1970-01-01, and ``release_year``, ``release_month`` and
      ``release_day`` are derived from the parsed dates.
    * ``popularity``, ``vote_average`` and ``vote_count`` have their missing
      values replaced by the column mean and are then standardized with
      ``StandardScaler``.
    * Columns listed in the dtype mapping are cast to a consistent dtype.

    Args:
        df: Input DataFrame.

    Returns:
        A new DataFrame with the transformations applied.
    """
    df = df.copy()

    if 'release_date' in df.columns:
        # Parse first, then fill: values that fail to parse turn into NaT
        # here, and filling afterwards guarantees the derived parts below
        # contain no NaN (which would break the integer cast later on).
        release_dates = pd.to_datetime(df['release_date'], errors='coerce')
        release_dates = release_dates.fillna(pd.Timestamp('1970-01-01'))
        df['release_date'] = release_dates
        df['release_year'] = release_dates.dt.year
        df['release_month'] = release_dates.dt.month
        df['release_day'] = release_dates.dt.day

    # Normalize numerical columns (popularity, vote_average, vote_count),
    # restricted to the ones that exist so a missing column is not an error.
    numerical_columns = ['popularity', 'vote_average', 'vote_count']
    present_columns = [col for col in numerical_columns if col in df.columns]
    for col in present_columns:
        # Replace missing values with the column mean.
        df[col] = df[col].fillna(df[col].mean())
    # The scaler cannot be fitted on zero rows, so skip it for empty input.
    if present_columns and len(df) > 0:
        scaler = StandardScaler()
        df[present_columns] = scaler.fit_transform(df[present_columns])

    # Ensure data types are consistent
    dtype_mapping = {
        'title': 'string',
        'overview': 'string',
        'release_year': 'int',
        'release_month': 'int',
        'release_day': 'int',
        'popularity': 'float',
        'vote_average': 'float',
        'vote_count': 'float',
    }
    for col, dtype in dtype_mapping.items():
        if col in df.columns:
            df[col] = df[col].astype(dtype)

    return df

## <b>Load Data</b>

In [38]:
# Function to load data into SQLite database
def load_data(df, db_name, table_name):
    """Write a DataFrame to a table in a SQLite database.

    An existing table called ``table_name`` is replaced, and the DataFrame
    index is not written as a column.

    Args:
        df: DataFrame to store.
        db_name: Database passed to ``sqlite3.connect``.
        table_name: Name of the table to (re)create.
    """
    conn = sqlite3.connect(db_name)
    try:
        df.to_sql(table_name, conn, if_exists='replace', index=False)
    finally:
        # Close even when to_sql raises so the connection is never leaked.
        conn.close()

## <b>Main ETL</b>

In [39]:
def etl(file_path, db_name, table_name):
    """Run the whole pipeline: extract the CSV, transform it, load it.

    Args:
        file_path: CSV source handed to ``extract_data``.
        db_name: SQLite database handed to ``load_data``.
        table_name: Destination table handed to ``load_data``.
    """
    cleaned = transform_data(extract_data(file_path))
    load_data(cleaned, db_name, table_name)

## <b>Example Usage</b>

In [40]:
# Example usage
# Path to the uploaded CSV file
file_path = 'CSV_Data/movies.csv'
# Name of the SQLite database
db_name = 'movies.db'
# Name of the table to create
table_name = 'imdb_movies_list'

# Run the pipeline end to end with the settings above.
etl(file_path=file_path, db_name=db_name, table_name=table_name)