In [13]:
# Import libraries
import pandas as pd
import sys
import os

In [14]:
# Get the current working directory
current_dir = os.getcwd()
# Move up one level from the current directory
parent_dir = os.path.dirname(current_dir)
# Change directory into data directory
data_dir = os.path.join(parent_dir, 'data')

In [15]:
sys.path.append(parent_dir)

In [16]:
import db_config as cfg

# 1. Extract

In [17]:
df_product = pd.read_csv(os.path.join(data_dir, 'products.csv'), index_col=None)

In [18]:
df_product.head()

Unnamed: 0,item_desc,corp_item_brand_name,pim_item_class_desc,pim_item_sub_class_desc,state,flavor,pim_tasting_notes,alcohol_percentage,sweetness_level,bitterness_level,...,body,serving_temperature,vintage_year,grape_variety,region,price,food_pairing,aroma,mouthfeel,finish
0,-196 CKTL VOD A(DL/G/P) CAN 3/8PK,-196,COCKTAILS,COCKTAILS-OTHER,TX,OTHER,,12.437938,8,10,...,Light,47.499006,2019,Chardonnay,Sonoma,76.968522,Pasta,Fruity,Silky,Short
1,-196 CKTL VOD PEACH 12 CAN 6/4PK,-196,COCKTAILS,COCKTAILS-OTHER,TX,PEACH,,12.995617,8,3,...,Light,43.392326,2005,Chardonnay,Sonoma,21.302044,Grilled Chicken,Fruity,Silky,Long
2,-196 CKTL VOD DBL LEM 12 CAN 6/4PK,-196,COCKTAILS,COCKTAILS-OTHER,TX,LEMON,Fresh lemon peel. Tart and light sweet with ch...,11.23442,7,6,...,Medium,57.216822,2017,Merlot,Sonoma,23.523335,Cheese,Floral,Velvety,Long
3,-196 CKTL VOD GRFRUIT 12 CAN 6/4P,-196,COCKTAILS,COCKTAILS-OTHER,TX,GRAPEFRUIT,,5.381421,4,4,...,Full,47.878084,2007,Cabernet Sauvignon,Tuscany,67.868299,Pasta,Fruity,Velvety,Short
4,10 CANE RUM 80,10 CANE,RUM,GOLD RUM,TX,,,13.381235,7,4,...,Full,51.834639,2008,Cabernet Sauvignon,Tuscany,95.581735,Pasta,Fruity,Silky,Short


# 2. Transform

In [19]:
df_product.dtypes

item_desc                   object
corp_item_brand_name        object
pim_item_class_desc         object
pim_item_sub_class_desc     object
state                       object
flavor                      object
pim_tasting_notes           object
alcohol_percentage         float64
sweetness_level              int64
bitterness_level             int64
acidity_level                int64
tannin_level                 int64
body                        object
serving_temperature        float64
vintage_year                 int64
grape_variety               object
region                      object
price                      float64
food_pairing                object
aroma                       object
mouthfeel                   object
finish                      object
dtype: object

# 3. Load

In [20]:
import pyodbc
import psycopg2

In [21]:
#test connection
def load_table_psycopg2(df, table_name, schema_name):
    conn = psycopg2.connect(
        host=cfg.HOST,
        database=cfg.SERVER_NAME,  # Changed from DATABASE_NAME to SERVER_NAME
        user=cfg.USER_ID,
        password=cfg.USER_PASSWORD,
        port=cfg.PORT
    )
    # ... rest of the function remains the same ...

# Call the new function
load_table_psycopg2(df_product, 'products', 'dbo')


AttributeError: module 'db_config' has no attribute 'SERVER_NAME'

In [None]:
def get_column_types(df):
    # Map Pandas data types to SQL Server data types
    type_map = {
        'object': 'NVARCHAR(MAX)',
        'int64': 'BIGINT',
        'float64': 'DECIMAL(18, 5)',  # Use DECIMAL with precision and scale
        'datetime64[ns]': 'DATETIME2',
        'bool': 'BIT'
    }
    return [type_map.get(str(dt), 'NVARCHAR(MAX)') for dt in df.dtypes]


In [22]:
def load_table(df, table_name, schema_name):
    # Define the connection string
    conn_str = (
        'DRIVER={PostgreSQL Unicode};'
        F'SERVER={cfg.HOST};'
        F'DATABASE={cfg.DATABASE_NAME};'
        F'UID={cfg.USER_ID};'
        F'PWD={cfg.USER_PASSWORD};'
        F'PORT={cfg.PORT};'
    )

    # Create a pyodbc connection using the connection string
    conn = pyodbc.connect(conn_str)
    cursor = conn.cursor()


    # Drop the table if it already exists
    cursor.execute(f"IF OBJECT_ID('{schema_name}.{table_name}', 'U') IS NOT NULL DROP TABLE {schema_name}.{table_name}")
    conn.commit()

    # Create the table schema
    columns = ', '.join([f'{col} {dtype}' for col, dtype in zip(df.columns, get_column_types(df))])
    create_table_sql = f"CREATE TABLE {schema_name}.{table_name} ({columns})"
    cursor.execute(create_table_sql)
    conn.commit()

    # Clean up the float columns in the DataFrame
    for col in df.select_dtypes(include=['float']):
        df[col] = df[col].round(5)  # Round to 5 decimal places

    # Ensure there are no NaN or None values in numeric columns
    df.fillna(0, inplace=True)  # You can adjust the fill value as necessary

    # Insert the data into the table
    insert_sql = f"INSERT INTO {schema_name}.{table_name} VALUES ({','.join(['?'] * len(df.columns))})"
    for _, row in df.iterrows():
        cursor.execute(insert_sql, *row.tolist())
    conn.commit()

    cursor.close()
    conn.close()
    print(f"DataFrame loaded successfully into {schema_name}.{table_name}")


In [None]:
# Now use psycopg2 instead of pyodbc
def load_table_psycopg2(df, table_name, schema_name):
    conn = psycopg2.connect(
        host=cfg.HOST,
        database=cfg.DATABASE_NAME,
        user=cfg.USER_ID,
        password=cfg.USER_PASSWORD,
        port=cfg.PORT
    )
    cursor = conn.cursor()

    try:
        # Check if the table exists
        cursor.execute(f"SELECT to_regclass('{schema_name}.{table_name}')")
        table_exists = cursor.fetchone()[0] is not None

        if table_exists:
            print(f"Table {schema_name}.{table_name} exists.")
        else:
            print(f"Table {schema_name}.{table_name} does not exist.")

    except psycopg2.Error as e:
        print(f"An error occurred: {e}")

    finally:
        cursor.close()
        conn.close()

# Call the new function
load_table_psycopg2(df_product, 'products', 'dbo')

NameError: name 'df_product' is not defined