## Library Import

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import psycopg2
from psycopg2 import Error
import os
from dotenv import load_dotenv
import datetime as dt

## SQL Connection Setup

In [4]:
# Read the database credentials from the local .env file.
#
# override=True makes the values in .env take precedence over variables that
# the operating system or shell already exports. Without it, load_dotenv()
# leaves existing variables untouched, and USERNAME is predefined on Windows,
# so os.getenv('USERNAME') would return the OS login name rather than the
# database user stored in .env.
load_dotenv(override=True)

username = os.getenv('USERNAME')
password = os.getenv('PASSWORD')

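[output redacted: this cell output exposed what appears to be a credential. Clear the output before sharing the notebook and rotate the password.]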


In [3]:
# Connection settings for the PostgreSQL server.
USER = username
PSWD = password
HOST = "pg.analytics.northwestern.edu"
PORT = "5432"
DB_NAME = "everything2023"

# Open the connection and run a trivial query to confirm that it works.
try:
    connection = psycopg2.connect(user = USER,
                                  password = PSWD,
                                  host = HOST,
                                  port = PORT,
                                  database = DB_NAME)
    # The cursor is intentionally left open; other cells may reuse it.
    cursor = connection.cursor()
    cursor.execute("SELECT version();")
    record = cursor.fetchone()
    print("You are connected to - ", record, "\n")

except Error as error:
    # Report the failure, then re-raise: every later cell needs `connection`,
    # so continuing would only postpone the failure to a confusing NameError.
    print("Error while connecting to PostgreSQL", error)
    raise

You are connected to -  ('PostgreSQL 10.12 on x86_64-pc-linux-gnu, compiled by gcc (GCC) 4.8.5 20150623 (Red Hat 4.8.5-39), 64-bit',) 



## FEATURE SELECTION

#### Target (y label): return rate per store per day (`ReturnRate`)

#### feature 1: number of transactions per store per day

#### feature 2: number of discounted sales per store per day

#### feature 3: total transaction amount per store per day

#### Ideas for further features
#### STATE / location -- I am not sure how to use it yet (visualization?)
#### SKU properties (price of SKU)
#### time / year
#### keep brainstorming...

In [None]:
# Aggregate group_14.trnsact to one row per (SALEDATE, STORE):
#   ReturnRate         - rows with STYPE 'R' divided by rows with STYPE 'P' or 'R'
#   NumTrans           - number of transaction rows
#   DiscountSalesCount - rows whose SPRICE is below ORGPRICE
#   AmtTrans           - sum of AMT
#
# NULLIF(..., 0) turns a zero denominator into NULL, so a store-day that has
# neither 'P' nor 'R' rows yields a NULL ReturnRate instead of aborting the
# whole query with a division-by-zero error.
sql_cmd = """
SELECT 
  "SALEDATE", 
  "STORE", 
  SUM(CASE WHEN "STYPE" = 'R' THEN 1.0 ELSE 0.0 END) / 
  NULLIF(SUM(CASE WHEN "STYPE" IN ('P', 'R') THEN 1.0 ELSE 0.0 END), 0) as "ReturnRate",
  count(*) as "NumTrans",
  SUM(CASE WHEN "SPRICE" < "ORGPRICE" THEN 1 ELSE 0 END) as "DiscountSalesCount",
  SUM("AMT") as "AmtTrans"
  
FROM 
  group_14.trnsact
GROUP BY 
  "SALEDATE", 
  "STORE"
ORDER BY 
  "SALEDATE", 
  "STORE";
"""
return_rate_df = pd.read_sql(sql_cmd, connection)



In [None]:
# Show return_rate_df: one row per (SALEDATE, STORE) pair from the trnsact aggregation.
return_rate_df

In [None]:
# Fetch the STATE and STORE columns of every row in group_14.strinfo.
sql_cmd = """
SELECT "STATE", "STORE"
FROM 
  group_14.strinfo

"""
strinfo = pd.read_sql(sql=sql_cmd, con=connection)
strinfo.head()

In [None]:
# Attach each store's state to its daily aggregates.
return_states = pd.merge(return_rate_df, strinfo, on="STORE", how='inner')

# Counting rows of return_states per state would count store-day records, not
# returns. Recover the number of returned items per store-day instead:
# ReturnRate = R / (P + R) and NumTrans = count(*), so their product is R.
# NOTE(review): this is exact only if STYPE is always 'P' or 'R' -- confirm.
state_counts = (
    return_states
    .assign(RETURN_COUNT=return_states["ReturnRate"] * return_states["NumTrans"])
    .groupby("STATE", as_index=False)["RETURN_COUNT"]
    .sum()
    .nlargest(10, "RETURN_COUNT")
    .reset_index(drop=True)
)
# The product is a float; round away representation error to get whole counts.
state_counts["RETURN_COUNT"] = state_counts["RETURN_COUNT"].round()

plt.bar(state_counts["STATE"], state_counts["RETURN_COUNT"])
plt.xlabel("State")
plt.ylabel("Return Count")
plt.title("Return Counts by State")
plt.xticks(rotation=45)
plt.show()

The top 10 states with the most returns 

In [None]:
# Derive calendar features from SALEDATE, parsing the column only once
# instead of once per feature.
sale_dates = pd.to_datetime(return_rate_df['SALEDATE'])

return_rate_df['Month'] = sale_dates.dt.month
return_rate_df['Quarter'] = sale_dates.dt.quarter
# NOTE: 'Day' holds the day of the week (Monday=0 ... Sunday=6), not the day of the month.
return_rate_df['Day'] = sale_dates.dt.dayofweek
return_rate_df

In [None]:
# Pair every SKU with its cost by joining skstinfo to skuinfo on "SKU".
# NOTE(review): "Cost" is mixed-case while the other quoted identifiers in this
# notebook are upper-case, and quoted identifiers are case-sensitive -- confirm
# the column name against the skstinfo schema.
sql_cmd = """
SELECT sku."SKU", sks."Cost"
FROM group_14.skstinfo sks
JOIN group_14.skuinfo sku on sku."SKU" = sks."SKU"
"""
skstinfo = pd.read_sql(sql=sql_cmd, con=connection)
skstinfo.head()

## Data Visualization

In [None]:
# Time-series view of the daily return rate for a single store.
def plot_return_for_store(store_num):
    """Plot the return rate of one store over time.

    Reads the notebook-level ``return_rate_df`` and uses its 'STORE',
    'SALEDATE' and 'ReturnRate' columns.

    Parameters
    ----------
    store_num
        Value compared against the 'STORE' column to select the rows to plot.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # .assign() returns a new frame, so converting SALEDATE to datetime never
    # writes into a filtered slice of return_rate_df (which would trigger
    # pandas' SettingWithCopyWarning) and leaves return_rate_df untouched.
    store_data = (
        return_rate_df.loc[return_rate_df['STORE'] == store_num]
        .assign(SALEDATE=lambda frame: pd.to_datetime(frame['SALEDATE']))
    )

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(store_data['SALEDATE'], store_data['ReturnRate'], marker='o')

    ax.set_title(f'Return Rate Over Time for Store {store_num}')
    ax.set_xlabel('Sale Date')
    ax.set_ylabel('Return Rate')
    ax.grid(True)
    ax.tick_params(axis='x', labelrotation=45)  # keep the date labels readable
    fig.tight_layout()

    plt.show()

In [None]:
# Plot the return-rate time series for store 203.
plot_return_for_store(203)