In [3]:
import pandas as pd
from dateutil.parser import parse
from sqlalchemy import create_engine
from sqlalchemy import text
import sqlite3

In [4]:
data = "Superstore.csv"

In [5]:
# Create a database engine
engine = create_engine("sqlite:///superstore.db")

In [6]:
# reading the dataset
df = pd.read_csv(data, encoding='latin1')

In [7]:
# Insert the data into the database
def insert_db(df, table_name, engine):
    df.to_sql(table_name, con=engine, if_exists='replace', index=False)

In [8]:
#Inserting raw data
insert_db(df, data[:-4], engine)

In [9]:
# Tables in database
table = pd.read_sql_query("SELECT name from sqlite_master where type = 'table'", engine)
table

Unnamed: 0,name
0,Superstore_cleaned
1,Superstore


In [10]:
# Reading data from database
ds = pd.read_sql_query("SELECT * FROM Superstore", engine)
ds_tmp = ds       # Creating a temporary dataset for further process
ds_tmp.head(3)

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,1,CA-2016-152156,11-08-2016,11-11-2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,2,CA-2016-152156,11-08-2016,11-11-2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582
2,3,CA-2016-138688,06-12-2016,6/16/2016,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714


In [11]:
# Checking on columns info
ds_tmp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Row ID         9994 non-null   int64  
 1   Order ID       9994 non-null   object 
 2   Order Date     9994 non-null   object 
 3   Ship Date      9994 non-null   object 
 4   Ship Mode      9994 non-null   object 
 5   Customer ID    9994 non-null   object 
 6   Customer Name  9994 non-null   object 
 7   Segment        9994 non-null   object 
 8   Country        9994 non-null   object 
 9   City           9994 non-null   object 
 10  State          9994 non-null   object 
 11  Postal Code    9994 non-null   int64  
 12  Region         9994 non-null   object 
 13  Product ID     9994 non-null   object 
 14  Category       9994 non-null   object 
 15  Sub-Category   9994 non-null   object 
 16  Product Name   9994 non-null   object 
 17  Sales          9994 non-null   float64
 18  Quantity

In [12]:
# Changing date columns type and setting a default format
def format_date(column):
    try:
        dt = parse(column, dayfirst=True)
        return dt.strftime("%d-%m-%Y")
    except:
        try:
            dt = parse(column, dayfirst=False)
            return dt.strftime("%d-%m-%Y")
        except:
            return pd.NaT

# Changing data type and formatting different types of date values to one type
ds_tmp["Order Date"] = ds_tmp["Order Date"].apply(format_date)
ds_tmp["Order Date"] = pd.to_datetime(ds_tmp["Order Date"], errors='coerce', dayfirst=True)

In [13]:
ds_tmp.sample(3)

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
4795,4796,CA-2015-120516,2015-08-13,8/17/2015,Standard Class,CK-12595,Clytie Kelty,Consumer,United States,Marietta,...,30062,South,OFF-BI-10004187,Office Supplies,Binders,3-ring staple pack,5.64,3,0.0,2.7072
9594,9595,CA-2017-108931,2017-09-14,9/19/2017,Standard Class,HZ-14950,Henia Zydlo,Consumer,United States,New York City,...,10024,East,OFF-PA-10003845,Office Supplies,Paper,Xerox 1987,11.56,2,0.0,5.6644
750,751,CA-2017-126074,2017-02-10,10-06-2017,Standard Class,RF-19735,Roland Fjeld,Consumer,United States,Trenton,...,48183,Central,FUR-FU-10003577,Furniture,Furnishings,Nu-Dell Leatherette Frames,157.74,11,0.0,56.7864


In [14]:
# Apply date format for Ship Date column
ds_tmp["Ship Date"] = ds_tmp["Ship Date"].apply(format_date)
ds_tmp["Ship Date"] = pd.to_datetime(ds_tmp["Ship Date"], errors='coerce', dayfirst=True)

In [15]:
# Unique values 
ds_tmp["Sub-Category"].unique()

array(['Bookcases', 'Chairs', 'Labels', 'Tables', 'Storage',
       'Furnishings', 'Art', 'Phones', 'Binders', 'Appliances', 'Paper',
       'Accessories', 'Envelopes', 'Fasteners', 'Supplies', 'Machines',
       'Copiers'], dtype=object)

In [16]:
# Calculating customer churn according to the order date
def calculate_averageCustomerChurn():
    customer_churnDays_map = {}
    for customerName in ds_tmp["Customer Name"].unique():
        customer_purchase_dates = ds_tmp[ds_tmp["Customer Name"] == customerName]["Order Date"].reset_index().sort_values("Order Date")
        last_customer_purchase = pd.to_datetime(customer_purchase_dates["Order Date"].max())
        diffs = customer_purchase_dates["Order Date"].diff().dt.days.fillna(0)
        average_customer_churn_days = round(((ds_tmp["Order Date"].max()-last_customer_purchase).days+int(diffs.values.sum()))/len(customer_purchase_dates), 0)
        customer_churnDays_map[customerName] = average_customer_churn_days
    return customer_churnDays_map

In [17]:
# Creating aggregated churned feature 
customer_average_churnData = calculate_averageCustomerChurn()
def label_churn(group):
    name = group["Customer Name"].iloc[0]
    customer_thresholdChurnDays = customer_average_churnData.get(name, 60)

    # Sort and compute next purchase
    group = group.sort_values(by="Order Date")
    next_purchase = group["Order Date"].shift(-1)
    overall_last_date = ds_tmp["Order Date"].max()
    next_purchase.fillna(overall_last_date, inplace=True)

    gap = (next_purchase - group["Order Date"]).dt.days
    return gap.apply(lambda x: "Churned" if x >= customer_thresholdChurnDays else "Not Churned")

ds_tmp["Customer Churn"] = ds_tmp.groupby("Customer Name", group_keys=False).apply(label_churn)   

  ds_tmp["Customer Churn"] = ds_tmp.groupby("Customer Name", group_keys=False).apply(label_churn)


In [18]:
ds_tmp.sample(3)

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit,Customer Churn
1801,1802,CA-2016-149461,2016-11-13,2016-11-19,Standard Class,AS-10135,Adrian Shami,Home Office,United States,Auburn,...,West,FUR-FU-10004270,Furniture,Furnishings,"Eldon Image Series Desk Accessories, Burgundy",4.18,1,0.0,1.5048,Churned
340,341,CA-2014-122336,2014-04-13,2014-04-17,Second Class,JD-15895,Jonathan Doherty,Corporate,United States,Philadelphia,...,East,OFF-AR-10000122,Office Supplies,Art,Newell 314,17.856,4,0.2,1.116,Not Churned
9205,9206,CA-2016-146423,2016-04-21,2016-04-21,Same Day,BT-11680,Brian Thompson,Consumer,United States,Milford,...,East,OFF-AR-10004817,Office Supplies,Art,Colorific Watercolor Pencils,15.48,3,0.0,4.4892,Churned


In [19]:
ds_tmp.describe()

Unnamed: 0,Row ID,Order Date,Ship Date,Postal Code,Sales,Quantity,Discount,Profit
count,9994.0,9994,9994,9994.0,9994.0,9994.0,9994.0,9994.0
mean,4997.5,2016-04-11 07:17:44.078447104,2016-04-20 15:38:17.458475008,55190.379428,229.858001,3.789574,0.156203,28.656896
min,1.0,2014-01-02 00:00:00,2014-01-04 00:00:00,1040.0,0.444,1.0,0.0,-6599.978
25%,2499.25,2015-05-01 00:00:00,2015-05-07 00:00:00,23223.0,17.28,2.0,0.0,1.72875
50%,4997.5,2016-05-30 00:00:00,2016-06-12 00:00:00,56430.5,54.49,3.0,0.2,8.6665
75%,7495.75,2017-04-09 00:00:00,2017-04-29 00:00:00,90008.0,209.94,5.0,0.2,29.364
max,9994.0,2017-12-30 00:00:00,2018-05-01 00:00:00,99301.0,22638.48,14.0,0.8,8399.976
std,2885.163629,,,32063.69335,623.245101,2.22511,0.206452,234.260108


In [20]:
# Making a connection
conn = sqlite3.connect("superstore.db")

In [21]:
cursor = conn.cursor()

In [22]:
# Creating the skeleton for the table creation
cursor.execute("""
    CREATE TABLE IF NOT EXISTS Superstore_cleaned (
        "Row ID" INT NOT NULL,
        "Order ID" VARCHAR(20) NOT NULL, 
        "Order Date" DATE,
        "Ship Date" DATE,
        "Ship Mode" VARCHAR(20), 
        "Customer ID" VARCHAR(20) NOT NULL, 
        "Customer Name" VARCHAR(40), 
        "Segment" VARCHAR(20),
        "Country" VARCHAR(20),
        "City" VARCHAR(30),
        "State" VARCHAR(30),
        "Postal Code" VARCHAR(30), 
        "Region" VARCHAR(30),
        "Product ID" VARCHAR(20), 
        "Category" VARCHAR(20),
        "Sub-Category" VARCHAR(20), 
        "Product Name" VARCHAR(50), 
        "Sales" DECIMAL(15, 2),
        "Quantity" INT,
        "Discount" DECIMAL(10, 2),
        "Profit" DECIMAL(15, 2),
        "Customer Churn" VARCHAR(20),
        PRIMARY KEY("Order ID", "Customer ID")
    );
""")

# Insert table and data into database
insert_db(ds_tmp, "Superstore_cleaned", conn)

In [23]:
# Saving the cleaned data into csv
df = pd.read_sql_query("SELECT * FROM Superstore_cleaned", conn)
df.to_csv("Superstore cleaned.csv", index=False)