Imports

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import pyodbc
import sqlite3
import random
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans


Gather data

In [None]:
def getDbAsDF(conn: sqlite3.Connection, sql: str) -> pd.DataFrame:
    """Run *sql* on *conn* and return the whole result set as a DataFrame.

    Column labels come from ``cursor.description`` in result order. Rows are
    handed to pandas positionally, so result columns that share a name (as
    ``SELECT *`` over joined tables can produce) each keep their own values
    rather than all carrying the values of the last same-named column.

    Args:
        conn: Open SQLite connection to run the query on.
        sql: Statement to execute; it must produce a result set.

    Returns:
        A DataFrame with one row per fetched record. When the query matches
        no rows the frame is empty but still carries the column labels.
    """
    cursor = conn.cursor()
    try:
        cursor.execute(sql)
        rows = cursor.fetchall()
        # Read the labels before the cursor is closed.
        columns = [column[0] for column in cursor.description]
    finally:
        # Release the cursor even if execute()/fetchall() raises.
        cursor.close()

    return pd.DataFrame(rows, columns=columns)


# Open the merged SQLite database (relative Windows-style path)
connection = sqlite3.connect(r'..\..\Data\merged.sqlite')

# Join order headers to retailer, demographic, order-line, product and
# product-type tables, keeping every column of every joined table.
SQL = r"""
SELECT *
FROM order_header as OH
INNER JOIN retailer as RT ON OH.RETAILER_NAME = RT.COMPANY_NAME
INNER JOIN sales_demographic as SD on RT.RETAILER_CODEMR = SD.RETAILER_CODEMR
INNER JOIN order_details as OD ON OH.ORDER_NUMBER = OD.ORDER_NUMBER
INNER JOIN product as P ON P.PRODUCT_NUMBER = OD.PRODUCT_NUMBER
INNER JOIN product_type as PT ON PT.PRODUCT_TYPE_CODE = P.PRODUCT_TYPE_CODE
"""

DF = getDbAsDF(connection, SQL)
print(DF.columns)

# Cast the price and quantity columns to numeric types, one column at a time.
for numeric_column, target_type in (("UNIT_PRICE", float), ("QUANTITY", int)):
    DF[[numeric_column]] = DF[[numeric_column]].astype(target_type)
DF

Selecting Data

In [None]:
# Columns used as clustering features.
feature_columns = ["RETAILER_CODE", "UNIT_PRICE", "QUANTITY", "AGE_GROUP_CODE"]

# Select the features, drop columns whose contents repeat an earlier column
# (done by de-duplicating the rows of the transposed frame), then drop rows
# that still contain missing values.
# NOTE(review): the double transpose probably leaves the columns as object
# dtype - confirm before relying on numeric dtypes downstream.
Data = DF[feature_columns].T.drop_duplicates().T.dropna()
Data

Clustering

In [None]:


kmeans = KMeans(n_clusters=4, init='random',
    n_init=10, max_iter=300,
    tol=1e-04, random_state=0)

# fit() returns the fitted estimator itself; the per-row cluster labels are
# available afterwards as kmeans.labels_.
y_km = kmeans.fit(Data)

# Features shown on the plot axes, and their positions among the fitted
# columns so the centroids are drawn in the same coordinates as the points.
x_feature, y_feature = "UNIT_PRICE", "AGE_GROUP_CODE"
fitted_columns = list(Data.columns)
x_index = fitted_columns.index(x_feature)
y_index = fitted_columns.index(y_feature)

# One entry per (row, company): COMPANY_NAME is split on ';' and exploded,
# restricted to the rows that are still present in Data.
row_companies = (
    DF.loc[Data.index, 'COMPANY_NAME'].str.split(r';\s*').explode()
)

# Plot each company's own rows in its own random colour
for company, members in row_companies.groupby(row_companies):
    plt.scatter(
        Data.loc[members.index, x_feature], Data.loc[members.index, y_feature],
        s=50,
        # `color=` always means one RGB colour for the whole series; a bare
        # 3-element `c=` list can be mistaken for values to colour-map.
        color=(random.random(), random.random(), random.random()),
        marker='v', edgecolor='black',
        label=company
    )
plt.title('K-means Clustering')
plt.xlabel(x_feature)
plt.ylabel(y_feature)
plt.scatter(
    kmeans.cluster_centers_[:, x_index], kmeans.cluster_centers_[:, y_index],
    s=250, marker='*',
    c='red', edgecolor='black',
    label='centroids'
)


plt.legend(scatterpoints=1)
plt.grid()
plt.show()