In [2]:
import numpy as np
import pyodbc
import pandas as pd

In [6]:
# Connection settings for the CompanyX database on SQL Server.
# Trusted_Connection=yes is used, so no username/password appears here.
# NOTE(review): SERVER is a hard-coded machine name and has to be edited
# before this notebook can run against a different host.
connection_string = '''
    DRIVER={ODBC Driver 17 for SQL Server};
    SERVER=DESKTOP-7CB1RAA;
    DATABASE=CompanyX;
    Trusted_Connection=yes;
    '''
conn = pyodbc.connect(connection_string)

# Cursor used by the cells below to execute queries and fetch their rows.
cursor = conn.cursor()

### Extract the table 1
Each row contains `<Tid, s, t>`: `Tid` is the transaction identifier, `s` is the set of items in the transaction, and `t` is the time epoch at which transaction `Tid` occurs.

![Table 1](img/table1.png)


In [56]:
# Fetch one row per distinct (SalesOrderID, ProductID) pair. The GROUP BY has
# no aggregate, so it only de-duplicates; rows come back ordered by order id.
query1 = """
select SalesOrderID as TiD,
	ProductID as Item
from [CompanyX].[Sales].[SalesOrderDetail]
group by SalesOrderID, ProductID
order by SalesOrderID
"""

cursor.execute(query1)
result = cursor.fetchall()

# Build the frame from plain per-row tuples instead of np.asarray(result):
# the ndarray route raises when the query returns no rows (an empty 1-D array
# cannot be matched to two column labels) and squeezes every column through a
# single shared dtype. From tuples, pandas infers each column's dtype on its
# own and an empty result simply yields an empty two-column frame.
df1 = pd.DataFrame(
    [tuple(row) for row in result],
    columns=['TiD', 'Item'],
)
print(df1)


          TiD  Item
0       43659   716
1       43659   712
2       43659   771
3       43659   778
4       43659   774
...       ...   ...
121312  75122   712
121313  75122   878
121314  75123   712
121315  75123   879
121316  75123   878

[121317 rows x 2 columns]


In [57]:
# Collapse the (TiD, Item) pairs into one row per transaction, where the
# 'Item' cell holds the list of all items belonging to that TiD.
# Note: df1 is rebound to the grouped result, so this cell expects the
# freshly queried (ungrouped) df1 as its input.
df1 = (
    df1
    .groupby('TiD')['Item']
    .apply(list)
    .reset_index()
)
print(df1)

         TiD                                               Item
0      43659  [716, 712, 771, 778, 774, 709, 777, 711, 773, ...
1      43660                                         [762, 758]
2      43661  [743, 712, 708, 716, 775, 747, 711, 742, 778, ...
3      43662  [730, 722, 726, 763, 753, 725, 738, 749, 733, ...
4      43663                                              [760]
...      ...                                                ...
31460  75119                                    [873, 921, 930]
31461  75120                                    [884, 878, 712]
31462  75121                                    [921, 707, 930]
31463  75122                                         [712, 878]
31464  75123                                    [712, 879, 878]

[31465 rows x 2 columns]


In [58]:
# Fetch the order date of every sales order: one row per distinct
# (SalesOrderID, OrderDate) pair, ordered by order id.
query2 = """
select SalesOrderID as TiD,
	OrderDate as EpocTime
from [CompanyX].[Sales].[SalesOrderHeader]
group by SalesOrderID, OrderDate 
order by SalesOrderID
"""

cursor.execute(query2)
result = cursor.fetchall()

# Build the frame from plain per-row tuples instead of np.asarray(result):
# each row mixes an integer id with a date value, so np.asarray can only
# represent them as one object-dtype array, which can leave 'TiD' as an
# object column rather than an integer one. The ndarray route also raises
# when the query returns no rows. From tuples, pandas infers the dtype of
# each column separately and an empty result yields an empty frame.
df2 = pd.DataFrame(
    [tuple(row) for row in result],
    columns=['TiD', 'EpocTime'],
)
print(df2)

         TiD   EpocTime
0      43659 2011-05-31
1      43660 2011-05-31
2      43661 2011-05-31
3      43662 2011-05-31
4      43663 2011-05-31
...      ...        ...
31460  75119 2014-06-30
31461  75120 2014-06-30
31462  75121 2014-06-30
31463  75122 2014-06-30
31464  75123 2014-06-30

[31465 rows x 2 columns]


In [61]:
# Attach each transaction's EpocTime to its item list. The inner join keeps
# only the TiD values that appear in both df1 and df2.
table1 = df1.merge(df2, how='inner', on='TiD')
print(table1)

         TiD                                               Item   EpocTime
0      43659  [716, 712, 771, 778, 774, 709, 777, 711, 773, ... 2011-05-31
1      43660                                         [762, 758] 2011-05-31
2      43661  [743, 712, 708, 716, 775, 747, 711, 742, 778, ... 2011-05-31
3      43662  [730, 722, 726, 763, 753, 725, 738, 749, 733, ... 2011-05-31
4      43663                                              [760] 2011-05-31
...      ...                                                ...        ...
31460  75119                                    [873, 921, 930] 2014-06-30
31461  75120                                    [884, 878, 712] 2014-06-30
31462  75121                                    [921, 707, 930] 2014-06-30
31463  75122                                         [712, 878] 2014-06-30
31464  75123                                    [712, 879, 878] 2014-06-30

[31465 rows x 3 columns]


In [62]:
# Write Table 1 to 'table1.csv' in the working directory, leaving out the
# DataFrame's positional index.
# TODO: one hot encoding for the items later
table1.to_csv(path_or_buf='table1.csv', index=False)

### Extract the table 2
Each row contains `<i, dr, t>`: `i` is the item, `dr` is the discount rate, and `t` is the period during which item `i` is sold at discount rate `dr`.

![Table 2](img/table2.png)