In [35]:
import pandas as pd
import numpy as np
from datetime import date

In [36]:
#Loading the data
today = str(date.today())
df_branch_service = pd.read_json("branch_service_transaction_info.json")
df_customer_transaction = pd.read_json("customer_transaction_info.json")

In [37]:
#Profiling the data
df_branch_service.head(10)

Unnamed: 0,txn_id,branch_name,service,price
0,TXN-24546,MallOfAsia,Manicure,
1,TXN-14642,Starmall,HairColor,
2,TXN-60295,SmallMall,FootSpa,
3,TXN-60295,Starmall,FootSpa,
4,TXN-60295,MayMall,FootSpa,
5,TXN-60295,FrankMall,FootSpa,
6,TXN-40462,Starmall,HairColor,
7,TXN-40462,Megamall,HairColor,
8,TXN-08102,RobinsonsMall,HairColor,
9,TXN-08102,SmallMall,HairColor,


In [38]:
df_customer_transaction.head(10)

Unnamed: 0,txn_id,avail_date,last_name,first_name,birthday
0,TXN-24546,2030-09-08,ORTIZ,EDUARDO,1990-07-08
1,TXN-14642,2026-05-26,NIENOW,LEA,2000-11-26
2,TXN-60295,2006-09-25,LESCH,FLETA,1993-05-22
3,TXN-60295,2006-09-25,LESCH,FLETA,1993-05-22
4,TXN-60295,2006-09-25,LESCH,FLETA,1993-05-22
5,TXN-60295,2006-09-25,LESCH,FLETA,1993-05-22
6,TXN-40462,2021-08-21,KUHN,TOD,2002-11-25
7,TXN-40462,2021-08-21,KUHN,TOD,2002-11-25
8,TXN-08102,2010-04-03,JOHNSON,MILTON,2003-07-10
9,TXN-08102,2010-04-03,JOHNSON,MILTON,2003-07-10


In [39]:
print(df_branch_service.shape)
print(df_customer_transaction.shape)

(130653, 4)
(130653, 5)


In [40]:
print(df_branch_service['txn_id'].nunique())
print(df_customer_transaction['txn_id'].nunique())

62354
62354


In [41]:
#Dropping duplicates
df_branch_service = df_branch_service.drop_duplicates(subset=['txn_id'])
df_customer_transaction = df_customer_transaction.drop_duplicates(subset=['txn_id'])

In [42]:
#confirming shape
print(df_branch_service.shape)
print(df_customer_transaction.shape)

(62354, 4)
(62354, 5)


In [43]:
#checking null values count per column
df_branch_service.isnull().sum()

txn_id             0
branch_name     8880
service            0
price          11082
dtype: int64

In [44]:
df_customer_transaction.isnull().sum()

txn_id        0
avail_date    0
last_name     0
first_name    0
birthday      0
dtype: int64

In [45]:
#fill up null values with forward fill for branch_name
df_branch_service['branch_name'].unique()

array(['MallOfAsia', 'Starmall', 'SmallMall', 'RobinsonsMall', '', None,
       'MayMall', 'FrankMall', 'N/A', 'Megamall'], dtype=object)

In [46]:
#'' and None are empty values, fill them with forward fill
# first, let '' be null
df_branch_service['branch_name'] = df_branch_service.replace('',np.nan).groupby('txn_id')['branch_name'].transform('first')
# then fill null, use forward and backward to fill
df_branch_service['branch_name'] = df_branch_service['branch_name'].ffill().bfill()

In [47]:
#confirming branch_name has no null
df_branch_service.isnull().sum()

txn_id             0
branch_name        0
service            0
price          11082
dtype: int64

In [48]:
df_branch_service['branch_name'].unique()

array(['MallOfAsia', 'Starmall', 'SmallMall', 'RobinsonsMall', 'MayMall',
       'FrankMall', 'N/A', 'Megamall'], dtype=object)

In [49]:
# group mean fill prices
df_branch_service['price'] = df_branch_service['price'].fillna(df_branch_service.groupby(['branch_name','service'])['price'].transform('mean'))

In [50]:
#confirming price has no null
df_branch_service.isnull().sum()

txn_id         0
branch_name    0
service        0
price          0
dtype: int64

In [51]:
df_branch_service.head(10)

Unnamed: 0,txn_id,branch_name,service,price
0,TXN-24546,MallOfAsia,Manicure,42.651993
1,TXN-14642,Starmall,HairColor,69.583267
2,TXN-60295,SmallMall,FootSpa,80.037329
6,TXN-40462,Starmall,HairColor,69.583267
8,TXN-08102,RobinsonsMall,HairColor,66.735226
12,TXN-64262,SmallMall,Rebond,312.646002
15,TXN-41934,RobinsonsMall,HairColor,66.735226
19,TXN-39050,Starmall,Manicure,43.364281
21,TXN-43150,Starmall,Haircut,51.769018
22,TXN-49423,RobinsonsMall,Rebond,302.828204


In [52]:
df_merged = pd.merge(df_customer_transaction, df_branch_service)

In [53]:
#profiling of merged dataframe
df_merged.isnull().sum()

txn_id         0
avail_date     0
last_name      0
first_name     0
birthday       0
branch_name    0
service        0
price          0
dtype: int64

In [54]:
df_merged.sample(20)

Unnamed: 0,txn_id,avail_date,last_name,first_name,birthday,branch_name,service,price
21946,TXN-29260,2027-08-28,wiegand,braden,2001-08-08,Starmall,HairColor,0.0
46960,TXN-15296,2021-03-05,Schamberger,Albina,1991-04-02,Megamall,Haircut,66.123457
24235,TXN-51684,2015-12-30,Heidenreich,Bradley,2001-01-16,Starmall,NailColor,30.12379
18574,TXN-16459,2015-08-08,spencer,jaylin,1990-10-02,Megamall,Haircut,0.0
24315,TXN-57405,2012-01-16,Daniel,Monroe,1999-11-25,MallOfAsia,Haircut,66.123457
857,TXN-21600,2024-05-20,JAKUBOWSKI,LYRIC,1990-04-11,SmallMall,Haircut,51.276746
60596,TXN-18997,2012-12-24,Kuhlman,Ebba,2001-02-19,RobinsonsMall,FootSpa,100.12123
17158,TXN-42402,2017-12-31,christiansen,shannon,2002-08-16,SmallMall,Pedicure,0.0
281,TXN-14505,2014-12-30,KEEBLER,RAFAEL,2005-01-12,FrankMall,HairColor,67.045741
25638,TXN-52627,2026-07-11,Kozey,Lessie,1995-03-06,RobinsonsMall,NailColor,30.12379


In [55]:
#filter alphabet only in last_name and first_name
df_merged['last_name'] = df_merged['last_name'].str.replace('\W', '', regex=True)
df_merged['first_name'] = df_merged['first_name'].str.replace('\W', '', regex=True)

  df_merged['last_name'] = df_merged['last_name'].str.replace('\W', '', regex=True)
  df_merged['first_name'] = df_merged['first_name'].str.replace('\W', '', regex=True)


In [56]:
#uppercase for last_name and first_name
df_merged['last_name'] = df_merged['last_name'].str.upper()
df_merged['first_name'] = df_merged['first_name'].str.upper()

In [57]:
#check if no special characters and all uppercase for names
df_merged.sample(20)

Unnamed: 0,txn_id,avail_date,last_name,first_name,birthday,branch_name,service,price
28271,TXN-22367,2024-03-23,FRIESEN,EARNEST,2001-06-03,MallOfAsia,HairColor,88.09393
40593,TXN-45127,2006-06-07,ORN,JUANITA,1996-04-22,Megamall,Haircut,66.123457
25033,TXN-15683,2009-03-27,HERMISTON,STEPHANY,1990-08-03,SmallMall,NailColor,30.12379
30251,TXN-60547,2014-04-13,SCHADEN,CARMELO,2009-09-30,Starmall,Haircut,66.123457
13636,TXN-08180,2010-04-13,HAHN,DANNY,2009-08-08,MallOfAsia,FootSpa,0.0
8299,TXN-10494,2010-11-15,FRANECKI,KAMRYN,1999-01-18,RobinsonsMall,Manicure,41.900441
45112,TXN-51633,2012-02-27,BREITENBERG,JARRED,1997-02-04,MayMall,Pedicure,77.987989
9014,TXN-56073,2028-09-04,RYAN,JEANIE,2007-04-03,Starmall,Pedicure,61.537398
30649,TXN-10352,2018-06-02,HICKLE,LEXUS,2008-01-01,MallOfAsia,FootSpa,100.12123
51801,TXN-58437,2027-02-17,HARBER,NEAL,1997-02-13,MayMall,NailColor,30.12379


In [58]:
#profiling date columns
print(df_merged['birthday'].min())
print(df_merged['birthday'].max())

print(df_merged['avail_date'].min())
print(df_merged['avail_date'].max())

print(df_merged['avail_date'].describe)
print(df_merged['birthday'].describe)

print(df_merged[(df_merged['avail_date'] <= df_merged['birthday'])])

1990-01-01
2010-12-30
2005-01-01
2030-12-30
<bound method NDFrame.describe of 0        2030-09-08
1        2026-05-26
2        2006-09-25
3        2021-08-21
4        2010-04-03
            ...    
62349    2022-11-21
62350    2020-04-14
62351    2030-01-26
62352    2021-02-27
62353    2025-01-05
Name: avail_date, Length: 62354, dtype: object>
<bound method NDFrame.describe of 0        1990-07-08
1        2000-11-26
2        1993-05-22
3        2002-11-25
4        2003-07-10
            ...    
62349    1997-03-06
62350    1996-05-28
62351    1996-05-28
62352    1995-03-29
62353    1993-01-10
Name: birthday, Length: 62354, dtype: object>
          txn_id  avail_date  last_name first_name    birthday    branch_name  \
9      TXN-49423  2008-08-13      BERGE    RASHEED  2010-02-05  RobinsonsMall   
17     TXN-35244  2005-11-23     HAMMES     EUNICE  2006-06-13      SmallMall   
97     TXN-23898  2005-05-28   BOTSFORD     GIANNI  2009-03-30  RobinsonsMall   
101    TXN-17974  2008-04-12  

In [59]:
#data type of birthday and avail_date should be datetime instead of object
df_merged['avail_date'] = pd.to_datetime(df_merged['avail_date'], format='%Y-%m-%d')
df_merged['birthday'] = pd.to_datetime(df_merged['birthday'], format='%Y-%m-%d')

#confirming
print(df_merged['avail_date'].describe)
print(df_merged['birthday'].describe)

<bound method NDFrame.describe of 0       2030-09-08
1       2026-05-26
2       2006-09-25
3       2021-08-21
4       2010-04-03
           ...    
62349   2022-11-21
62350   2020-04-14
62351   2030-01-26
62352   2021-02-27
62353   2025-01-05
Name: avail_date, Length: 62354, dtype: datetime64[ns]>
<bound method NDFrame.describe of 0       1990-07-08
1       2000-11-26
2       1993-05-22
3       2002-11-25
4       2003-07-10
           ...    
62349   1997-03-06
62350   1996-05-28
62351   1996-05-28
62352   1995-03-29
62353   1993-01-10
Name: birthday, Length: 62354, dtype: datetime64[ns]>


In [60]:
#removing rows with later date than current date
df_merged = df_merged[(df_merged['avail_date'] <= today) & (df_merged['birthday'] <= today)]

In [61]:
#removing rows when avail_date happens before birthday
df_merged = df_merged[(df_merged['avail_date'] > df_merged['birthday'])]

In [62]:
#confirming
print(df_merged['birthday'].max())
print(df_merged['avail_date'].max())

print(df_merged[(df_merged['avail_date'] <= df_merged['birthday'])])

2010-12-30 00:00:00
2023-11-23 00:00:00
Empty DataFrame
Columns: [txn_id, avail_date, last_name, first_name, birthday, branch_name, service, price]
Index: []


In [63]:
df_merged.sample(20)

Unnamed: 0,txn_id,avail_date,last_name,first_name,birthday,branch_name,service,price
54630,TXN-45593,2017-03-07,HICKLE,QUENTIN,2009-10-17,MayMall,HairColor,88.09393
21852,TXN-65431,2008-03-11,LUEILWITZ,LELIA,1998-03-13,SmallMall,Pedicure,0.0
5155,TXN-34399,2019-07-14,PARKER,JEREMIE,1991-03-02,MayMall,NailColor,22.7163
29039,TXN-50722,2022-08-30,YUNDT,GRETCHEN,2007-06-02,Starmall,NailColor,30.12379
25174,TXN-14299,2009-01-19,WUCKERT,TERENCE,1991-12-12,RobinsonsMall,Manicure,55.2324
25902,TXN-22920,2009-12-09,GLEASON,JERAMY,2004-02-13,Starmall,Haircut,66.123457
1988,TXN-23014,2010-01-08,DIBBERT,GLENDA,1993-04-30,RobinsonsMall,FootSpa,75.620543
25827,TXN-00282,2017-03-19,HANSEN,ANIKA,2005-08-28,MayMall,HairColor,88.09393
11665,TXN-64803,2017-03-10,MARQUARDT,KAY,1992-08-12,RobinsonsMall,Haircut,0.0
4889,TXN-06643,2022-04-20,HILPERT,JACEY,2009-02-01,Megamall,HairColor,69.96471


In [64]:
df_merged.shape

(43136, 8)

In [81]:
!pip install psycopg2-binary

Collecting psycopg2-binary
  Using cached psycopg2_binary-2.9.9-cp312-cp312-win_amd64.whl.metadata (4.6 kB)
Using cached psycopg2_binary-2.9.9-cp312-cp312-win_amd64.whl (1.2 MB)
Installing collected packages: psycopg2-binary


ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\juanc\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\psycopg2\\_psycopg.cp312-win_amd64.pyd'
Consider using the `--user` option or check the permissions.



In [82]:
import psycopg2

In [84]:
conn = psycopg2.connect("dbname=transaction user=postgres password=postgres")

In [85]:
cur = conn.cursor()

In [86]:
cur.execute("create table if not exists transaction_table (txn_id varchar(45) not null primary key,avail_data date, last_name varchar(20),birthday date,branch_name varchar(30), service varchar(30), price double precision)")