# Goals

1) Acquire transaction data from a local game store for the years 2021-23 <br>
2) Prepare data for use in future projects <br>
3) Document preparation process <br>

In [16]:
# Imports

# libraries
import pandas as pd
import regex as re

import os

# written files
import cata_lists.accessories as a
import cata_lists.board_games as b
import cata_lists.concessions as c
import cata_lists.paint_supplies as p
import cata_lists.rpg as r
import cata_lists.table_minis as m
import cata_lists.tcg as t
import cata_lists.other as o
import wrangle as w

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Read in Data and Create Dataframe

In [None]:
# read each annual export (sorted by its Date column) and stack them into one dataframe

annual_csv_paths = ('raw_data/2021-2022.csv',
                    'raw_data/2022-2023.csv',
                    'raw_data/2023-2024.csv')

df_2021, df_2022, df_2023 = (pd.read_csv(csv_path).sort_values('Date')
                             for csv_path in annual_csv_paths)

df = pd.concat([df_2021, df_2022, df_2023]).sort_values('Date')

Merged dataframe contains:
* 19,721 rows, each representing a transaction occurring between 2021 and 2023
* 51 columns providing details about those transactions

# Drop Irrelevant Columns
Future projects will focus on finding patterns in customer purchasing behavior, so I will restrict the data frame to only the columns that are relevant to that purpose.

In [None]:
# keep only the columns listed below; every other column is dropped from df

relevant_columns = [
    'Date',
    'Time',
    'Gross Sales',
    'Discounts',
    'Net Sales',
    'Customer ID',
    'Description',
    'Discount Name',
    'Event Type',
]

df = df[relevant_columns]

# Rename Columns
Columns were renamed for clarity and ease of use

In [None]:
# map each original export header to its short snake_case replacement
column_names = {
    'Date': 'date',
    'Time': 'time',
    'Gross Sales': 'gross_sales',
    'Discounts': 'discount_amount',
    'Net Sales': 'net_sales',
    'Customer ID': 'cust_id',
    'Description': 'cart',
    'Discount Name': 'discount_type',
    'Event Type': 'event_type',
}

df = df.rename(columns=column_names)

After dropping and renaming, the data contains the following columns:
* date
* time
* gross_sales
* discount_amount
* net_sales
* cust_id, id number for customer making purchase
* cart, string containing names of items bought
* discount_type, type of discount applied to purchase
* event_type, type of transaction

# Handle Null Values, and Check Column Data Types
<br>
* 3445 nulls in cust_id were imputed with 'unknown'<br>
* 10236 nulls in discount_type were imputed with 'no_discount'<br>
* 3 rows were dropped that contained null values in cart and no other useful data
* Cleaned columns containing dollar amounts and converted them to float

In [None]:
# handle null values
# missing customer ids and missing discount names get explicit placeholder labels
df['cust_id'] = df['cust_id'].fillna('unknown')
df['discount_type'] = df['discount_type'].fillna('no_discount')

# rows without a cart value are dropped
df = df.dropna(subset=['cart'])

# convert dollars from string to float
# regex=False makes '$' and ',' literal on every pandas version: under the old
# regex=True default '$' is an end-of-string anchor, and pandas warns that
# single-character patterns will stop being treated as literals
for money_column in ('net_sales', 'gross_sales', 'discount_amount'):
    df[money_column] = (df[money_column]
                        .str.replace('$', '', regex=False)
                        .str.replace(',', '', regex=False)
                        .astype(float))

# Set Datetime as Index and add Time Derivative Columns

In [None]:
# combine the date and time strings into one timestamp column
df['datetime'] = pd.to_datetime(df['date'] + ' ' + df['time'])

# index by the timestamp, order chronologically, and drop the source columns
df = (df.set_index('datetime')
        .sort_index()
        .drop(columns=['date', 'time']))

# get time derivative columns from the datetime index
df = df.assign(year=df.index.year,
               month=df.index.month,
               day=df.index.day,
               weekday=df.index.day_name())

# Get list of All Unique Items Appearing in Cart
<br>
Clean values in cart<br>
* Convert to lowercase<br>
* Remove punctuation<br>
* Reduce wordiness<br>
* Remove space before and after text<br>
* Replace spaces between words with underscores




In [None]:
# clean values in cart to remove extra punctuation and wordiness

def clean_text_in_cart(value):
    """Normalize a comma-separated cart string, one item at a time.

    Each item is lowercased, has the '(regular)' marker and the two beverage
    notes removed, loses every character other than a-z, 0-9, whitespace and
    underscore, is stripped of surrounding whitespace, and has its spaces
    replaced by underscores (runs of underscores collapse to one).

    Returns the cleaned items joined back together with commas.
    """
    wordy_phrases = ('(regular)',
                     '  - too much caffeine',
                     '  - carbonated beverage')

    def _clean_item(raw_item):
        text = raw_item.lower()

        for phrase in wordy_phrases:
            text = text.replace(phrase, '')

        text = re.sub(r'[^a-z0-9\s_]', '', text)
        text = text.strip().replace(' ', '_')

        return re.sub('_+', '_', text)

    return ','.join(_clean_item(raw_item) for raw_item in value.split(','))
    

df['cart'] = df['cart'].apply(clean_text_in_cart)


# Get string of all values in cart separated by commas (each value is prefixed with a comma)
items = ''.join(',' + value for value in df['cart'])

# get master list: split the string on commas, skip empty entries, remove the
# 'N_x_' quantity markers, de-duplicate, and sort
unique_item_names = {re.sub(r'\d+_x_', '', entry)
                     for entry in items.split(',')
                     if entry != ''}

master_list = sorted(unique_item_names)

# Add Item Count Columns

Add a column for each item in unique items list showing the number of that item involved in each transaction

In [None]:
def get_number_of_items(value, item):
    """Count how many units of `item` appear in a cleaned cart string.

    Parameters
    ----------
    value : str
        Comma-separated cart entries, each either ``name`` or ``N_x_name``.
    item : str
        Item name to count.

    Returns
    -------
    int
        Sum over the entries that are exactly `item`: N for an ``N_x_item``
        entry, 1 for a bare ``item`` entry, 0 when no entry matches.
    """
    # The lookarounds pin the match to a whole entry: it must start at the
    # beginning of the string or right after a comma, and end at the end of the
    # string or right before a comma. Without the left anchor an item that is
    # the tail of another item's name (soda in cream_soda) was counted as well.
    pattern = rf"(?<![^,])(?:(\d+)_x_)?{re.escape(item)}(?![^,])"

    # one captured quantity per matching entry; empty when there is no N_x_ prefix
    quantities = re.findall(pattern, value)

    return sum(int(quantity) if quantity else 1 for quantity in quantities)

# Compute every item-count column first and attach them with a single concat.
# Inserting the columns one at a time (df[item] = ...) fragments the frame when
# the master list is long, which pandas reports with a PerformanceWarning.
item_counts = pd.DataFrame(
    {item: df.cart.apply(get_number_of_items, args=(item,)).to_numpy()
     for item in master_list},
    index=df.index,
)

# an item that shares its name with an existing column replaces that column
# (it then sits with the other item-count columns)
clashing_columns = [col for col in item_counts.columns if col in df.columns]

df = pd.concat([df.drop(columns=clashing_columns), item_counts], axis=1)

# Add Category Count Columns

Manually divide items in master list into category lists using the following definitions:

|Category|Description|Examples|
|--------|-----------|--------|
|Accessories|Items that enhance game play or are used to store game play items|Binders, Dice, Card Sleeves|
|Board Games|Self contained board games and board game expansions|Terraforming Mars, LOTR Journeys in Middle Earth|
|Concessions|Food and drink items|drinks, candy|
|Minis/Models|Miniature models, contained in customizable table top minis games or sold as stand alone minis or sets of minis. Does not include board games that contain minis|Warhammer Minis, D&D Minis|
|Modeling Supplies|Items used to enhance appearance of minis/models|Painting Supplies, Model Bases|
|Role Playing Games|Books and map packs for Role Playing Games|Dungeons and Dragons Books, Pathfinder Books|
|Trading Card Games|Cards for customizable card games|Magic, Pokemon, Yu-Gi-Oh!|
|Game Room Rental|Items that relate to renting the game room| n/a|
|Other| Items that could not be classified|Custom Amount|

Store lists in cata_lists folder

Add a column to the dataframe that shows the number of items in each category involved in each transaction

In [None]:
# pair each category total column with the list of item-count columns that feed it
category_items = {
    'accessories': a.accessory_list,
    'board_games': b.board_game_list,
    'concessions': c.concessions_list,
    'modeling_supplies': p.paint_supplies_list,
    'role_playing_games': r.rpg_list,
    'minis_models': m.table_minis_list,
    'trading_card_games': t.tcg_list,
    'other': o.other_list,
    'game_room_rental': o.game_room_list,
    'all_items': master_list,
}

# total every category before any total column is added, so each sum is taken
# over the dataframe as it stands at this point
category_totals = {category: df[item_columns].sum(axis=1)
                   for category, item_columns in category_items.items()}

# add the totals to the original dataframe
for category, total in category_totals.items():
    df[category] = total

# Save Dataframe to CSV File

In [None]:
# write the prepared dataframe to disk; index_label=False leaves the index name out of the header row
prepared_csv_path = 'prepared_store_data.csv'
df.to_csv(prepared_csv_path, index_label=False)

# Refactor Wrangle Code into Functions and Store in a wrangle.py File

In [19]:
# load the prepared dataframe through the project's wrangle module (imported as w)
# NOTE(review): presumably get_prepared_data() runs the same preparation steps as the cells above - confirm in wrangle.py
df = w.get_prepared_data()

  df['net_sales'] = df['net_sales'].str.replace('$', '').str.replace(',', '').astype(float)
  df['gross_sales'] = df['gross_sales'].str.replace('$', '').str.replace(',', '').astype(float)
  df['discount_amount'] = df['discount_amount'].str.replace('$', '').str.replace(',', '').astype(float)
  df[item] = df.cart.apply(get_number_of_items, args=(item,))


In [20]:
# display the prepared dataframe
df

Unnamed: 0_level_0,gross_sales,discount_amount,net_sales,cust_id,cart,discount_type,event_type,year,month,day,...,accessories,board_games,concessions,modeling_supplies,role_playing_games,minis_models,trading_card_games,other,game_room_rental,all_items
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-01 15:07:02,11.99,0.00,11.99,T49C25V8WS37VB4RSJTBN13TSR,dragon_shield_sleeves_matte_blue,no_discount,Payment,2021,1,1,...,1,0,0,0,0,0,0,0,0,1
2021-01-01 15:09:20,5.99,0.00,5.99,F36VZJMBMH3PH5SN664GPV4NQR,ultra_pro100_deck_box_white_2020_aw12892,no_discount,Payment,2021,1,1,...,1,0,0,0,0,0,0,0,0,1
2021-01-01 16:30:31,31.25,0.00,31.25,BBBRQPD57S3YV4GJG1CCB6ATGC,"candy,dungeons_dragons_icewind_dale,rime_of_th...",no_discount,Payment,2021,1,1,...,0,0,1,0,1,1,0,0,0,3
2021-01-01 16:31:27,21.25,0.00,21.25,W67T53BZFD5VQ8J3W7387X8SW0,"candy,double_sided_battlemap_chx96246",no_discount,Payment,2021,1,1,...,0,0,1,0,0,1,0,0,0,2
2021-01-01 19:36:24,257.14,-25.50,231.64,F4RYR1ARCD7ZDD4E9PFYRVF6H0,"dex_binder,3_x_custom_amount,candy,soda,most_d...",Military,Payment,2021,1,1,...,1,0,2,0,0,0,0,4,0,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-31 16:06:59,23.73,-3.56,20.17,9QF5CQDWN165DD0QQTJDESHQN0,"gwco_ultramarines_blue_2918,gwla_thunderhawk_b...",15% off entire sale (up to $15.00 off),Payment,2023,12,31,...,0,0,1,3,0,0,0,0,0,4
2023-12-31 17:07:49,12.98,0.00,12.98,unknown,"wilds_of_eldraine_set_booster,streets_of_new_c...",no_discount,Payment,2023,12,31,...,0,0,0,0,0,0,2,0,0,2
2023-12-31 17:11:52,11.06,-1.11,9.95,8V1DCCMP0104HDHCTCJRKZ7HTG,"water_bottle,aw_root_beer,lance_toast_chee_cra...",10%,Payment,2023,12,31,...,0,0,3,0,0,1,1,0,0,5
2023-12-31 18:41:21,59.98,0.00,59.98,4FH2YZRSWS1P7F2Y9CP65AA37R,2_x_spots,no_discount,Payment,2023,12,31,...,0,2,0,0,0,0,0,0,0,2
