# Goals

1) Aquire transaction data from a local game store for the years 2021-23 <br>
2) Prepare data for use in future projects <br>
3) Document preperation process <br>

In [1]:
# Imports

# libraries
import pandas as pd
import regex as re

import os

# writen files
import cata_lists.accessories as a
import cata_lists.board_games as b
import cata_lists.concessions as c
import cata_lists.paint_supplies as p
import cata_lists.rpg as r
import cata_lists.table_minis as m
import cata_lists.tcg as t
import cata_lists.other as o



# Read in Data and Create Dataframe

In [2]:
# combine annual csv files into one dataframe

df_2021 = pd.read_csv('raw_data/2021-2022.csv').sort_values('Date')
df_2022 = pd.read_csv('raw_data/2022-2023.csv').sort_values('Date')
df_2023 = pd.read_csv('raw_data/2023-2024.csv').sort_values('Date')


df = pd.concat([df_2021, df_2022, df_2023]).sort_values('Date')

Merged dataframe contains:
* 19,721 rows, each representing a transaction occuring between 2021 and 2023
* 51 columns providing details about those transactions

# Drop Irrelevant Colunms
Future projects will focus on finding patterns in customer purchasing behavior, so I will restrict the data frame only to coluns that are relevant to that purpose

In [3]:
# remake df with only relevant columns

df = df[['Date',
         'Time',
         'Gross Sales',
         'Discounts',
         'Net Sales',
         'Transaction ID',
         'Customer ID', 
         'Description', 
         'Discount Name',
         'Event Type']]

# Rename Columns
Columns were renamed for clarity ease of use

In [4]:
df = df.rename(columns = {'Date' : 'date',
                          'Time' : 'time',
                          'Gross Sales' : 'gross_sales',
                          'Discounts': 'discount_amount',
                          'Net Sales' : 'net_sales',
                          'Transaction ID' : 'trans_id',
                          'Customer ID' : 'cust_id', 
                          'Description' : 'cart', 
                          'Discount Name' : 'discount_type',
                          'Event Type' : 'event_type'})

df.columns

Index(['date', 'time', 'gross_sales', 'discount_amount', 'net_sales',
       'trans_id', 'cust_id', 'cart', 'discount_type', 'event_type'],
      dtype='object')

After dropping and renaming data contains the following columns
* date
* time
* gross_sales
* discount_amount
* net_sales
* trans_id, id number for transaction
* cust_id, id number for customer making purchase
* cart, string containing names of items bought
* discount_type, type of discount applied to purchase
* event_type, type of transaction

# Handle Null Values, and Check Column Data Types
<br>
* 3445 nulls in cust_id were imputed with 'unknown'<br>
* 10236 nulls in discount_type were imputed with 'No Discount'<br>
* 3 rows were dropped that contained null values in cart and no other useful data
* Cleaned columns containing dollor amounts and converted them to float

In [5]:
# handle null values
df.cust_id = df.cust_id.fillna('unknown')

df.discount_type = df.discount_type.fillna('no_discount')

df = df.dropna(subset=['cart'])

# convert dollors from string to float
df['net_sales'] = df['net_sales'].str.replace('$', '').str.replace(',', '').astype(float)
df['gross_sales'] = df['gross_sales'].str.replace('$', '').str.replace(',', '').astype(float)
df['discount_amount'] = df['discount_amount'].str.replace('$', '').str.replace(',', '').astype(float)

  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()


# Set Datetime as Index and add Time Derivative Columns

In [6]:
# create datetime column and set it as the index
df['datetime'] = df.date + ' ' + df.time
df['datetime'] = pd.to_datetime(df['datetime'])

df = df.set_index('datetime').sort_index()

df = df.drop(columns = ['date', 'time'])

# get time derivative columns
df['year'] = df.index.year
df['month'] = df.index.month
df['day'] = df.index.day
df['weekday'] = df.index.day_name()  

# Get list of All Unique Items Appearing in Cart
<br>
Clean values in cart<br>
* Convert to lowercase<br>
* Remove punctuation<br>
* Reduce wordiness<br>
* Remove space before and after text





In [7]:
# clean values in cart to remove extra punctuation and wordiness

def clean_text_in_cart(value):
    
    value = (value.lower()
                  .replace('(regular)', '')
                  .replace('  - too much caffeine', '')
                  .replace('  - carbonated beverage', ''))
                  
    value = re.sub(r'[^a-z0-9\s,]', '' , value)
    
    return value
    

df['cart'] = df['cart'].apply(clean_text_in_cart)


# Get string of all values in cart seperated by commas

items = ''

for value in df['cart']:
    
    items += ',' + value
    
# get master list by splitting the string on comma and stripping the resulting values
master_list = list(set([re.sub(r'\d+ x ', '', item).strip().replace(' ', '_') for item in items.split(',') if item != '']))

master_list.sort()


for item in master_list:
    
    print(item)

000_60010199131
000__death_guard_plagueburst_crawler
000__death_guard_typhus
000_adeptus_astartes_mark_3_space_marines
000_adeptus_mechanicus_skitarii___5910
000_assault_intercessors__paints_set___6011
000_astra_militarum_cadian_command_squad
000_astra_militarum_cadian_heavy_weapons_squad___4719
000_astra_militarum_cadian_infantry_squad
000_astra_militarum_sentinel
000_chaos_space_marines
000_chaos_space_marines_dark_apostle
000_chaos_space_marines_raptors___4313
000_codex_adeptus_mechanicus
000_codex_astra_militarum
000_codex_chaos_space_marines
000_codex_necrons
000_codex_space_marines
000_codex_tau_empire
000_core_book
000_death_guard__myphitic_blighthauler
000_death_guard_deathshroud_bodyguard
000_death_guard_plagueburst_crawler
000_drukhari_kabalite_warriors___4507
000_elite_edition_starter_set
000_genestealer_cults_atalan_jackals___5162
000_imperial_knights_knight_preceptor_canis_rex
000_indomitus_playing_cards
000_killdakka_warband
000_necrons_immortals
000_necrons_psychomancer


dice_throne_season_1_monk_v_paladin
dice_throne_season_1_pyromancer_v_shadow_thief
dice_throne_season_1_treant_v_ninja
dice_throne_season_2
dice_throne_season_2_cursed_pirate_v_artificer
dice_throne_season_2_gunslinger
dice_throne_season_2_gunslinger_v_samurai
dice_throne_season_2_tactician_v_huntress
dice_throne_season_one_monk_v_paladin
dice_throne_season_one_pyromancer_v_shadow_thief
dice_throne_seraph_v_vampire_lord
dice_throne_vampire
dice_tray_raven
diedra_dark_willow
diet_coke
diet_dr_pepper
dig_deeper
digimon_ancient_dragon_starter_deck
digimon_battle_of_omni_booster_pack
digimon_card_game_great_legend_booster_pack
digimon_card_gamegreat_legend
digimon_classic_collection
digimon_double_diamond
digimon_new_awakening
digimon_parallel_world_deck
digimon_starter_deck_gallantmon
digimon_starter_deck_ulforce_veedramon
digimonnext_adventure
ding_dong
dingo_halfling_rogue
dinosaur_island
dinosaur_world
dire_bear_77494
dire_boar_77672
dire_crocodile_77670
disney_lorcana_booster_pack_dis

hagakure
hagar
hajad
hakon
half_elf_rogue_77753
half_orc_paladin
halfling_cooks_30044
halfling_rogue
halfling_scout_03526
hamlet_kickstarter
hanei
hangmans_gibbet_77619
happy_little_dinosaurs
happy_little_dinosaurs_56_player_expansion
happy_little_dinosaurs_dating_disasters
happy_little_dinosaurs_perils_of_puberty
harmonize_playmat
harry_potter_clue
harry_potter_jenga
harry_potter_munchkin
harry_potter_munchkin_deluxe
harry_potter_scrabble
healers_balm_dice
heat
heavy_metal_realmspace_rpg_dice_set
hell_cat_77327
hell_hound_77038
hellbore_the_assassin_02782
hellborn_rogue
hellborn_wizard_77149
here_to_slay
here_to_slay_berserker_exansion
here_to_slay_druids_expansion
heroes_of_dominaria_board_game
heroes_of_dominaria_premium_edition
heroic_cleric
heroic_dice_of_metallic_luster_black_with_red_font_7pc
heroic_dice_of_metallic_luster_black_with_silver_font_7pc
heroic_dice_of_metallic_luster_blue_and_gold_font_7pc
heroic_dice_of_metallic_luster_multicolor_7pc
heroic_dice_of_metallic_luster_

pathfinder_gamemastery_npc_pawn_collection
pathfinder_gm_screen___pzo2201
pathfinder_goblin_pyros_89002
pathfinder_goblin_warriors_89003
pathfinder_miniatures_gnome_male_bard
pathfinder_red_dragon_89001
pathfinder_the_fall_of_plaguestone___pzo9555
pathfinder_the_slithering___pzo9557
pathfinner_battles_deep_cuts_male_human_wizard
peace_tea_caddy_shack
peace_tea_just_peachy
peace_tea_razzleberry
peanut_mms
pepsi
petty_officer
phantom_premonition_commander_deck
phase_10
phyrexia_all_will_be_one_collector_booster_pack
phyrexia_all_will_be_one_collector_box
phyrexia_all_will_be_one_draft_booster_box
phyrexia_all_will_be_one_draft_booster_pack
phyrexia_all_will_be_one_set_booster_box
phyrexia_all_will_be_one_set_booster_pack
phyrexia_commander_corrupting_influence
phyrexia_prerelease
phyrexia_prerelease_kit
physician_80065
pictionary
pidlwick_ii
pig_cart_77657
pikachu_v_shining_fates
pikachu_vunion
pink_11_pcs_metal_dice
pink_7_pcs_metal_dice_set
pink_cats_eye_stone_dice
pink_cosmos_glass
pi

up_mtg_playmat_v9_19329
up_sleeves_black_aw2141_aw2279
upzone_ancient_zone
upzone_catherdral_zone
upzone_dungeon_zone
upzone_obsidian_zone
urban_structures_flashpoint_expansion
urshifu_v_box
ursula_dwarven_bear_rider_77353
ursula_silverbraid_03293
vagrantsong
valfuryx_77683
valley_of_the_dead_king
valor_and_villainy_deluxe_edition
vampire_bloodlords_07081
vampire_bloodlords_2_77727
vampire_rivals
vampire_the_masquerade_core_rulebook
vampire_the_masquerade_players_guide
vampire_the_masquerade_rivals_blood_and_alchemy
vampire_the_masquerade_rivals_shadows_and_shrouds
vampire_the_masquerade_rivals_the_wolf_and_the_rat
van_richtens_guide_to_ravenloft
vandora_waverunner
vandorendra
vanja
vashtorr
vbattle_deck_venusaur_vs_blastoise
velvet_folding_dice_tower_black
velvet_folding_dice_tower_purple
velvet_folding_dice_tray_green
velvet_folding_dice_tray_purple
velvet_folding_dice_tray_with_leather_backing_black___533
velvet_folding_dice_tray_with_leather_backing_blue___532
velvet_folding_dice_t

In [8]:
master_list

['000_60010199131',
 '000__death_guard_plagueburst_crawler',
 '000__death_guard_typhus',
 '000_adeptus_astartes_mark_3_space_marines',
 '000_adeptus_mechanicus_skitarii___5910',
 '000_assault_intercessors__paints_set___6011',
 '000_astra_militarum_cadian_command_squad',
 '000_astra_militarum_cadian_heavy_weapons_squad___4719',
 '000_astra_militarum_cadian_infantry_squad',
 '000_astra_militarum_sentinel',
 '000_chaos_space_marines',
 '000_chaos_space_marines_dark_apostle',
 '000_chaos_space_marines_raptors___4313',
 '000_codex_adeptus_mechanicus',
 '000_codex_astra_militarum',
 '000_codex_chaos_space_marines',
 '000_codex_necrons',
 '000_codex_space_marines',
 '000_codex_tau_empire',
 '000_core_book',
 '000_death_guard__myphitic_blighthauler',
 '000_death_guard_deathshroud_bodyguard',
 '000_death_guard_plagueburst_crawler',
 '000_drukhari_kabalite_warriors___4507',
 '000_elite_edition_starter_set',
 '000_genestealer_cults_atalan_jackals___5162',
 '000_imperial_knights_knight_preceptor_c

# Add Item Count Columns

Add a column for each item in unique items list showing the number of that item involved in each transaction

In [9]:
# def get_number_of_items(value, item):
    
#     pattern = rf"((\d+ x )?\b{re.escape(item)}\b)"

#     matches = re.findall(pattern, value)

#     total = sum([int(match[0][0]) if match[0][0].isdigit() == True else 1 for match in matches ])

#     return total 

# for item in master_list:
    
#     df[item] = df.cart.apply(get_number_of_items, args=(item,))

In [10]:
df = pd.read_csv('prepared_store_data.csv')

In [11]:
dfcart

NameError: name 'dfcart' is not defined

In [None]:
value = '2 x this item, 6 x this item, 5 x this other item, this one item, this item'
item = 'this item'

pattern = rf"((\d+ x )?\b{re.escape(item)}\b)"

matches = re.findall(pattern, value)

matches = sum([int(match[0][0]) if match[0][0].isdigit() == True else 1 for match in matches ])

matches

In [None]:
# seperate item count columns by category into differint dataframes
df_acc = df[a.accessory_list]
df_bg = df[b.board_game_list]
df_con = df[c.concessions_list]
df_ps = df[p.paint_supplies_list]
df_rpg = df[r.rpg_list]
df_tm = df[m.table_minis_list]
df_tcg = df[t.tcg_list]
df_other = df[o.other_list]
df_room = df[o.game_room_list]
df_master = df[master_list]

# get total for each 
df['accessories'] = df_acc.sum(axis=1)
df['board_games'] = df_bg.sum(axis=1)
df['concessions'] = df_con.sum(axis=1)
df['modeling_supplies'] = df_ps.sum(axis=1)
df['role_playing_games'] = df_rpg.sum(axis=1)
df['minis_models'] = df_tm.sum(axis=1)
df['trading_card_games'] = df_tcg.sum(axis=1)
df['other'] = df_other.sum(axis=1)
df['game_room_rental'] = df_room.sum(axis=1)
df['all_items'] = df_master.sum(axis=1)


In [None]:
df.to_csv('prepared_store_data.csv', index_label=False)