# Task 1: Building a Transaction Database in Google BigQuery


## In this task you’ll upload all Wedge transaction records to Google BigQuery. You’ll want to make sure that the column data types are correctly specified and that you’ve properly handled the null values. 


Note: this assignment can be done manually or programmatically. Naturally I’d prefer it be done programmatically so that you get more practice, but that’s not required to get full credit. 


In [None]:
#import packages and libraries
import os
import re
import io
import datetime
import sqlite3
import csv

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas_gbq
import janitor
import pprint

import zipfile
from zipfile import ZipFile

# Google Big Query 
from google.cloud import bigquery
from google.oauth2 import service_account

In [None]:
# Where the service-account key lives, and which BigQuery project/dataset
# the rest of the notebook talks to.
service_path = "/Users/meganalbee/Desktop/ADA/key/"
service_file = "albee-msba-4037d70faf07.json"  # key file name
gbq_proj_id = "albee-msba"  # BigQuery project id
dataset_id = "wedge_msba"  # dataset holding the Wedge tables

# Full path to the key file: directory followed directly by the file name
# (service_path already ends with a slash).
private_key = "".join((service_path, service_file))

In [None]:
# Load the service-account credentials from the key file so the BigQuery
# client below is authorized against the project.
credentials = service_account.Credentials.from_service_account_file(private_key)

In [None]:
# Open the BigQuery connection for our project using the credentials above.
client = bigquery.Client(project=gbq_proj_id, credentials=credentials)

## Check datasets and project in GBQ

In [None]:
# Print the fully-qualified id of every dataset the client can list.
for dataset in client.list_datasets():
    print(dataset.full_dataset_id)

In [None]:
# Print the id of each table currently listed in the dataset.
tables = client.list_tables(dataset_id)

for table in tables:
    if not table:
        continue
    print(table.table_id)

In [None]:
# Matches any string containing a run of twelve consecutive non-digit
# characters (captured as group 1); used below to pick tables by id.
file_pattern = re.compile(pattern=r"(\D{12})")

In [None]:
# Delete every table in the dataset whose id matches file_pattern,
# reporting each one as it is removed.
for table in client.list_tables(dataset_id):
    if not file_pattern.search(table.table_id):
        continue

    table_id = f"{gbq_proj_id}.{dataset_id}.{table.table_id}"
    client.delete_table(table_id, not_found_ok=True)

    print(f"Deleted {table.table_id}.")

## Extract, Clean and Upload Files

In [None]:
# Column names (still wrapped in double quotes) for the transaction files,
# in file order. Taken from earlier exploratory analysis.
headers = [
    '"datetime"',
    '"register_no"',
    '"emp_no"',
    '"trans_no"',
    '"upc"',
    '"description"',
    '"trans_type"',
    '"trans_subtype"',
    '"trans_status"',
    '"department"',
    '"quantity"',
    '"Scale"',
    '"cost"',
    '"unitPrice"',
    '"total"',
    '"regPrice"',
    '"altPrice"',
    '"tax"',
    '"taxexempt"',
    '"foodstamp"',
    '"wicable"',
    '"discount"',
    '"memDiscount"',
    '"discountable"',
    '"discounttype"',
    '"voided"',
    '"percentDiscount"',
    '"ItemQtty"',
    '"volDiscType"',
    '"volume"',
    '"VolSpecial"',
    '"mixMatch"',
    '"matched"',
    '"memType"',
    '"staff"',
    '"numflag"',
    '"itemstatus"',
    '"tenderstatus"',
    '"charflag"',
    '"varflag"',
    '"batchHeaderID"',
    '"local"',
    '"organic"',
    '"display"',
    '"receipt"',
    '"card_no"',
    '"store"',
    '"branch"',
    '"match_id"',
    '"trans_id"',
]

In [None]:
# Same column names with the double quotes removed; these are supplied as
# the column names when a file has no header row of its own.
clean_headers = [quoted_name.replace('"', "") for quoted_name in headers]

In [None]:
# Names of the entries in the folder that holds the zipped transaction files.
zip_files = os.listdir(path="WedgeZipOfZips/")

In [None]:
# Extract every CSV from every zip archive, normalize its columns, and upload
# it to BigQuery as one table per file (table name = file name minus ".csv").

# Delimiter detected for each CSV, keyed by the file's name inside its archive.
delimiters = dict()
# Not used in this cell; kept because later cells may reference it.
obj_columns = []

# Folder that zip_files was listed from.
zip_dir = "WedgeZipOfZips/"

# Columns forced to pandas' nullable "string" dtype before upload, because
# their inferred types threw errors in GBQ.
string_columns = [
    "altprice",
    "itemstatus",
    "display",
    "local",
    "batchheaderid",
    "match_id",
    "organic",
    "percentdiscount",
    "receipt",
    "matched",
    "staff",
    "scale",
    "taxexempt",
    "branch",
    "trans_id",
    "memtype",
    "varflag",
    "wicable",
    "numflag",
    "voided",
    "volume",
    "tax",
    "foodstamp",
    "tenderstatus",
    "trans_status",
]

for this_zf in zip_files:
    zip_path = zip_dir + this_zf

    # os.listdir returns everything in the folder; skip entries that are not
    # zip archives instead of crashing on them.
    if not zipfile.is_zipfile(zip_path):
        continue

    with ZipFile(zip_path, 'r') as zf:
        for file_name in zf.namelist():
            # Directory entries inside the archive carry no data.
            if file_name.endswith("/"):
                continue

            # Peek at the first line only, on a handle of its own, so that the
            # full read below starts from the top of the file and no row is
            # lost to delimiter/header detection.
            with io.TextIOWrapper(zf.open(file_name, 'r'), encoding="utf-8") as peek_file:
                first_line = peek_file.readline()

            # Nothing to sniff or upload in an empty file.
            if not first_line.strip():
                continue

            dialect = csv.Sniffer().sniff(sample=first_line,
                                          delimiters=[",", ";", "\t"])
            delimiters[file_name] = dialect.delimiter

            # A header row is recognized by its first field being "datetime"
            # (possibly quoted, possibly preceded by a byte-order mark).
            first_field = first_line.lstrip("\ufeff").split(dialect.delimiter, 1)[0]
            has_header = first_field.strip().strip('"').lower() == "datetime"

            # If there is a header, let pandas read it; if not, supply
            # clean_headers as the column names.
            with io.TextIOWrapper(zf.open(file_name, 'r'), encoding="utf-8") as input_file:
                if has_header:
                    df = pd.read_csv(input_file, sep=delimiters[file_name],
                                     encoding="utf-8")
                else:
                    df = pd.read_csv(input_file, sep=delimiters[file_name],
                                     names=clean_headers, encoding="utf-8")

            # Normalize column names; the lower-case names used below
            # (e.g. "altprice", "batchheaderid") rely on this step.
            df = janitor.clean_names(df)
            df['datetime'] = pd.to_datetime(df.datetime, format='%Y-%m-%d %H:%M:%S')

            # Department as text, with missing values as ''. The null mask is
            # taken before the cast: astype("str") would otherwise turn NaN
            # into the literal text 'nan' and leave nothing to fill.
            department = df['department']
            df['department'] = department.astype("str").where(department.notna(), '')

            # These columns throw errors in GBQ unless they are strings.
            df = df.astype({column: "string" for column in string_columns})

            # Any column still holding generic Python objects is cast to str.
            object_columns = [column for column in df.columns
                              if df[column].dtypes == "object"]
            df = df.astype({column: 'str' for column in object_columns})

            table_name = file_name.replace('.csv', '')
            table_id = ".".join([gbq_proj_id, dataset_id, table_name])
            pandas_gbq.to_gbq(df, table_id, project_id=gbq_proj_id, if_exists="replace")
