Import Libraries

In [15]:
import requests
import re
from bs4 import BeautifulSoup
import json
import shutil
import gzip
import os
import wget
import tarfile
import numpy as np
import pandas as pd
from pandasql import sqldf
from time_converter import Time
import time
import datetime as dt
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="whitegrid")

We will be looking at the data quality of the receipts dataset

In [13]:
def cleanIdCpg(id_data, column=None):
    """Extract the bare id string from each cell of a serialized id column.

    Every string cell is scanned for single-quoted tokens and the last token
    found is kept as the id (presumably cells look like ``{'$oid': '...'}``;
    verify against the source CSV).

    Parameters
    ----------
    id_data : pandas.DataFrame
        Frame that holds the column to clean.
    column : str
        Name of the column to read. The value is formatted into a string
        before the lookup, so the default ``None`` looks up a column that is
        literally named ``'None'``.

    Returns
    -------
    list
        One entry per row, in row order. Cells that are not strings (for
        example missing values) or that contain no single-quoted token are
        passed through unchanged instead of raising.
    """
    # Read the values directly: going through ``to_dict()`` would collapse
    # rows that share an index label and yield fewer ids than there are rows.
    raw_values = id_data[f'{column}'].tolist()
    cleaned = []
    for raw in raw_values:
        quoted = re.findall("'([^']*)'", raw) if isinstance(raw, str) else []
        cleaned.append(quoted[-1] if quoted else raw)
    return cleaned

In [11]:
def cleanDateLog(id_data, column=None):
    """Convert a column of serialized epoch timestamps into date strings.

    For each string cell: ``}`` is dropped, the text between the first and
    second ``:`` is taken, spaces are removed, and the first 10 characters are
    read as an integer POSIX timestamp. That timestamp is converted with
    ``Time(..., 'posix').to('dt')`` and formatted as ``MM/DD/YYYY``; the
    time-of-day part is not returned.
    NOTE(review): keeping 10 characters presumably truncates a millisecond
    epoch to seconds - confirm against the source data.

    Parameters
    ----------
    id_data : pandas.DataFrame
        Frame that holds the column to convert.
    column : str
        Name of the column to read. The value is formatted into a string
        before the lookup.

    Returns
    -------
    list of str
        One entry per row, in row order. A cell whose processing raises
        ``AttributeError`` (e.g. a non-string cell, which has no
        ``.replace``) is returned as ``str(cell)``. Other errors, such as a
        string without ``:`` or with a non-numeric payload, still propagate.
    """
    date_format = "%m/%d/%Y"
    # Read the values directly: going through ``to_dict()`` would collapse
    # rows that share an index label and yield fewer dates than there are rows.
    raw_values = id_data[f'{column}'].tolist()
    dates_list = []
    for raw in raw_values:
        try:
            epoch_text = raw.replace('}', "").split(':')[1].replace(" ", "")
            moment = Time(int(epoch_text[:10]), 'posix').to('dt')
            dates_list.append(moment.strftime(date_format))
        except AttributeError:
            # Keep the cell's string form so the output stays row-aligned.
            dates_list.append(str(raw))

    return dates_list

In [7]:
def convertDates(df):
    """Return a copy of ``df`` whose date columns were run through cleanDateLog.

    The date columns are chosen by position: the columns at positions 3
    through 9, skipping the one at position 8. Each selected column is
    replaced by the list that ``cleanDateLog`` returns for it.
    NOTE(review): the positional selection depends on the column order of the
    input file - confirm it still matches if the CSV layout changes.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is left untouched; all changes are made on a copy.

    Returns
    -------
    pandas.DataFrame
        The converted copy.
    """
    # Work on the copy so the caller's frame is not modified as a side effect.
    converted = df.copy()
    col_list = list(converted.columns)
    dates_col = col_list[3:10]
    dates_col.remove(col_list[8])
    for name in dates_col:
        converted[f'{name}'] = cleanDateLog(id_data=converted, column=f"{name}")

    return converted


In [8]:
def cleanReceipt():
    """Load the receipts CSV and return it cleaned, without the item list.

    Reads ``csv_files/receipts.csv`` (first CSV column used as the index),
    passes the frame through ``convertDates``, replaces ``_id`` with the
    output of ``cleanIdCpg`` and drops the ``rewardsReceiptItemList`` column.

    Returns
    -------
    pandas.DataFrame
        The cleaned receipts frame.
    """
    raw_receipts = pd.read_csv('csv_files/receipts.csv', index_col=0)
    converted = convertDates(raw_receipts)
    converted['_id'] = cleanIdCpg(converted, column='_id')
    return converted.drop(columns='rewardsReceiptItemList')

In [18]:
def receiptDf():
    """Load the receipts CSV and return it cleaned, with parsed date columns.

    Reads ``csv_files/receipts.csv`` (first CSV column used as the index),
    passes the frame through ``convertDates``, replaces ``_id`` with the
    output of ``cleanIdCpg``, renames ``_id`` to ``receipt_id`` and parses the
    listed date columns with ``pd.to_datetime``. The
    ``rewardsReceiptItemList`` column is kept.

    Returns
    -------
    pandas.DataFrame
        The cleaned receipts frame.
    """
    receipt_df = pd.read_csv('csv_files/receipts.csv', index_col=0)
    receipts_df = convertDates(receipt_df)
    receipts_df['_id'] = cleanIdCpg(receipts_df, column='_id')
    receipts_df = receipts_df.rename(columns={'_id': 'receipt_id'})
    # Each column is listed exactly once so that it is parsed only once.
    # NOTE(review): convertDates rewrites six columns by position but only
    # these four are parsed to datetime - confirm whether the remaining two
    # should be listed here as well.
    date_col = ['createDate', 'dateScanned', 'finishedDate', 'purchaseDate']
    for col in date_col:
        receipts_df[col] = pd.to_datetime(receipts_df[col])

    return receipts_df

## Data Quality for Dataset Receipts
The `rewardsReceiptItemList` column holds a nested list of dictionaries.
This can slow down data cleaning, especially when a receipt item does not have a
corresponding id to properly show which user's receipt it belongs to.

`purchaseDate` contains NaT (missing) values, which could cause an issue if we ever want to do
any time-series analysis on the data.

Barcode and description are not always populated. This can cause issues because we would be
missing out on valuable data that could shift how we move forward with our customers.


In [20]:
# receiptDf() already returns a DataFrame, so no extra constructor call is needed.
df = receiptDf()

#### Nested dictionary could be its own dataset 

In [23]:
# Display the item-list column as it is stored in the frame.
df['rewardsReceiptItemList']

0       [{'barcode': '4011', 'description': 'ITEM NOT ...
1       [{'barcode': '4011', 'description': 'ITEM NOT ...
2       [{'needsFetchReview': False, 'partnerItemId': ...
3       [{'barcode': '4011', 'description': 'ITEM NOT ...
4       [{'barcode': '4011', 'description': 'ITEM NOT ...
                              ...                        
1114    [{'barcode': 'B076FJ92M4', 'description': 'mue...
1115                                                  NaN
1116                                                  NaN
1117    [{'barcode': 'B076FJ92M4', 'description': 'mue...
1118                                                  NaN
Name: rewardsReceiptItemList, Length: 1119, dtype: object

#### NaT purchase dates: we lose out on a few days of data

In [28]:
# Count rows with (True) and without (False) a missing purchase date.
df['purchaseDate'].isna().value_counts()

False    671
True     448
Name: purchaseDate, dtype: int64

#### Here we can see that the item with barcode 4011 has the description 'ITEM NOT FOUND' (effectively NULL) but still has a finalPrice

In [25]:
# Inspect the item list stored for the row labelled 0.
df['rewardsReceiptItemList'][0]

"[{'barcode': '4011', 'description': 'ITEM NOT FOUND', 'finalPrice': '26.00', 'itemPrice': '26.00', 'needsFetchReview': False, 'partnerItemId': '1', 'preventTargetGapPoints': True, 'quantityPurchased': 5, 'userFlaggedBarcode': '4011', 'userFlaggedNewItem': True, 'userFlaggedPrice': '26.00', 'userFlaggedQuantity': 5}]"