In [1]:
import sys
import os

sys.path.append(os.path.dirname(os.getcwd()))

In [12]:
import numpy as np
import pandas as pd
import gzip
from src.utils.logger import logger

In [3]:
def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)

In [4]:
def get_df(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
        if i % 10000 == 0:
            logger.info('Rows processed: {:,}'.format(i))
    return pd.DataFrame.from_dict(df, orient='index')

In [37]:
df = get_df('../data/meta_Electronics.json.gz')

2019-12-05 12:46:19,889 - Rows processed: 10,000
2019-12-05 12:46:20,813 - Rows processed: 20,000
2019-12-05 12:46:25,294 - Rows processed: 30,000
2019-12-05 12:46:26,184 - Rows processed: 40,000
2019-12-05 12:46:27,042 - Rows processed: 50,000
2019-12-05 12:46:27,872 - Rows processed: 60,000
2019-12-05 12:46:28,698 - Rows processed: 70,000
2019-12-05 12:46:29,525 - Rows processed: 80,000
2019-12-05 12:46:30,356 - Rows processed: 90,000
2019-12-05 12:46:31,232 - Rows processed: 100,000
2019-12-05 12:46:32,036 - Rows processed: 110,000
2019-12-05 12:46:32,868 - Rows processed: 120,000
2019-12-05 12:46:33,685 - Rows processed: 130,000
2019-12-05 12:46:34,569 - Rows processed: 140,000
2019-12-05 12:46:35,471 - Rows processed: 150,000
2019-12-05 12:46:36,353 - Rows processed: 160,000
2019-12-05 12:46:37,170 - Rows processed: 170,000
2019-12-05 12:46:38,070 - Rows processed: 180,000
2019-12-05 12:46:38,889 - Rows processed: 190,000
2019-12-05 12:46:39,705 - Rows processed: 200,000
2019-12-0

In [13]:
from pandas.api.types import is_object_dtype

# Lowercase Functions
def lowercase_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Lowercase characters from all columns in a dataframe.

    Args:
        df: Pandas dataframe

    Returns:
        Lowercased dataframe
    """
    df = df.copy()
    for col in df.columns:
        if is_object_dtype(df[col]):
            df = lowercase_cols(df, [col])
    return df


def lowercase_cols(df: pd.DataFrame, colnames) -> pd.DataFrame:
    """
    Lowercase characters from specified columns in a dataframe

    Args:
        df: Pandas dataframe
        colnames: Names of columns to be lowercased

    Returns: Lowercased dataframe

    """
    df = df.copy()
    for col in colnames:
        assert df[col].dtype != np.float64 and df[col].dtype != np.int64, \
            'Trying to lowercase a non-string column: {}'.format(col)
        df[col] = df[col].str.lower()
    return df

In [38]:
df.head()

Unnamed: 0,asin,imUrl,description,categories,title,price,salesRank,related,brand
0,132793040,http://ecx.images-amazon.com/images/I/31JIPhp%...,The Kelby Training DVD Mastering Blend Modes i...,"[[Electronics, Computers & Accessories, Cables...",Kelby Training DVD: Mastering Blend Modes in A...,,,,
1,321732944,http://ecx.images-amazon.com/images/I/31uogm6Y...,,"[[Electronics, Computers & Accessories, Cables...",Kelby Training DVD: Adobe Photoshop CS5 Crash ...,,,,
2,439886341,http://ecx.images-amazon.com/images/I/51k0qa8f...,Digital Organizer and Messenger,"[[Electronics, Computers & Accessories, PDAs, ...",Digital Organizer and Messenger,8.15,{'Electronics': 144944},"{'also_viewed': ['0545016266', 'B009ECM8QY', '...",
3,511189877,http://ecx.images-amazon.com/images/I/41HaAhbv...,The CLIKR-5 UR5U-8780L remote control is desig...,"[[Electronics, Accessories & Supplies, Audio &...",CLIKR-5 Time Warner Cable Remote Control UR5U-...,23.36,,"{'also_viewed': ['B001KC08A4', 'B00KUL8O0W', '...",
4,528881469,http://ecx.images-amazon.com/images/I/51FnRkJq...,"Like its award-winning predecessor, the Intell...","[[Electronics, GPS & Navigation, Vehicle GPS, ...",Rand McNally 528881469 7-inch Intelliroute TND...,299.99,,"{'also_viewed': ['B006ZOI9OY', 'B00C7FKT2A', '...",


In [39]:
df['related'] = df['related'].astype(str)
df['categories'] = df['categories'].astype(str)
df['salesRank'] = df['salesRank'].astype(str)

In [34]:
sum(df['related'].isnull())

0

In [40]:
df = lowercase_df(df)

In [41]:
df

Unnamed: 0,asin,imUrl,description,categories,title,price,salesRank,related,brand
0,0132793040,http://ecx.images-amazon.com/images/i/31jiphp%...,the kelby training dvd mastering blend modes i...,"[['electronics', 'computers & accessories', 'c...",kelby training dvd: mastering blend modes in a...,,,,
1,0321732944,http://ecx.images-amazon.com/images/i/31uogm6y...,,"[['electronics', 'computers & accessories', 'c...",kelby training dvd: adobe photoshop cs5 crash ...,,,,
2,0439886341,http://ecx.images-amazon.com/images/i/51k0qa8f...,digital organizer and messenger,"[['electronics', 'computers & accessories', 'p...",digital organizer and messenger,8.15,{'electronics': 144944},"{'also_viewed': ['0545016266', 'b009ecm8qy', '...",
3,0511189877,http://ecx.images-amazon.com/images/i/41haahbv...,the clikr-5 ur5u-8780l remote control is desig...,"[['electronics', 'accessories & supplies', 'au...",clikr-5 time warner cable remote control ur5u-...,23.36,,"{'also_viewed': ['b001kc08a4', 'b00kul8o0w', '...",
4,0528881469,http://ecx.images-amazon.com/images/i/51fnrkjq...,"like its award-winning predecessor, the intell...","[['electronics', 'gps & navigation', 'vehicle ...",rand mcnally 528881469 7-inch intelliroute tnd...,299.99,,"{'also_viewed': ['b006zoi9oy', 'b00c7fkt2a', '...",
...,...,...,...,...,...,...,...,...,...
498191,bt008v9j9u,http://ecx.images-amazon.com/images/i/313e6sjm...,vehicle suction cup mount (replacement) notice...,"[['electronics', 'gps & navigation', 'gps syst...",suction cup mount,21.99,,{'buy_after_viewing': ['b000epfcc2']},garmin
498192,bt008sxq4c,http://ecx.images-amazon.com/images/i/31of9onv...,quatech - 1 port pcmcia to db-25 parallel adap...,"[['electronics', 'computers & accessories', 'c...",parallel pcmcia card 1port epp,23.99,,"{'also_bought': ['b000sr2h4w', 'b001q7x0w6'], ...",
498193,bt008g3w52,http://ecx.images-amazon.com/images/i/21wirx5f...,c2g - 5m ultma usb 2.0 a mini b cble,"[['electronics', 'computers & accessories', 'c...",c2g / cables to go 5m ultima usb 2.0 cable,18.91,,"{'bought_together': ['b0002d6qjo'], 'buy_after...",c2g
498194,bt008uktmw,http://ecx.images-amazon.com/images/i/41tnavmf...,keyboard drawer.,"[['electronics', 'computers & accessories', 'c...",underdesk keyboard drawer,25.54,,"{'also_viewed': ['b0002ld0zy', 'b0002lczp0', '...",fellowes
