In [129]:
#libraries
import pandas as pd
import numpy as np
import operator
from functools import reduce
import re
pd.set_option('display.max_rows', 500)

In [130]:
amazon_df = pd.read_csv("AmazonBooksData.csv")  # load data

In [131]:
amazon_df.head()  # display data

Unnamed: 0,priceFromates,prices,rating,title
0,\n \n \n \n Kindle...,"$,0,.,00,$,11,.,99,$,0,.,00",4.3 out of 5 stars,Last Day
1,\n \n \n \n Hardco...,"$,28,.,95,$,19,.,99",4.7 out of 5 stars,The Last Winter of the Weimar Republic: The Ri...
2,\n \n \n \n Paperb...,"$,10,.,69,$,3,.,99,$,24,.,99",4.6 out of 5 stars,"You Can Draw in 30 Days: The Fun, Easy Way to ..."
3,\n \n \n \n Hardco...,"$,26,.,33",4.8 out of 5 stars,The Last Kids on Earth: The Monster Box (books...
4,\n \n \n \n Hardco...,"$,16,.,22,$,0,.,00",4.6 out of 5 stars,The Whole30: The 30-Day Guide to Total Health ...


In [132]:
# clean prices column and adjust them in following formate dollar.cents
def cleanPrice(prices):
    if pd.isna(prices):
        prices = "0"    # set Nan values to 0
    splitPrices  = prices.split(",")
    splitPrices = [x for x in splitPrices if x != "$"]    # remove "$" from prices
    newPrices = []
    for i in range(0, len(splitPrices),3):
        j = i+3
        newPrices.append(["".join(splitPrices[i:j])])   # adjust prices in following formate dollar.cents
    return newPrices
    

In [133]:
amazon_df["prices"] = amazon_df["prices"].apply(cleanPrice)
amazon_df["prices"]

0        [[0.00], [11.99], [0.00]]
1               [[28.95], [19.99]]
2       [[10.69], [3.99], [24.99]]
3                        [[26.33]]
4                [[16.22], [0.00]]
5        [[9.99], [12.99], [0.00]]
6         [[9.78], [8.99], [0.00]]
7                         [[5.00]]
8                        [[29.95]]
9                         [[9.99]]
10               [[0.00], [12.99]]
11        [[1.99], [2.30], [0.00]]
12                       [[58.70]]
13     [[14.79], [12.99], [18.45]]
14                        [[9.99]]
15      [[0.00], [14.99], [16.74]]
16              [[19.99], [26.90]]
17       [[2.99], [0.00], [14.99]]
18              [[14.29], [13.99]]
19                [[9.51], [0.00]]
20               [[0.00], [12.50]]
21      [[14.99], [17.49], [0.00]]
22                           [[0]]
23               [[16.39], [7.49]]
24        [[9.99], [0.00], [7.48]]
25        [[0.00], [7.99], [7.48]]
26                       [[41.44]]
27                           [[0]]
28               [[0

In [134]:
#clean PriceFromates
def cleanPriceFormate(priceFromates):
    if pd.isna(priceFromates):
        priceType = ""          # set Nan to ""
    priceFromates = priceFromates.replace('\n', '')  # remove new lines
    priceFromates = re.sub(' +', ' ', priceFromates)   # replace mutiple spaces with single space
    splitPriceFormate  = priceFromates.split(",")      # splite on comma
    return splitPriceFormate

In [135]:
amazon_df["priceFromates"] = amazon_df["priceFromates"].apply(cleanPriceFormate)
amazon_df["priceFromates"]

0           [ Kindle ,  Paperback ,  Audible Audiobook ]
1                                [ Hardcover ,  Kindle ]
2                [ Paperback ,  Kindle ,  Spiral-bound ]
3                                          [ Hardcover ]
4                   [ Hardcover ,  Kindle ,  Paperback ]
5           [ Kindle ,  Paperback ,  Audible Audiobook ]
6           [ Hardcover ,  Kindle ,  Audible Audiobook ]
7                                     [ Kindle Edition ]
8                                          [ Hardcover ]
9                                [ Paperback ,  Kindle ]
10          [ Paperback ,  Audible Audiobook ,  Kindle ]
11          [ Kindle ,  Paperback ,  Audible Audiobook ]
12                                         [ Paperback ]
13               [ Paperback ,  Kindle ,  Spiral-bound ]
14                 [ Flexibound ,  Kindle ,  Paperback ]
15          [ Audible Audiobook ,  Kindle ,  Hardcover ]
16                            [ Paperback ,  Hardcover ]
17             [ Kindle ,  Audi

In [136]:
#clean Ratings
def cleanRating(rating):
    if pd.isna(rating):
        rating = "0"    # replace Nan with 0
    rating = rating.split()[0]   # extract rating 
    return rating

In [137]:
amazon_df["rating"] = amazon_df["rating"].apply(cleanRating)
amazon_df["rating"]

0      4.3
1      4.7
2      4.6
3      4.8
4      4.6
5      4.6
6      4.9
7      5.0
8      4.3
9      4.6
10     4.7
11     4.6
12     4.2
13     4.7
14     4.5
15     4.6
16     3.7
17     4.4
18     4.8
19     4.6
20     5.0
21     4.2
22     4.4
23     4.9
24     4.5
25     4.3
26     4.2
27     4.5
28     4.6
29     4.9
30     4.6
31     4.6
32     4.7
33     4.3
34     4.8
35     4.1
36     4.0
37     4.8
38     4.5
39     4.8
40     4.9
41     4.4
42     4.7
43     4.2
44     4.4
45     4.8
46     4.6
47     4.4
48     4.6
49     4.8
50     4.6
51     4.5
52     4.6
53     4.9
54     4.4
55     4.4
56     4.9
57     4.2
58     4.5
59     4.1
60     4.7
61     4.6
62     4.6
63     4.5
64     4.8
65     4.2
66     4.7
67     4.5
68     4.8
69     4.7
70     3.8
71     4.5
72     4.7
73     4.5
74     4.7
75     4.6
76     4.4
77     4.6
78     4.4
79     4.4
80     5.0
81     4.7
82     4.0
83     4.8
84     4.4
85     4.9
86     4.3
87     4.8
88     4.4
89     4.3
90     4.5

In [138]:
# clean extra price formate tags
def cleanPriceAndPriceFormate(price, priceFormate):
    noOfPrices = len(price)
    priceFormate = priceFormate[0:noOfPrices]
    return priceFormate

In [139]:
amazon_df["priceFromates"] =  amazon_df[['prices','priceFromates']].apply(lambda x: setPriceAndPriceFormate(*x), axis=1)

In [140]:
amazon_df[amazon_df['priceFromates'].map(lambda d: d != [""])]    # drop rown with price fromate = 0

Unnamed: 0,priceFromates,prices,rating,title
0,"[ Kindle , Paperback , Audible Audiobook ]","[[0.00], [11.99], [0.00]]",4.3,Last Day
1,"[ Hardcover , Kindle ]","[[28.95], [19.99]]",4.7,The Last Winter of the Weimar Republic: The Ri...
2,"[ Paperback , Kindle , Spiral-bound ]","[[10.69], [3.99], [24.99]]",4.6,"You Can Draw in 30 Days: The Fun, Easy Way to ..."
3,[ Hardcover ],[[26.33]],4.8,The Last Kids on Earth: The Monster Box (books...
4,"[ Hardcover , Kindle ]","[[16.22], [0.00]]",4.6,The Whole30: The 30-Day Guide to Total Health ...
5,"[ Kindle , Paperback , Audible Audiobook ]","[[9.99], [12.99], [0.00]]",4.6,One Little Lie (The Pelican Harbor Series Book 1)
6,"[ Hardcover , Kindle , Audible Audiobook ]","[[9.78], [8.99], [0.00]]",4.9,The Last Kids on Earth and the Midnight Blade
7,[ Kindle Edition ],[[5.00]],5.0,The List: From Slavery to George Floyd (Colori...
8,[ Hardcover ],[[29.95]],4.3,The 30-day Diabetes Cure (Featuring the Diabet...
9,[ Paperback ],[[9.99]],4.6,The 30-Day Ketogenic Cleanse: Reset Your Metab...


In [141]:
# get distint values of Price Formate to create new Columns
distintPriceFromates = list(set(reduce(operator.concat, amazon_df["priceFromates"])))
distintPriceFromates

[' Board book ',
 ' Plastic Comb ',
 ' Cards ',
 ' Sheet music ',
 ' Imitation Leather ',
 ' Product Bundle ',
 ' Novelty Book ',
 ' Loose Leaf ',
 ' MP3 CD ',
 ' Paperback ',
 ' Diary ',
 ' Kindle ',
 ' eTextbook ',
 ' Mass Market Paperback ',
 ' Spiral-bound ',
 ' Map ',
 ' Audio CD ',
 ' Prime Video ',
 ' Audible Audiobook ',
 ' Flexibound ',
 ' Printed Access Code ',
 ' Hardcover ',
 ' Kindle & comiXology ',
 ' Calendar ',
 ' Kindle Edition ']

In [142]:
#Create new columns for each Price Formate
for formate in distintPriceFromates:
    amazon_df[str(formate)] = ""

In [143]:
#Fill the price under related Price formate and None in rest
def fillPricesFromates(price, priceTypes , formateName ):
    
    for i, ptype in enumerate(priceTypes):
        if ptype == formateName:
            return price[i]
        

In [144]:
# Call fillPricesFromates for each distinct Price Formate for all rows
for formate in distintFormateStandards:
    amazon_df[str(formate)] = amazon_df[['prices','priceFromates']].apply(lambda x: fillPricesFromates(*x, formate), axis=1)

In [145]:
amazon_df # final data frame

Unnamed: 0,priceFromates,prices,rating,title,Board book,Plastic Comb,Cards,Sheet music,Imitation Leather,Product Bundle,...,Flexibound,Printed Access Code,Hardcover,Kindle & comiXology,Calendar,Kindle Edition,Single Issue Magazine,Library Binding,Kindle Edition with Audio/Video,Comics
0,"[ Kindle , Paperback , Audible Audiobook ]","[[0.00], [11.99], [0.00]]",4.3,Last Day,,,,,,,...,,,,,,,,,,
1,"[ Hardcover , Kindle ]","[[28.95], [19.99]]",4.7,The Last Winter of the Weimar Republic: The Ri...,,,,,,,...,,,[28.95],,,,,,,
2,"[ Paperback , Kindle , Spiral-bound ]","[[10.69], [3.99], [24.99]]",4.6,"You Can Draw in 30 Days: The Fun, Easy Way to ...",,,,,,,...,,,,,,,,,,
3,[ Hardcover ],[[26.33]],4.8,The Last Kids on Earth: The Monster Box (books...,,,,,,,...,,,[26.33],,,,,,,
4,"[ Hardcover , Kindle ]","[[16.22], [0.00]]",4.6,The Whole30: The 30-Day Guide to Total Health ...,,,,,,,...,,,[16.22],,,,,,,
5,"[ Kindle , Paperback , Audible Audiobook ]","[[9.99], [12.99], [0.00]]",4.6,One Little Lie (The Pelican Harbor Series Book 1),,,,,,,...,,,,,,,,,,
6,"[ Hardcover , Kindle , Audible Audiobook ]","[[9.78], [8.99], [0.00]]",4.9,The Last Kids on Earth and the Midnight Blade,,,,,,,...,,,[9.78],,,,,,,
7,[ Kindle Edition ],[[5.00]],5.0,The List: From Slavery to George Floyd (Colori...,,,,,,,...,,,,,,[5.00],,,,
8,[ Hardcover ],[[29.95]],4.3,The 30-day Diabetes Cure (Featuring the Diabet...,,,,,,,...,,,[29.95],,,,,,,
9,[ Paperback ],[[9.99]],4.6,The 30-Day Ketogenic Cleanse: Reset Your Metab...,,,,,,,...,,,,,,,,,,
