In [5]:
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
import os
import random

%matplotlib inline

from category_encoders.ordinal import OrdinalEncoder

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

import requests

from bs4 import BeautifulSoup

## Scrape each page to a CSV file

In [6]:
def get_page(page, timeout=30):
    """Scrape one page of the items table into a DataFrame.

    Parameters
    ----------
    page : int or str
        Page number appended to the ``?page=`` query string.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        One row per scraped table row. Column names come from the first
        row of the table, and the frame is indexed by the ``id`` column.
        Cell values are the scraped text.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    IndexError
        If the page has no matching table, or the table has no rows.
    """
    response = requests.get('https://scrap-me.herokuapp.com/items?page='+str(page),
                            timeout=timeout)
    # Fail loudly on an error response instead of trying to parse its body.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    tables = soup.find_all('table', class_="table table-bordered table-hover text-center")
    if not tables:
        raise IndexError('no purchase table found on page ' + str(page))
    # NOTE(review): class_=False is presumably meant to keep only <tr> tags
    # without a class attribute - confirm against the page markup.
    purchase_tags = tables[0].find_all('tr', class_=False)

    # Each row's text is split on newlines, so this relies on every cell of a
    # row being rendered on its own line.
    purchase_tags_text = [tag.get_text().strip().split("\n") for tag in purchase_tags]
    if not purchase_tags_text:
        raise IndexError('purchase table on page ' + str(page) + ' has no rows')

    # First scraped row holds the column names; the rest are data rows.
    header, *rows = purchase_tags_text
    purchase_df = pd.DataFrame(rows, columns=header).set_index('id')

    return purchase_df

In [7]:
# Preview a single scraped page (page 2) to check the parsed table.
get_page(page=2)

Unnamed: 0_level_0,orderportalid,orderdate_gmt,designer,style,shipper,shiptypeid,userid,isvip,country,region,...,freereturn,issale,productid,brand,ddpsubcategory,storeid,countryoforigin,size,category_1stlevel,platform
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7f18fb09df95dd8a85f03af26b2c5515,207247,2018-01-29 08:10:38.470000+00:00,2454,2457,2,2,74522.0,VIP,12,4,...,1,No,70622,91,"Trousers, overalls, shorts",15,14,118,Clothing,app
bfdbb297daf897176bb5761c31a88206,64361,2018-01-09 06:40:50.366000+00:00,5378,5382,2,2,12041.0,Not VIP,12,4,...,1,Yes,5599,322,Footwear with outer soles of rubber or plastics,623,1,12,Shoes,web
15dbb4813d3c3d82de21965feefac1a3,176512,2018-01-23 20:30:19.400000+00:00,38631,38641,3,2,127863.0,Not VIP,1,1,...,1,Yes,42441,99,Other footwear,138,26,186,Shoes,web
30a4236a65c9f6db51fad278a992d366,263680,2018-02-08 07:02:38.616000+00:00,55895,55908,2,2,18488.0,Not VIP,5,3,...,1,Yes,62115,17,"Trousers, overalls, shorts",197,1,66,Clothing,web
74edca7bc7ccd144ca936294b56187f4,224604,2018-02-01 05:27:16.926000+00:00,25142,25150,5,2,164787.0,Not VIP,39,4,...,1,No,27216,146,N/D,1077,10,129,Boys Clothing,web
429539068f39c4f1c4633d0de441c3e0,210462,2018-01-29 19:20:03.523000+00:00,105225,105247,3,4,69014.0,Not VIP,7,1,...,0,Yes,120693,1489,N/D,932,8,223,Clothing,
57b6b27ab7a85be7b7f7f2dfd5300370,80639,2018-01-10 11:21:02.343000+00:00,4629,4633,2,2,66656.0,Not VIP,19,4,...,1,Yes,4804,36,"Handbags, whether or not with shoulder strap, ...",77,1,17,Bags,web
8d989c909482449a4b6607dd1c777931,55535,2018-01-08 16:00:15.366000+00:00,45262,45275,2,9,47448.0,Not VIP,36,3,...,1,Yes,49991,83,Footwear with outer soles of rubber or plastics,349,29,36,Shoes,web
e996fc7a3e18b3609edb1ecb1cdeb548,260040,2018-02-07 15:20:48.500000+00:00,103028,103050,2,2,92835.0,Not VIP,5,3,...,1,No,117976,344,Footwear with outer soles of rubber or plastics,1071,26,12,Shoes,web
239e8ae0a22e0a03bd652c87fb04f0cb,127262,2018-01-15 10:15:48.320000+00:00,20881,20888,2,2,100264.0,Not VIP,12,4,...,1,Yes,22420,125,Footwear with outer soles of rubber or plastics,17,1,194,Shoes,web


In [15]:
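# One-off scrape of pages 1-400 into web_data/page_<n>.csv.
# NOTE(review): presumably left commented out so that re-running the notebook
# does not hit the website again - uncomment to refresh the per-page files
# (the web_data/ folder must already exist).
#for page in range(1,401):
#    get_page(page).to_csv(os.path.join('web_data','page_'+str(page)+'.csv'))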

## Columns available on the website

In [8]:
# List every column name of the scraped table, one per line (taken from page 1).
first_page_columns = get_page(1).columns
for column_name in first_page_columns:
    print(column_name)

orderportalid
orderdate_gmt
designer
style
shipper
shiptypeid
userid
isvip
country
region
ddprate
countrycode
hasusedwishlist
isreseller
hasitemsonbag
tierafterorder
tierbeforeorder
isusingmultipledevices
userfraudstatus
promocode
freereturn
issale
productid
brand
ddpsubcategory
storeid
countryoforigin
size
category_1stlevel
platform


## Join CSV files

In [9]:
import glob

In [10]:
extension = 'csv'
# web_data/web_data.csv is this notebook's own combined output and sits in the
# same folder as the per-page files; skip it so a re-run does not concatenate
# the previous result back into itself.
# Sorted because glob returns matches in arbitrary order.
all_filenames = sorted(
    path
    for path in glob.glob('web_data/*.{}'.format(extension))
    if os.path.basename(path) != 'web_data.csv'
)

In [11]:
# Load every per-page CSV and stack them into a single frame.
combined_csv = pd.concat(list(map(pd.read_csv, all_filenames)))

In [23]:
# Use the scraped 'id' column as the index. Guarded so the cell can be re-run:
# a second set_index('id') raises KeyError once 'id' is already the index.
if combined_csv.index.name != 'id':
    combined_csv = combined_csv.set_index('id')
# `index` is a boolean flag in to_csv; the index name ('id') is written as the
# header of the first column.
combined_csv.to_csv("web_data/web_data.csv", index=True)

In [25]:
# Show a bounded preview instead of rendering the whole combined frame.
combined_csv.head(10)

Unnamed: 0_level_0,orderportalid,orderdate_gmt,designer,style,shipper,shiptypeid,userid,isvip,country,region,...,freereturn,issale,productid,brand,ddpsubcategory,storeid,countryoforigin,size,category_1stlevel,platform
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
663ed2d3cd3880ca59f2d48341661e66,370748,2018-01-08 21:51:19.776000+00:00,28872,28880,2,2,123884.0,VIP New,5,3,...,1,Yes,31400,137,Jackets and blazers,47,1,42,Clothing,app
e8609464e1813ad2494416cb12676159,378254,2018-01-04 17:44:29.660000+00:00,59442,59456,1,3,104311.0,Not VIP,4,3,...,1,Yes,66229,82,"Coats: overcoats, raincoat, cape, cloaks and s...",11,16,9,Clothing,web
ccacb872e833031d124beb4e0a5be380,385215,2018-01-06 02:48:20.050000+00:00,13559,13566,2,2,260326.0,Not VIP,2,1,...,1,No,183805,1235,Sunglasses,357,1,1,Accessories,app
4a3050ae2c77da4f9c90e2e58e8e520f,404424,2018-01-01 18:27:31.266000+00:00,1812,1815,2,2,269620.0,Not VIP,14,3,...,1,No,1860,70,Footwear with outer soles of rubber or plastics,46,26,297,Shoes,web
21fa403d5d1110228f6ab64520747ea5,52058,2018-01-08 13:13:57.566000+00:00,154192,154224,2,2,44759.0,Not VIP,3,2,...,1,Yes,181482,1195,"Of base metal, whether or not plated with prec...",358,14,17,Jewellery,web
74d88df8fc4b75be9086a35ef4daba1c,370985,2018-01-14 19:44:27.830000+00:00,113318,113344,2,2,253135.0,Not VIP,9,3,...,1,No,17458,155,,38,20,42,Clothing,web
57427bc82ab11f683a5c1f1eb9b685d3,393828,2018-01-26 05:32:41.750000+00:00,10963,10969,2,2,264627.0,Not VIP,1,1,...,1,No,11568,969,"Coats: overcoats, raincoat, cape, cloaks and s...",488,21,33,Clothing,web
8727f12649fd2a6867810cc038cfd2af,24291,2018-01-05 09:15:42.543000+00:00,46276,46289,2,2,21690.0,Not VIP,14,3,...,1,No,51148,5,Other footwear,249,1,5,Shoes,web
27a748f0719a13fd7b06e3b9e9091044,75810,2018-01-10 00:45:26.290000+00:00,35947,35956,2,2,30485.0,Not VIP,17,4,...,1,Yes,39412,857,"Trousers, overalls, shorts",370,82,49,Girls Clothing,
d19e44090b58bf6d83faa2f3913b6250,200945,2018-01-28 02:10:52.810000+00:00,16639,16646,2,2,149603.0,Not VIP,7,1,...,0,Yes,17739,25,"Trousers, overalls, shorts",22,6,1318,Clothing,web
