In [58]:
import io
import os
import re
from datetime import datetime
from typing import Tuple, List

import pandas as pd
import plotly
import plotly.graph_objects as go
import requests
from bs4 import BeautifulSoup
from tabulate import tabulate


In [59]:
def get_soup(url: str, timeout: float = 30.0) -> BeautifulSoup:
    """Download ``url`` and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        The response body parsed with the ``'html.parser'`` backend.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error statuses instead of parsing an error page as if it
    # were the requested content.
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')

def get_csv_from_url(url: str, timeout: float = 30.0) -> pd.DataFrame:
    """Download a UTF-8 encoded CSV file from ``url`` and load it into a DataFrame.

    Args:
        url: Address of the CSV file.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        The parsed CSV as returned by ``pd.read_csv`` with default options.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    response = requests.get(url, timeout=timeout)
    # An error page would otherwise be fed to the CSV parser and produce a
    # meaningless frame (or a confusing parser error).
    response.raise_for_status()
    text = response.content.decode('utf-8')
    return pd.read_csv(io.StringIO(text))

def print_tabulate(df: pd.DataFrame):
    """Print ``df`` to stdout as a text table in tabulate's ``orgtbl`` format.

    The frame's column labels are passed to ``tabulate`` as the header row.
    """
    rendered = tabulate(df, headers=df.columns, tablefmt='orgtbl')
    print(rendered)

In [60]:
# Location of the dataset. The default is the path used when the notebook was
# written; set the ADIDAS_NIKE_CSV environment variable to point at the file on
# any other machine instead of editing this cell.
DATA_PATH = os.environ.get(
    "ADIDAS_NIKE_CSV",
    "C:\\Users\\Admin\\Documents\\semestre\\mineria\\Adidas_Vs_Nike.csv",
)
df = pd.read_csv(DATA_PATH)
df.head()
 

Unnamed: 0,Product Name,Product ID,Listing Price,Sale Price,Discount,Brand,Description,Rating,Reviews,Last Visited
0,Women's adidas Originals NMD_Racer Primeknit S...,AH2430,14999,7499,50,Adidas Adidas ORIGINALS,Channeling the streamlined look of an '80s rac...,4.8,41,2020-04-13T15:06:14
1,Women's adidas Originals Sleek Shoes,G27341,7599,3799,50,Adidas ORIGINALS,"A modern take on adidas sport heritage, tailor...",3.3,24,2020-04-13T15:06:15
2,Women's adidas Swim Puka Slippers,CM0081,999,599,40,Adidas CORE / NEO,These adidas Puka slippers for Women's come wi...,2.6,37,2020-04-13T15:06:15
3,Women's adidas Sport Inspired Questar Ride Shoes,B44832,6999,3499,50,Adidas CORE / NEO,"Inspired by modern tech runners, these Women's...",4.1,35,2020-04-13T15:06:15
4,Women's adidas Originals Taekwondo Shoes,D98205,7999,3999,50,Adidas ORIGINALS,This design is inspired by vintage Taekwondo s...,3.5,72,2020-04-13T15:06:15


In [121]:
# Number of products at each discount level, most frequent level first.
discount_table = df.loc[:, "Discount"].value_counts()
discount_table

0     1300
50     994
40     763
30      99
60      61
20      49
Name: Discount, dtype: int64

In [131]:
# One-column frame holding only the product names; a later cell overwrites this
# column with a gender label derived from each name.
gender_frame = df['Product Name'].to_frame()
gender_frame

Unnamed: 0,Product Name
0,Women's adidas Originals NMD_Racer Primeknit S...
1,Women's adidas Originals Sleek Shoes
2,Women's adidas Swim Puka Slippers
3,Women's adidas Sport Inspired Questar Ride Shoes
4,Women's adidas Originals Taekwondo Shoes
...,...
3261,Air Jordan 8 Retro
3262,Nike Phantom Venom Club IC
3263,Nike Mercurial Superfly 7 Academy TF
3264,Nike Air Max 98


In [132]:
# Map each product name to a gender label based on how the name starts.
# The patterns are raw strings so that regex escapes such as \w, \d and \W reach
# the regex engine untouched instead of being parsed as (invalid) Python string
# escapes, which emit a warning on current Python versions.
GENDER_PATTERNS = {
    # Names beginning with "Women" / "Men" take that label.
    r"^(Women)+(')?[\w \d\W]*?[\r\n]?$": 'Women',
    r"^(Men)+(')?[\w \d\W]*?[\r\n]?$": 'Men',
    # Names beginning with "Nike" or "Unisex" are labelled Unisex.
    r"^(Nike)+(')?[\w \d\W]*?[\r\n]?$": "Unisex",
    r"^(Unisex)+(')?[\w \d\W]*?[\r\n]?$": "Unisex",
    # Anything that neither contains "Men"/"Women" nor starts with "Unise".
    r"^(?!.*(Men|Women)|Unisex?).*$": "Other",
}
gender_frame['Product Name'] = gender_frame['Product Name'].replace(regex=GENDER_PATTERNS)
gender_frame

Unnamed: 0,Product Name
0,Women
1,Women
2,Women
3,Women
4,Women
...,...
3261,Other
3262,Unisex
3263,Unisex
3264,Unisex


In [134]:
# Number of products per gender label, largest group first (the defaults, spelled out).
gender_frame.value_counts(sort=True, ascending=False)


Product Name
Men             1745
Women            780
Unisex           646
Other             95
dtype: int64

In [64]:
# One-column frame holding only the discount value of every product.
discount_frame = df["Discount"].to_frame()
discount_frame

Unnamed: 0,Discount
0,50
1,50
2,40
3,50
4,50
...,...
3261,0
3262,0
3263,0
3264,0


In [65]:
# Count products per discount level and express each count as a percentage of
# all products.
# rename_axis + reset_index(name=...) names both output columns explicitly, so the
# result has 'discount' and 'count' columns regardless of how the installed
# pandas version labels the output of value_counts().reset_index().
discount_count = (
    discount_frame["Discount"]
    .value_counts()
    .rename_axis('discount')
    .reset_index(name='count')
)
# Vectorised share: the total is computed once rather than once per row.
discount_count['percent'] = 100 * discount_count['count'] / discount_count['count'].sum()
discount_count

Unnamed: 0,discount,count,percent
0,0,1300,39.804042
1,50,994,30.434783
2,40,763,23.361911
3,30,99,3.031231
4,60,61,1.867728
5,20,49,1.500306


In [74]:
# Bar chart: percentage of products (y) at each discount level (x).
fig = go.Figure()

fig.add_trace(go.Bar(x=discount_count['discount'], y=discount_count['percent']))

# Tick positions; the labels are derived from them so that ticktext and
# tickvals always have the same length.
discount_ticks = list(range(0, 61, 10))
percent_ticks = list(range(0, 101, 10))

fig.update_layout(
    xaxis=dict(
        title_text='Cantidad de descuento %',
        ticktext=["{}%".format(v) for v in discount_ticks],
        tickvals=discount_ticks,
        # title_font replaces the deprecated titlefont attribute.
        title_font=dict(size=20),
    ),
    yaxis=dict(
        title_text="Cantidad de productos en descuento %",
        ticktext=["{}%".format(v) for v in percent_ticks],
        tickvals=percent_ticks,
        title_font=dict(size=20),
    ),
    title='Tenis en descuento'
)

fig.show()

In [135]:
# Count products per gender label and express each count as a percentage of
# all products.
# rename_axis + reset_index(name=...) names both output columns explicitly, so the
# result has 'gender' and 'count' columns regardless of how the installed
# pandas version labels the output of value_counts().reset_index().
gender_count = (
    gender_frame["Product Name"]
    .value_counts()
    .rename_axis('gender')
    .reset_index(name='count')
)
# Vectorised share: the total is computed once rather than once per row.
gender_count['percent'] = 100 * gender_count['count'] / gender_count['count'].sum()
gender_count

Unnamed: 0,gender,count,percent
0,Men,1745,53.429271
1,Women,780,23.882425
2,Unisex,646,19.779547
3,Other,95,2.908757


In [136]:
# Bar chart: percentage of products (y) for each gender label (x).
fig = go.Figure()

fig.add_trace(go.Bar(x=gender_count['gender'], y=gender_count['percent']))

# Tick positions; the labels are derived from them so both lists stay in sync.
percent_ticks = list(range(0, 101, 10))

fig.update_layout(
    xaxis=dict(
        title_text='Género',
        # title_font replaces the deprecated titlefont attribute.
        title_font=dict(size=20),
    ),
    yaxis=dict(
        title_text="Porcentaje %",
        ticktext=["{}%".format(v) for v in percent_ticks],
        tickvals=percent_ticks,
        title_font=dict(size=20),
    ),
    title="Porcentaje de productos determinados por género"
)

fig.show()