# Web Scraping Products from the Under Armour Website

## 1. Importing Libraries

In [1]:
import re
import pandas as pd
import requests
from bs4 import BeautifulSoup

import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import time
import warnings
# Suppresses every warning for the rest of the session (no category or module
# filter is given), so deprecation notices from the libraries used in the
# cells that follow will not be shown.
warnings.filterwarnings('ignore')

## 2. Extracting the Data from Website

In [2]:
%%time

# Part 1: Opening the webpage to be scraped
driver = webdriver.Chrome('C:/Users/Temp User/Desktop/Portfolio/chromedriver.exe')
driver.maximize_window()
driver.get('https://www.underarmour.com.sg/en-sg/c/men-now-trending/')

# Part 2: Scrolling all the way down to load the full webpage
old_height = driver.execute_script('return document.body.scrollHeight')

while True:
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    time.sleep(3)
    new_height = driver.execute_script('return document.body.scrollHeight')
    if new_height == old_height:
        break
    old_height = new_height

# Part 3: Scraping the website
soup = BeautifulSoup(driver.page_source, 'lxml')

# The second positional argument of find_all filters on the CSS class; a
# multi-class string is compared against the tile's full class attribute.
listings = soup.find_all('div', 'b-tile bfx-price-product js-cmp-inited js-cmp-productTile')

# Part 4: Getting all the necessary info and putting into a DF
BASE_URL = 'https://www.underarmour.com.sg'
COLUMNS = ['Link', 'Item', 'Colours', 'Price']

# Row 0 is an empty placeholder; the cleaning step later in the notebook
# removes it by label, so it is kept here to preserve the frame's layout.
records = [{'Link': '', 'Item': '', 'Colours': '', 'Price': ''}]
skipped = 0

for lst in listings:
    try:
        link = lst.find('a', class_ = 'b-tile-images_container').get('href')
        full_link = BASE_URL + link
        product = lst.find('a', class_ = 'b-tile-name').text
        details = lst.find('a', class_ = 'b-tile-swatches_count').text.strip()
        price = lst.find('span', class_ = re.compile('b-price-value')).text.strip()
    except (AttributeError, TypeError):
        # find() returned None for a missing element (AttributeError) or the
        # anchor had no href (TypeError on concatenation). Skip the tile, but
        # count it so incomplete scrapes are visible rather than silent.
        skipped += 1
        continue

    records.append({'Link': full_link, 'Item': product, 'Colours': details, 'Price': price})

# Build the frame once from the collected rows instead of appending row by
# row (DataFrame.append copies the whole frame on each call and no longer
# exists in pandas 2.x).
df = pd.DataFrame(records, columns = COLUMNS)

if skipped:
    print(f'Skipped {skipped} of {len(listings)} product tiles with missing fields')

CPU times: total: 7 s
Wall time: 2min 7s


In [3]:
# Preview the raw scrape; row 0 is the empty placeholder the frame was seeded with.
df.head()

Unnamed: 0,Link,Item,Colours,Price
0,,,,
1,https://www.underarmour.com.sg/en-sg/p/accesso...,UA SPORTSMASK Featherweight,6 Colors,S$ 29.00
2,https://www.underarmour.com.sg/en-sg/p/best-se...,Men's UA Playoff Polo 2.0,10 Colors,S$ 89.00
3,https://www.underarmour.com.sg/en-sg/p/men-foo...,Unisex UA 3Z5 Basketball Shoes,2 Colors,S$ 139.00
4,https://www.underarmour.com.sg/en-sg/p/men-clo...,Men's UA Tech™ 2.0 Dash Short Sleeve,11 Colors,S$ 35.00


## 3. Cleaning up the Data

In [4]:
# Removing the first row
# Only drop label 0 when it is still the empty placeholder, so re-running this
# cell neither raises a KeyError nor removes a real product.
if 0 in df.index and df.loc[0, 'Link'] == '':
    df.drop(0, inplace = True)

# Removing unwanted strings
# Pull the numeric part out with a regex instead of splitting on whitespace:
# this tolerates thousands separators ("S$ 1,299.00"), a missing space after
# the currency symbol, and values that were already converted by a previous
# run of this cell.
price_text = df['Price'].astype(str).str.replace(',', '', regex = False)
df['Price'] = price_text.str.extract(r'(\d+(?:\.\d+)?)', expand = False)
df['Colours'] = df['Colours'].astype(str).str.extract(r'(\d+)', expand = False)

# Changing dtypes
# A row with no number yields NaN above and fails loudly here rather than
# passing through as bad data.
df['Colours'] = df['Colours'].astype(int)
df['Price'] = df['Price'].astype(float)

In [5]:
# Preview after cleaning: Colours and Price have been cast to numeric types.
df.head()

Unnamed: 0,Link,Item,Colours,Price
1,https://www.underarmour.com.sg/en-sg/p/accesso...,UA SPORTSMASK Featherweight,6,29.0
2,https://www.underarmour.com.sg/en-sg/p/best-se...,Men's UA Playoff Polo 2.0,10,89.0
3,https://www.underarmour.com.sg/en-sg/p/men-foo...,Unisex UA 3Z5 Basketball Shoes,2,139.0
4,https://www.underarmour.com.sg/en-sg/p/men-clo...,Men's UA Tech™ 2.0 Dash Short Sleeve,11,35.0
5,https://www.underarmour.com.sg/en-sg/p/men-foo...,Unisex Curry Flow 9 Basketball Shoes,1,239.0


In [6]:
# Check the row count, non-null counts and column dtypes of the cleaned frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 426 entries, 1 to 426
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Link     426 non-null    object 
 1   Item     426 non-null    object 
 2   Colours  426 non-null    int32  
 3   Price    426 non-null    float64
dtypes: float64(1), int32(1), object(2)
memory usage: 11.8+ KB


## 4. Saving the Data into a CSV File

In [7]:
# Write the cleaned table to the current working directory; index = False
# leaves the DataFrame's row labels out of the file.
df.to_csv('under_armour_list.csv', index = False)