<a href="https://colab.research.google.com/github/MatheusSC017/PassMarkWebScraping/blob/main/PassMarkWebScrapping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PassMark Web Scraping

## Libraries

In [None]:
import bs4
import requests
import re
import pandas as pd
import numpy as np

## Scraping a video card page table

### Getting the data

In [None]:
# Listing page that holds the video card benchmark table.
url = 'https://www.videocardbenchmark.net/gpu_list.php'

# requests waits indefinitely unless a timeout is given, so bound the wait,
# and fail with an HTTPError on a 4xx/5xx reply instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()

html = bs4.BeautifulSoup(response.text, 'html.parser')

# The table is located by its element id; report a clear error when the id is
# absent rather than failing later with an AttributeError on None.
table = html.find(id='cputable')
if table is None:
    raise RuntimeError("No element with id 'cputable' found at " + url)
head = table.find('thead')
body = table.find('tbody').find_all('tr')

### Table data scraping

In [None]:
def read_table_data(table):
    """Collect the cell texts of every row in ``table``.

    Parameters
    ----------
    table : iterable
        Row elements; each must provide ``find_all('td')`` returning cells
        that expose a ``text`` attribute.

    Returns
    -------
    list of list
        One inner list per row, holding the ``text`` of each ``td`` cell in
        document order. The texts are returned as-is (no stripping).
    """
    return [
        [cell.text for cell in row.find_all('td')]
        for row in table
    ]
# Raw cell texts for every row scraped into `body`; values are still strings here.
video_cards_data = read_table_data(body)

### Data typing and initial cleanup

In [None]:
def convert_float(value):
    """Parse the first unsigned decimal number found in ``value``.

    Commas are removed first so thousands separators do not split the
    number. At most two digits after the decimal point are kept, and a
    leading minus sign is not part of the match.

    Parameters
    ----------
    value : str
        Text that may contain a number, e.g. ``'$1,234.56'``.

    Returns
    -------
    float
        The parsed number, or ``np.nan`` when ``value`` is not a string or
        contains no digits.
    """
    try:
        # Drop thousands separators so '1,234.56' is read as one number.
        cleaned = re.sub(',', '', value)
        # Raw string: '\.' in a plain literal is an invalid escape sequence.
        matches = re.findall(r'[0-9]+\.?[0-9]{0,2}', cleaned)
    except TypeError:
        # Non-string input (e.g. None) cannot be searched.
        return np.nan
    if not matches:
        return np.nan
    return float(matches[0])

In [None]:
def convert_int(value):
    """Parse the first (optionally negative) integer found in ``value``.

    Commas are removed first, matching ``convert_float``, so a value with
    thousands separators such as ``'1,970'`` is read as ``1970`` rather
    than ``1``.

    Parameters
    ----------
    value : str
        Text that may contain an integer.

    Returns
    -------
    int or float
        The parsed integer, or ``np.nan`` when ``value`` is not a string or
        contains no digits.
    """
    try:
        # Drop thousands separators before searching for the digits.
        cleaned = re.sub(',', '', value)
        matches = re.findall('-?[0-9]+', cleaned)
    except TypeError:
        # Non-string input (e.g. None) cannot be searched.
        return np.nan
    if not matches:
        return np.nan
    return int(matches[0])

In [None]:
# Build the typed rows from freshly extracted cell texts instead of converting
# in place. The converters return NaN for non-string input (their TypeError
# branch), so running an in-place conversion a second time would silently
# replace every numeric value with NaN; this form gives the same result on
# every run of the cell.
video_cards_data = [
    [
        name,
        convert_int(g3d_mark),
        convert_int(rank),
        convert_float(card_value),
        convert_float(price),
        *extra,  # any cells beyond the first five are kept unchanged
    ]
    for name, g3d_mark, rank, card_value, price, *extra in read_table_data(body)
]

### Reading the table header

In [None]:
def read_table_head(head):
    """Return the text of every ``th`` cell found under ``head``, in order.

    Parameters
    ----------
    head : element
        Object providing ``find_all('th')`` whose results expose a ``text``
        attribute.

    Returns
    -------
    list
        The ``text`` of each header cell, unmodified.
    """
    header_cells = head.find_all('th')
    titles = []
    for cell in header_cells:
        titles.append(cell.text)
    return titles
# Column titles read from the scraped `thead` element.
columns = read_table_head(head)

### DataFrame creation

In [None]:
# One row per scraped table line, labelled with the scraped header titles.
video_cards_df = pd.DataFrame(video_cards_data, columns=columns)
video_cards_df

Unnamed: 0,Videocard Name,Passmark G3D Mark\n(higher is better),Rank\n(lower is better),Videocard Value\n(higher is better),Price\n(USD)
0,128 DDR Radeon 9700 TX w/TV-Out,44,1970,,
1,128 DDR Radeon 9800 Pro,62,1911,,
2,128MB DDR Radeon 9800 Pro,66,1900,,
3,128MB RADEON X600 SE,49,1950,0.87,56.18
4,15FF,8229,171,,
...,...,...,...,...,...
2303,Xabre,5,2160,,
2304,XFX Radeon HD 4650 AGP,109,1758,,
2305,XGI Volari Family v1.13.23.D_V,4,2179,,
2306,ZX Chrome 645/640 GPU,147,1652,,


### Data analysis

In [None]:
# Column dtypes and non-null counts, to see which columns have missing values.
video_cards_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2307 entries, 0 to 2306
Data columns (total 5 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Videocard Name                        2307 non-null   object 
 1   Passmark G3D Mark
(higher is better)  2307 non-null   int64  
 2   Rank
(lower is better)                2307 non-null   int64  
 3   Videocard Value
(higher is better)    558 non-null    float64
 4   Price
(USD)                           558 non-null    float64
dtypes: float64(2), int64(2), object(1)
memory usage: 90.2+ KB
None


In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
video_cards_df.describe()

Unnamed: 0,Passmark G3D Mark\n(higher is better),Rank\n(lower is better),Videocard Value\n(higher is better),Price\n(USD)
count,2307.0,2307.0,558.0,558.0
mean,2109.438665,1114.320763,11.352294,432.625699
std,3917.060547,665.574216,13.382397,751.748734
min,1.0,-1.0,0.0,17.5
25%,129.0,537.5,1.7375,109.99
50%,660.0,1114.0,6.715,210.955
75%,1958.0,1690.5,16.89,434.18
max,30078.0,2267.0,143.92,8999.0


### Data cleaning

## Class to scrape the multi-page tables

In [None]:
class PassMarkPCComponentsAndAndroid:
    def __init__(self):
        self._urls = [
            'https://www.videocardbenchmark.net/gpu_list.php',
            'https://www.cpubenchmark.net/cpu_list.php',
            'https://www.harddrivebenchmark.net/hdd_list.php',
            'https://www.memorybenchmark.net/ram_list.php',
            'https://www.androidbenchmark.net/device_list.php',

        ]