In [1]:
!pip install polars beautifulsoup4 splinter selenium



In [2]:
!pip install pint



In [3]:
import polars as pl
import pathlib
import re
from bs4 import BeautifulSoup
# Will use firefox browser
from splinter import Browser
import time
import pint

## Prepping the data. Do not include in main

In [4]:
cwd = pathlib.Path.cwd()

if cwd.name == 'Mild-Steel-Tempering':
    print("Path is project root")
else:
    print("Please correct current working directory to the project root")

Path is project root


In [5]:
resources_path = pathlib.PurePath(pathlib.PurePath(cwd), 'resources')
resources_path

PurePosixPath('/home/mox/Documents/coding_projects/bootcamp_local/Homeworks/Mild-Steel-Tempering/resources')

In [6]:
data_path = f"{resources_path}/searchable_steels.csv"
df_data = pl.read_csv(data_path)
df_data.glimpse()

Rows: 28
Columns: 1
$ steel_type <str> 'AISI-SAE 1045', 'AISI-SAE 1042', 'AISI 1030', 'AISI-SAE 4037', 'AISI 52100', 'AISI-SAE 1065', 'AISI-SAE 4340', 'AISI-SAE 1030', 'AISI-SAE 6150', 'AISI-SAE 5140'



## Visit the Website

In [7]:
browser = Browser('firefox')
base_url = "https://www.azom.com"
search_path = "/search.aspx?q="

In [8]:
dict_mt = {}

### Rename some steel present in AZoM that were not searching correctly

In [9]:
len([print(steel) for steel in df_data['steel_type']])

AISI-SAE 1045
AISI-SAE 1042
AISI 1030
AISI-SAE 4037
AISI 52100
AISI-SAE 1065
AISI-SAE 4340
AISI-SAE 1030
AISI-SAE 6150
AISI-SAE 5140
AISI-SAE 1035
AISI 4640
AISI 1055
AISI-SAE 1080
AISI 1090
AISI-SAE 4027
AISI-SAE 1050
AISI-SAE 1038
AISI 1095
AISI-SAE 1040
AISI-SAE 1026
AISI-SAE 5160
AISI 6145
AISI 4047
AISI 1074 Carbon Steel
AISI 1049
AISI-SAE 1335
AISI-SAE 4140


28

In [10]:
# Search the steel 
def get_soup(soup):
    search_result = soup.find('div', class_='resultsContainer')
    first_item = search_result.find('a')
    # https://pytutorial.com/get-element-href-beautifulsoup/
    steel_link = first_item.get('href')
    time.sleep(0.15)
    browser.visit(f"{base_url}{steel_link}")
    soup2 = BeautifulSoup(browser.html, 'html.parser')
    return soup2



In [11]:
# already have elemental composition
# property - metric - imperial
# table_elements = tables[0]
# table_phys_props = tables[1]
# table_mecha_props = tables[2]
# table_therm_props = tables[3]
# table_other_desigs = tables[4]
#parse tables
def get_tables(soup):
    tables = []
    # parse all tables in the
    for html_table in soup.find_all('table'):
        table = []
        # parse rows in the table
        for tr in html_table.find_all('tr'):
            row = []
            #parse data cells in the row
            for t in tr.find_all(['th', 'td']):
                text = t.get_text(strip=True)
                #add data to row
                row.append(text)
            #add row to table
            table.append(row)
    #returns a list of list of lists
    return tables
    

In [18]:
def make_dfs(lolols):
    tables = []
    # convert the list of lists to a dataframe
    for lol in lolols:
        try:
            df = pl.DataFrame(lol, orient='row')
            # rename columns using first row
            # https://stackoverflow.com/questions/75187317/how-to-rename-column-names-with-first-row-in-polars
            # [1:] removes the first row that replaced the column names 
            df = df.rename(df.head(1).to_dicts().pop())[1:]
            # add the dataframe to the list of tables on for this steel
            tables.append(df)
        except:
            tables.append(lol)
    return tables

In [19]:
dict_steel_tables = {}
for steel in df_data['steel_type']:
    print("Working on:", steel)
    #search AZoM for the steel
    browser.visit(f"{base_url}{search_path}{steel}")
    soup_search = BeautifulSoup(browser.html, 'html.parser')

    #takes soup html and returns list of lists
    soup = get_soup(soup_search)
    lolols = get_tables(soup)
    dict_steel_tables[steel] = make_dfs(lolols)
    # tables list of lists and returns a dictionary
    # robots.txt indicates Crawl-delay: 120
    time.sleep(0.15)

Working on: AISI-SAE 1045
Working on: AISI-SAE 1042
Working on: AISI 1030
Working on: AISI-SAE 4037
Working on: AISI 52100
Working on: AISI-SAE 1065
Working on: AISI-SAE 4340
Working on: AISI-SAE 1030
Working on: AISI-SAE 6150
Working on: AISI-SAE 5140
Working on: AISI-SAE 1035
Working on: AISI 4640
Working on: AISI 1055
Working on: AISI-SAE 1080
Working on: AISI 1090
Working on: AISI-SAE 4027
Working on: AISI-SAE 1050
Working on: AISI-SAE 1038
Working on: AISI 1095
Working on: AISI-SAE 1040
Working on: AISI-SAE 1026
Working on: AISI-SAE 5160
Working on: AISI 6145
Working on: AISI 4047
Working on: AISI 1074 Carbon Steel
Working on: AISI 1049
Working on: AISI-SAE 1335
Working on: AISI-SAE 4140


In [20]:
dict_steel_tables

{'AISI-SAE 1045': [],
 'AISI-SAE 1042': [],
 'AISI 1030': [],
 'AISI-SAE 4037': [],
 'AISI 52100': [],
 'AISI-SAE 1065': [],
 'AISI-SAE 4340': [],
 'AISI-SAE 1030': [],
 'AISI-SAE 6150': [],
 'AISI-SAE 5140': [],
 'AISI-SAE 1035': [],
 'AISI 4640': [],
 'AISI 1055': [],
 'AISI-SAE 1080': [],
 'AISI 1090': [],
 'AISI-SAE 4027': [],
 'AISI-SAE 1050': [],
 'AISI-SAE 1038': [],
 'AISI 1095': [],
 'AISI-SAE 1040': [],
 'AISI-SAE 1026': [],
 'AISI-SAE 5160': [],
 'AISI 6145': [],
 'AISI 4047': [],
 'AISI 1074 Carbon Steel': [],
 'AISI 1049': [],
 'AISI-SAE 1335': [],
 'AISI-SAE 4140': []}

# Itteration and troubleshooting code

In [11]:
browser = Browser('firefox')
browser.visit("https://www.azom.com/article.aspx?ArticleID=9189")
soup = BeautifulSoup(browser.html, 'html.parser')
#takes soup html and returns list of lists
# tables list of lists and returns a dictionary
# robots.txt indicates Crawl-delay: 120
time.sleep(0.15)
soup = BeautifulSoup(browser.html, 'html.parser')

In [25]:
tables = []
# parse all tables in the
for html_table in soup.find_all('table'):
    table = []
    # parse rows in the table
    for tr in html_table.find_all('tr'):
        row = []
        #parse data cells in the row
        for t in tr.find_all(['th', 'td']):
            text = t.get_text(strip=True)
            #add data to row
            row.append(text)
        #add row to table
        table.append(row)
    # convert the list of lists to a dataframe
    df = pl.DataFrame(table, orient='row')
    # rename columns using first row
    # https://stackoverflow.com/questions/75187317/how-to-rename-column-names-with-first-row-in-polars
    # [1:] removes the first row that replaced the column names 
    df = df.rename(df.head(1).to_dicts().pop())[1:]
    # add the dataframe to the list of tables on for this steel
    tables.append(df)
tables[2]

Properties,Metric,Imperial
str,str,str
"""Tensile strength""","""1213 MPa""","""175900 psi"""
"""Yield strength (@0.2%)""","""1165 MPa""","""169000 psi"""
"""Modulus of elasticity""","""201-209 GPa""","""29200-30300 ksi"""
"""Bulk modulus""","""160-170 GPa""","""23200-24700 ksi"""
"""Shear modulus""","""81-82 GPa""","""11700-11900 ksi"""
"""Poisson’s ratio""","""0.27-030""","""0.27-030"""
"""Elongation at break""","""16%""","""16%"""
"""Hardness, Brinell""","""429""","""429"""


IndexError: list index out of range

In [33]:
#  Parsing html tables - https://stackoverflow.com/questions/45843025/parsing-html-tables-with-beautifulsoup-in-python
# html_tables = soup2.find_all('tbody')
tables = [ # list of tables
    [ # list of rows in table
        # parse table data including headers
        [t.get_text(strip=True) for t in tr.find_all(['th', 'td'])]
        # for  each row of data
        for tr in table.find_all('tr')
    ] 
    for table in soup.find_all('table')
]
tables

[[['Element', 'Content (%)'],
  ['Chromium, Cr', '0.8-1.1'],
  ['Manganese, Mn', '≤0.7-0.9'],
  ['Carbon, C', '≤0.43-0.48'],
  ['Silicon, Si', '≤0.2-0.35'],
  ['Vanadium, V', '≥0.15'],
  ['Phosphorus, P', '≤0.04'],
  ['Sulfur, S', '≤0.05']],
 [['Properties', 'Metric', 'Imperial'],
  ['Density', '7.75 g/cm3', '0.280 lb/in3']],
 [['Properties', 'Metric', 'Imperial'],
  ['Tensile strength', '1213 MPa', '175900 psi'],
  ['Yield strength (@0.2%)', '1165 MPa', '169000 psi'],
  ['Modulus of elasticity', '201-209 GPa', '29200-30300 ksi'],
  ['Bulk modulus', '160-170 GPa', '23200-24700 ksi'],
  ['Shear modulus', '81-82 GPa', '11700-11900 ksi'],
  ['Poisson’s ratio', '0.27-030', '0.27-030'],
  ['Elongation at break', '16%', '16%'],
  ['Hardness, Brinell', '429', '429']],
 [['Properties', 'Metric', 'Imperial'],
  ['Thermal expansion co-efficient', '10 µm/m°C', '5.5 µin/in°F'],
  ['Thermal conductivity', '25 W/mK', '173.3 BTU.in/hrft².°F']]]

In [13]:
from selenium.webdriver.common.keys import Keys

browser = Browser('firefox')
i = 0

for steel in df_data['steel_type']:
    browser.visit(f"{base_url}{search_path}{steel}")
    soup = BeautifulSoup(browser.html, 'html.parser')
    try:
        search_result = soup.find('div', class_='resultsContainer')
        first_item = search_result.find('a')
        # https://pytutorial.com/get-element-href-beautifulsoup/
        steel_link = first_item.get('href')
        time.sleep(0.15)
        browser.visit(f"{base_url}{steel_link}")
        soup2 = BeautifulSoup(browser.html, 'html.parser')
        time.sleep(0.15)
    except:
        i += 1
        print("Failed : ", i, " : ", steel)
        
    

    

Failed :  1  :  AISI-SAE 9264
Failed :  2  :  AISI-SAE 2340
Failed :  3  :  AISI-SAE 3140
Failed :  4  :  AISI-SAE 4068


In [None]:
# property - metric - imperial
table_elements = tables[0]
table_phys_props = tables[1]
table_mecha_props = tables[2]
table_therm_props = tables[3]
table_other_desigs = tables[4]

In [21]:
dict_props = {}
table_mecha_props = tables[2]
for row in table_mecha_props:
    try:
        #fail if empty list
        to_parse_metric = row[1]
        # parse numeric value as string
        value = re.search(r"[\d+\.\d+]+", to_parse_metric).group(0)
        # parse space seperated value units that may end in numbers example: '7.858 g/ cm3'
        unit = ''.join(to_parse_metric.split(' ')[1:])
        if unit == '':
            # parse unit if not space separated example: "40.00%"
            unit = re.search(r"[\D]+$", to_parse_metric).group(0)

        #add values if no errors in parsing
        # dict_props['metric'].append(value)
        # dict_props['unit'].append(unit)
        # dict_props["property"].append(row[0].lower())
        dict_props['steel_type'] = steel
        dict_props[row[0].lower()] = (value, unit)
    except:
        pass
    df = pl.DataFrame(dict_props)

In [17]:
#Test Case
list_to_parse  = ['7.858 g/ cm3', "40.00% ", "29%", "163"]
for item in list_to_parse:
    value = re.search(r"[\d+\.\d+]+", item).group(0)
    unit = ''.join(item.split(' ')[1:])
    if unit == '':
        unit = re.search(r"[\D]+$", item).group(0)
    print('value', value)
    print('unit ', unit)

value 7.858
unit  g/cm3
value 40.00
unit  % 
value 29
unit  %


AttributeError: 'NoneType' object has no attribute 'group'

In [20]:
df_steel_properties = pl.DataFrame(dict_properties_mt.copy())
df_steel_properties

property,metric,unit
str,str,str
"""density (chemical composition …","""7.845""","""g/cc"""
"""melting point""","""1521""","""°C"""
"""tensile strength""","""620""","""MPa"""
"""yield strength""","""415""","""MPa"""
"""bulk modulus (typical for stee…","""140""","""GPa"""
…,…,…
"""izod impact (as rolled)""","""49""","""J"""
"""izod impact (normalized at 900…","""65""","""J"""
"""thermal expansion co-efficient…","""11.3""","""µm/m°C"""
"""thermal conductivity (@ 100°c/…","""50.7""","""W/mK"""


In [18]:
pl.DataFrame(properties_dict)

property,metric,unit
str,str,str
"""density (chemical composition …","""7.845""","""g/cc"""
"""melting point""","""1521""","""°C"""
"""tensile strength""","""620""","""MPa"""
"""yield strength""","""415""","""MPa"""
"""bulk modulus (typical for stee…","""140""","""GPa"""
…,…,…
"""izod impact (as rolled)""","""49""","""J"""
"""izod impact (normalized at 900…","""65""","""J"""
"""thermal expansion co-efficient…","""11.3""","""µm/m°C"""
"""thermal conductivity (@ 100°c/…","""50.7""","""W/mK"""


In [21]:
properties_dict['property']

['density (chemical composition of 0.435% c, 0.69% mn, 0.20% si, annealed at 860°c (1580°f))',
 'melting point',
 'tensile strength',
 'yield strength',
 'bulk modulus (typical for steels)',
 'shear modulus (typical for steels)',
 'elastic modulus',
 'elongation at break (in 50 mm)',
 'reduction of area',
 'izod impact (annealed at 790°c (1450°f))',
 'izod impact (as rolled)',
 'izod impact (normalized at 900°c (1650°f)',
 'thermal expansion co-efficient (@ 20-100°c/68-212°f, composition of 0.40% c, 0.11% mn, 0.01% p, 0.03% s, 0.03% si, 0.03% cu)',
 'thermal conductivity (@ 100°c/212°f)',
 'thermal conductivity (@ 0°c)']