In [14]:
!pip install polars beautifulsoup4 splinter selenium scikit-learn altair vl-convert-python tensorflow keras-tuner pandas



In [156]:
#Data Manipulation and display toools
import polars as pl
from polars.exceptions import InvalidOperationError
import altair as alt
import numpy as np
import pathlib
# Pandas is necessary for ease of use with the ML libraries
# Polars does not use indexing and therefore does not the following data format properly
#   {tablename: {index1: [row1_values]},
#               {index2: [row2_values]}}
import pandas as pd
from IPython.display import display_html 
#Necessary for tensorflow on my machine due to distutils being depreciated
import setuptools

# Web scraping tools
import re
from bs4 import BeautifulSoup
# Uses firefox browser
from splinter import Browser
import time

# Data preprocessing tools
from sklearn.preprocessing import StandardScaler, OneHotEncoder, KBinsDiscretizer, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

# Machine learning tools
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score


from tensorflow.keras.layers import Input, Dense
from tensorflow.keras import Sequential
import keras_tuner as kt
import tensorflow as tf

random_state = 2112250415

# Loading Data

In [16]:
cwd = pathlib.Path.cwd()

if cwd.name == 'Mild-Steel-Tempering':
    print("Path is project root")
else:
    print("Please correct current working directory to the project root")


Path is project root


In [17]:
resources_path = pathlib.PurePath(pathlib.PurePath(cwd), 'resources', 'pred_hardness')
resources_path

PurePosixPath('/home/mox/Documents/coding_projects/bootcamp_local/Homeworks/Mild-Steel-Tempering/resources/pred_hardness')

In [18]:
images_path = pathlib.PurePath(pathlib.PurePath(cwd), 'images', 'pred_hardness')
images_path

PurePosixPath('/home/mox/Documents/coding_projects/bootcamp_local/Homeworks/Mild-Steel-Tempering/images/pred_hardness')

### Many alloy composition columns were parsed incorrectly and failing to load

Several weight percent columns were parsed as int automatically due to having "0" for many initial rows. 

All weight percent columns should be parsed as float. 

In [19]:
data_path = f"{resources_path.parent}/Raiipa-tempering-data.csv"
schema_overrides = {"C (%wt)" : pl.Float64,
"Mn (%wt)" : pl.Float64,
"P (%wt)" : pl.Float64,
"S (%wt)" : pl.Float64,
"Si (%wt)" : pl.Float64,
"Ni (%wt)" : pl.Float64,
"Cr (%wt)" : pl.Float64,
"Mo (%wt)" : pl.Float64,
"V (%wt)" : pl.Float64,
"Al (%wt)" : pl.Float64,
"Cu (%wt)" : pl.Float64}

df_data = pl.read_csv(data_path, schema_overrides=schema_overrides)
df_data.glimpse()

Rows: 1466
Columns: 17
$ Source                                  <str> 'Grange and Baughman, 1956', 'Grange and Baughman, 1956', 'Grange and Baughman, 1956', 'Grange and Baughman, 1956', 'Grange and Baughman, 1956', 'Grange and Baughman, 1956', 'Grange and Baughman, 1956', 'Grange and Baughman, 1956', 'Grange and Baughman, 1956', 'Grange and Baughman, 1956'
$ Steel type                              <str> 'AISI-SAE 1026', 'AISI-SAE 1026', 'AISI-SAE 1026', 'AISI-SAE 1026', 'AISI-SAE 1026', 'AISI-SAE 1026', 'AISI-SAE 1026', 'AISI-SAE 1026', 'AISI-SAE 1026', 'AISI-SAE 1026'
$ Initial hardness (HRC) - post quenching <str> '?', '?', '?', '?', '?', '?', '?', '?', '?', '?'
$ Tempering time (s)                      <i64> 600, 600, 600, 600, 600, 600, 600, 600, 600, 600
$ Tempering temperature (ºC)              <f64> 204.4, 260.0, 315.6, 371.1, 426.7, 482.2, 537.8, 593.3, 648.9, 704.4
$ C (%wt)                                 <f64> 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25
$ Mn 

In [20]:
# save initial columns names as they are very descriptive and may be useful later
initial_column_names = df_data.columns
initial_column_names

['Source',
 'Steel type',
 'Initial hardness (HRC) - post quenching',
 'Tempering time (s)',
 'Tempering temperature (ºC)',
 'C (%wt)',
 'Mn (%wt)',
 'P (%wt)',
 'S (%wt)',
 'Si (%wt)',
 'Ni (%wt)',
 'Cr (%wt)',
 'Mo (%wt)',
 'V (%wt)',
 'Al (%wt)',
 'Cu (%wt)',
 'Final hardness (HRC) - post tempering']

# Cleaning
Many columns need renaming for ease of manipulation

Columns need datatypes correction
 'Initial hardness (HRC) - post quenching' needs datatype correction. ? is NA value


In [21]:
dict_new_cnames = {}
for og_name in initial_column_names:
    # replace filler in hardness columns
    new_name = og_name.replace(' - ', '')
    
    # Handle units
    try:
        #if alloy weight percent remove units and return only elemental symbol
        if re.search(r"\(%wt\)", og_name):
            new_name = og_name.split(' ')[0]
            #skip the rest of the try block that will re_add the units to the end
            pass
        else: 
            #If not elemental composition, lowercase the string
            new_name = new_name.lower()
        # regex find the units inside the parenthesis, of the original name, not the lowercased new name
        # This lines breaks and goes to except if there is no units
        unit = re.search(r'\((\w+)\)', og_name).group(1)
        # replace the unit parenthesis string with parenthesis with an empty string
        new_name = re.sub(r"\(.+\)", "", new_name)
        # trim to whitespace end characters left by some unit removals
        new_name = new_name.rstrip()
        # append the unit string to the end of the processed name
        new_name = f"{new_name}_{unit}"
    except:
        # skip unit processing on names with no units denoted by parenthesis
        pass

    # Strip away special characters
    new_name = new_name.encode("ascii", errors="ignore").decode()
    #replace all whitespace with underscores
    new_name = new_name.replace(' ', '_')
    #add the name to the rename dict
    dict_new_cnames[og_name] = new_name
dict_new_cnames


{'Source': 'source',
 'Steel type': 'steel_type',
 'Initial hardness (HRC) - post quenching': 'initial_hardness_post_quenching_HRC',
 'Tempering time (s)': 'tempering_time_s',
 'Tempering temperature (ºC)': 'tempering_temperature_C',
 'C (%wt)': 'C',
 'Mn (%wt)': 'Mn',
 'P (%wt)': 'P',
 'S (%wt)': 'S',
 'Si (%wt)': 'Si',
 'Ni (%wt)': 'Ni',
 'Cr (%wt)': 'Cr',
 'Mo (%wt)': 'Mo',
 'V (%wt)': 'V',
 'Al (%wt)': 'Al',
 'Cu (%wt)': 'Cu',
 'Final hardness (HRC) - post tempering': 'final_hardness_post_tempering_HRC'}

In [22]:
df_clean_cnames = df_data.rename(dict_new_cnames)

## Clean data types and column values

In [23]:
count_of_qmark = df_clean_cnames['initial_hardness_post_quenching_HRC'].value_counts()\
    .filter(pl.col('initial_hardness_post_quenching_HRC') == "?")\
        .select("count").item()


In [24]:
percent_intial_hardness_unknown = count_of_qmark / int(df_clean_cnames['initial_hardness_post_quenching_HRC'].shape[0])
print(f"Unknown initial hardness: {round(percent_intial_hardness_unknown, 2) * 100:.0f}%")


Unknown initial hardness: 65%


In [25]:
df_clean_cnames.tail(3)


source,steel_type,initial_hardness_post_quenching_HRC,tempering_time_s,tempering_temperature_C,C,Mn,P,S,Si,Ni,Cr,Mo,V,Al,Cu,final_hardness_post_tempering_HRC
str,str,str,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""Hollomon and Jaffe, 1945""","""1,15%C - plain carbon steel""","""64.5""",86400,500.0,1.15,0.58,0.012,0.021,0.09,0.0,0.01,0.0,0.0,0.0,0.0,32.0
"""Hollomon and Jaffe, 1945""","""1,15%C - plain carbon steel""","""64.5""",86400,600.0,1.15,0.58,0.012,0.021,0.09,0.0,0.01,0.0,0.0,0.0,0.0,23.0
"""Hollomon and Jaffe, 1945""","""1,15%C - plain carbon steel""","""64.5""",86400,700.0,1.15,0.58,0.012,0.021,0.09,0.0,0.01,0.0,0.0,0.0,0.0,4.5


In [26]:
df_clean_cnames["initial_hardness_post_quenching_HRC"].value_counts().sort('count', descending=True).head(3)

initial_hardness_post_quenching_HRC,count
str,u32
"""?""",949
"""66.5""",90
"""61.6""",51


In [27]:
df_clean = df_clean_cnames.with_columns(pl.col('initial_hardness_post_quenching_HRC').cast(pl.Float64, strict=False))


In [28]:
df_clean["initial_hardness_post_quenching_HRC"].value_counts().sort('count', descending=True).head(3)

initial_hardness_post_quenching_HRC,count
f64,u32
,949
66.5,90
63.1,51


In [29]:
df_clean.describe()

statistic,source,steel_type,initial_hardness_post_quenching_HRC,tempering_time_s,tempering_temperature_C,C,Mn,P,S,Si,Ni,Cr,Mo,V,Al,Cu,final_hardness_post_tempering_HRC
str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""","""1466""","""1466""",517.0,1466.0,1466.0,1466.0,1466.0,1466.0,1466.0,1466.0,1466.0,1466.0,1466.0,1466.0,1466.0,1466.0,1466.0
"""null_count""","""0""","""0""",949.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",,,61.493617,21969.754434,422.024147,0.511583,0.74073,0.017236,0.023802,0.239379,0.362838,0.389696,0.080232,0.005457,0.034379,0.005986,41.468008
"""std""",,,5.656383,34177.623863,176.088041,0.224354,0.252913,0.007966,0.007967,0.239193,0.810091,0.480721,0.121422,0.02905,0.20534,0.019085,14.079248
"""min""","""Grange and Baughman, 1956""","""0,31%C - plain carbon steel""",46.5,10.0,100.0,0.25,0.3,0.007,0.005,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.9
"""25%""",,,58.8,600.0,260.0,0.37,0.6,0.012,0.018,0.16,0.0,0.02,0.0,0.0,0.0,0.0,32.1
"""50%""",,,63.1,3600.0,426.7,0.42,0.74,0.017,0.024,0.21,0.01,0.06,0.0,0.0,0.0,0.0,43.1
"""75%""",,,66.5,14400.0,593.3,0.56,0.8,0.019,0.029,0.24,0.06,0.8,0.22,0.0,0.0,0.0,51.8
"""max""","""Penha, 2010""","""Nitriding Steel """,67.0,115200.0,704.4,1.15,1.85,0.054,0.055,1.62,3.41,1.57,0.36,0.16,1.26,0.08,68.5


# Making steel types values searchable on AZoM.com
Steel types that do not include AISI and a code are not searchable in steel databased and need manual renaming

Exmaple: "0,74%C - plain carbon steel" is a AISI 1074 steel. Confirmed via elemental composition on https://www.azom.com/article.aspx?ArticleID=6558


In [30]:
steel_identifiers = ["steel_type", "source"]
df_clean[steel_identifiers].unique().shape

(36, 2)

## Renaming all possible steels so they are searchable

1945 source

I can not access source papers. Making a best guess of steel grade from elemental composition

I assume some alloying elements are a bit strange due to wartime shortages and potential contamination between alloys due to the push to increasew prodiction.

### additional Justifications are inline

In [31]:
#1945 source...
# I can not access source for the 1945 paper on these
# I assume they alloying elements are a bit strange due to wartime shortages
# and potentially contamination between alloys due to the push to increasew prodiction
dict_rename = {'0,98%C - plain carbon steel': 'AISI 1095', # copper and Cr, probably intentional?
                   '1,15%C - plain carbon steel': 'AISI 1095', # small chromium impurity?
                   '0,74%C - plain carbon steel': 'AISI 1074 Carbon Steel', 
                   '0,56%C - plain carbon steel': 'AISI 1055', # small chromium impurity?
                   '0,89%C - plain carbon steel': 'AISI 1090', # Cu discounted as impurity
                   'Nitriding Steel ': 'Non-searchable', 
                   '0,31%C - plain carbon steel': 'AISI 1030', # AISI 1030, Cr impurity
                   # These 4 steels are automotive steels and free access to the properties is unavailable to my knowledge
                   "AISI-SAE 9264" : 'AISI-SAE 9254', # access limited, may be AISI-SAE 9264 , not in AZoM
                   "AISI-SAE 2340" : 'Non-searchable', # access limited, may be SAE J2340, not in AZoM
                   "AISI-SAE 3140" : 'Non-searchable', # Not in AZoM. elemental match to SAE 3140 https://www.steel-grades.com/metals/18/5155/-SAE-3140.html
                   "AISI-SAE 4068" : 'Non-searchable', # access limited, may be SAE 4068, not in AZoM
                
                    "AISI-SAE 4640" : "AISI 4640",
                   "AISI-SAE 4047" : "AISI 4047",
                   "AISI-SAE 1049" : "AISI 1049",
                   "AISI-SAE 6145" : "AISI 6145",
                   "AISI-SAE E52100" : "AISI 52100"} # # Not in AZoM elemental match to  SAE 4068https://www.steel-grades.com/Steel-Grades/Carbon-Steel/SAE-4068-.html

In [32]:
series_searchable_steel = df_clean.clone()
renamed = series_searchable_steel['steel_type'].replace(dict_rename)
series_searchable_steel = series_searchable_steel.with_columns(searchable = renamed)

In [33]:
series_to_search = series_searchable_steel['searchable'].unique()
steels_to_search = series_to_search.filter(series_to_search.eq('Non-searchable').not_())
steels_to_search

searchable
str
"""AISI-SAE 6150"""
"""AISI-SAE 1065"""
"""AISI-SAE 5140"""
"""AISI-SAE 1050"""
"""AISI-SAE 1035"""
…
"""AISI-SAE 1030"""
"""AISI-SAE 1040"""
"""AISI-SAE 9254"""
"""AISI 1030"""


## Save Searchable steel
Checkpoint so that the code to this point does not need to be run every time

In [34]:
path_save_searchable = f"{resources_path}/searchable_steels.csv"
pl.DataFrame(steels_to_search).write_csv(path_save_searchable)

# Web Scraping

Not all steels are documented in a way that allowed them to be searched.

Several steels are denoted by "x.xx% plain carbon steel." These are not standard searchable, except manually. A manual search was done, a few of the older steels do not conform to any known standard, particularly the steels with copper present. 

It is speculated that these are wartime steels from WW2 and alloying was done to reach the desired properties, not necessarily to adhere to a standard, especially considering there were material shortages, especially in some preferred alloying elements. The manufacturing crucibles also could not have been fully cleaned between batches of different material, resulting in contaimination between, say, a brass batch and a steel batch, resulting in elevated copper concentrations in the steel.

Some SAE are not available on the free services and were also not incuded in the final dataset.
---

Two stages of scraping are necessary.

AZoM.com does not have data on all steels, neither does MakeItFrom.com.

Scraping from both will be done to get the most infortmation possible. 

Using both we can scrape additional data for 29 of the 36 steels present in the Raiipa dataset. 

1286 rows retained of the original 1467.

In [35]:
len([print(steel) for steel in steels_to_search])

AISI-SAE 6150
AISI-SAE 1065
AISI-SAE 5140
AISI-SAE 1050
AISI-SAE 1035
AISI-SAE 1042
AISI-SAE 4027
AISI-SAE 5160
AISI-SAE 1038
AISI-SAE 1335
AISI-SAE 1026
AISI 1049
AISI 4640
AISI 6145
AISI 1055
AISI 4047
AISI 1074 Carbon Steel
AISI-SAE 4037
AISI 1090
AISI 52100
AISI-SAE 1045
AISI-SAE 1080
AISI-SAE 4140
AISI-SAE 4340
AISI-SAE 1030
AISI-SAE 1040
AISI-SAE 9254
AISI 1030
AISI 1095


29

## Scraping AZoM.com

In [36]:
browser = Browser('firefox')
base_url = "https://www.azom.com"
search_path = "/search.aspx?q="

In [37]:
# Search the steel 
def get_soup(soup):
    search_result = soup.find('div', class_='resultsContainer')
    first_item = search_result.find('a')
    # https://pytutorial.com/get-element-href-beautifulsoup/
    steel_link = first_item.get('href')
    time.sleep(0.15)
    browser.visit(f"{base_url}{steel_link}")
    soup2 = BeautifulSoup(browser.html, 'html.parser')
    return soup2


In [38]:
# already have elemental composition
# property - metric - imperial
# table_elements = tables[0]
# table_phys_props = tables[1]
# table_mecha_props = tables[2]
# table_therm_props = tables[3]
# table_other_desigs = tables[4]
#parse tables
def get_tables(soup):
    tables = []
    # parse all tables in the
    for html_table in soup.find_all('table'):
        table = []
        # parse rows in the table
        for tr in html_table.find_all('tr'):
            row = []
            #parse data cells in the row
            for t in tr.find_all(['th', 'td']):
                text = t.get_text(strip=True)
                #add data to row
                row.append(text)
            #add row to table
            table.append(row)
    #returns a list of list of lists
        tables.append(table)
    return tables
    

In [39]:
def make_dfs(lolols):
    tables = []
    # convert the list of lists to a dataframe
    for lol in lolols:
        try:
            df = pl.DataFrame(lol, orient='row')
            # rename columns using first row
            # https://stackoverflow.com/questions/75187317/how-to-rename-column-names-with-first-row-in-polars
            # [1:] removes the first row that replaced the column names 
            df = df.rename(df.head(1).to_dicts().pop())[1:]
            # add the dataframe to the list of tables on for this steel
            tables.append(df)
        except:
            tables.append(lol)
    return tables

In [40]:
dict_steel_tables = {}
for steel in steels_to_search:
    try:
        print("Working on:", steel)
        #search AZoM for the steel
        browser.visit(f"{base_url}{search_path}{steel}")
        soup_search = BeautifulSoup(browser.html, 'html.parser')
    
        #takes soup html and returns list of lists
        soup = get_soup(soup_search)
        lolols = get_tables(soup)
        dict_steel_tables[steel] = make_dfs(lolols)
        # tables list of lists and returns a dictionary
        # robots.txt indicates Crawl-delay: 120
        time.sleep(0.15)
    except: 
        dict_steel_tables[steel] = []

Working on: AISI-SAE 6150
Working on: AISI-SAE 1065
Working on: AISI-SAE 5140
Working on: AISI-SAE 1050
Working on: AISI-SAE 1035
Working on: AISI-SAE 1042
Working on: AISI-SAE 4027
Working on: AISI-SAE 5160
Working on: AISI-SAE 1038
Working on: AISI-SAE 1335
Working on: AISI-SAE 1026
Working on: AISI 1049
Working on: AISI 4640
Working on: AISI 6145
Working on: AISI 1055
Working on: AISI 4047
Working on: AISI 1074 Carbon Steel
Working on: AISI-SAE 4037
Working on: AISI 1090
Working on: AISI 52100
Working on: AISI-SAE 1045
Working on: AISI-SAE 1080
Working on: AISI-SAE 4140
Working on: AISI-SAE 4340
Working on: AISI-SAE 1030
Working on: AISI-SAE 1040
Working on: AISI-SAE 9254
Working on: AISI 1030
Working on: AISI 1095


In [41]:
dict_steel_tables

{'AISI-SAE 6150': [shape: (8, 2)
  ┌────────────────┬────────────────┐
  │ Element        ┆ Content (%)    │
  │ ---            ┆ ---            │
  │ str            ┆ str            │
  ╞════════════════╪════════════════╡
  │ Iron, Fe       ┆ 97.095 - 97.72 │
  │ Chromium, Cr   ┆ 0.800 - 1.10   │
  │ Manganese, Mn  ┆ 0.7 - 0.9      │
  │ Carbon, C      ┆ 0.480 - 0.530  │
  │ Silicon, Si    ┆ 0.150 - 0.3    │
  │ Vanadium, V    ┆ ≥ 0.150        │
  │ Sulfur, S      ┆ ≤ 0.04         │
  │ Phosphorous, P ┆ ≤ 0.0350       │
  └────────────────┴────────────────┘,
  shape: (1, 3)
  ┌────────────┬────────────┬──────────────┐
  │ Properties ┆ Metric     ┆ Imperial     │
  │ ---        ┆ ---        ┆ ---          │
  │ str        ┆ str        ┆ str          │
  ╞════════════╪════════════╪══════════════╡
  │ Density    ┆ 7.85 g/cm3 ┆ 0.284 lb/in³ │
  └────────────┴────────────┴──────────────┘,
  shape: (14, 3)
  ┌─────────────────────────────────┬─────────────┬─────────────────┐
  │ Properties 

# Converting AZoM Scraped Data to one dataframe

## Working with list of tables

Must confirma all tables were sucesffuly made into polars tables

Must reduce the numebr of tables to only those we are concerned with

* Remove "Other Designations" tables, including those that did not get sucessfully converted to a polars dataframe (table 4)

* Remove tables with properties that are not commonly present (thermal propterties table 3)

* Remove elemental compositions, already present in the data (table 0)

* Physical and mechanical properties are desired (tables 1,2)

In [42]:
dict_steel_tables_keep = {}
bool_check = True
polars_check = True 
for steel, tables in dict_steel_tables.items():
    all_polars = True
    count_polars = 0
    for table in tables:
        if isinstance(table, pl.DataFrame): 
            count_polars += 1 
    if count_polars != len(tables):
        all_polars = False
    if not (bool_check and all_polars):
        bool_check = False
    print("All polars?", all_polars, '|', len(tables), "Tables | Steel:", steel)
    dict_steel_tables_keep[steel] = tables[1:3]
print(polars_check)

All polars? True | 5 Tables | Steel: AISI-SAE 6150
All polars? True | 4 Tables | Steel: AISI-SAE 1065
All polars? True | 5 Tables | Steel: AISI-SAE 5140
All polars? True | 4 Tables | Steel: AISI-SAE 1050
All polars? False | 5 Tables | Steel: AISI-SAE 1035
All polars? True | 4 Tables | Steel: AISI-SAE 1042
All polars? True | 4 Tables | Steel: AISI-SAE 4027
All polars? True | 4 Tables | Steel: AISI-SAE 5160
All polars? True | 4 Tables | Steel: AISI-SAE 1038
All polars? True | 4 Tables | Steel: AISI-SAE 1335
All polars? True | 5 Tables | Steel: AISI-SAE 1026
All polars? True | 3 Tables | Steel: AISI 1049
All polars? True | 4 Tables | Steel: AISI 4640
All polars? True | 4 Tables | Steel: AISI 6145
All polars? True | 4 Tables | Steel: AISI 1055
All polars? True | 4 Tables | Steel: AISI 4047
All polars? True | 3 Tables | Steel: AISI 1074 Carbon Steel
All polars? True | 4 Tables | Steel: AISI-SAE 4037
All polars? True | 4 Tables | Steel: AISI 1090
All polars? True | 4 Tables | Steel: AISI 521

### Check if all kept tables are good

Confirm all are polars dataframes

Look at the shape of all of them and determine which will be limiting

In [43]:
dict_phys_props = {}
dict_mech_props = {}

polars_check = True 
for steel, tables in dict_steel_tables_keep.items():
    try:
        all_polars = True
        count_polars = 0
        for table in tables:
            if isinstance(table, pl.DataFrame): 
                count_polars += 1 
        if count_polars != len(tables):
            all_polars = False
        if not (bool_check and all_polars):
            bool_check = False
        print("All polars?", all_polars, '|', len(tables), "Tables | Steel:", steel)

        phys_props = tables[0]
        mech_props = tables[1]
        print("Physical properties:", phys_props.shape, "| Properties head(1)", phys_props.select(pl.col('Properties').head(1)) )
        print("Mechanical propsshape:", mech_props.shape, "| Properties head(1)", mech_props.select(pl.col('Properties').head(1)) )
        dict_phys_props[steel] = phys_props
        dict_mech_props[steel] = mech_props
    except:
        dict_phys_props[steel] = pl.DataFrame({'Properties' : [], 'Metric' : []})
        dict_mech_props[steel] = pl.DataFrame({'Properties' : [], 'Metric' : []})

print("All Polars?", polars_check)

All polars? True | 2 Tables | Steel: AISI-SAE 6150
Physical properties: (1, 3) | Properties head(1) shape: (1, 1)
┌────────────┐
│ Properties │
│ ---        │
│ str        │
╞════════════╡
│ Density    │
└────────────┘
Mechanical propsshape: (14, 3) | Properties head(1) shape: (1, 1)
┌────────────────────────────┐
│ Properties                 │
│ ---                        │
│ str                        │
╞════════════════════════════╡
│ Tensile strength, ultimate │
└────────────────────────────┘
All polars? True | 2 Tables | Steel: AISI-SAE 1065
Physical properties: (1, 3) | Properties head(1) shape: (1, 1)
┌────────────┐
│ Properties │
│ ---        │
│ str        │
╞════════════╡
│ Density    │
└────────────┘
Mechanical propsshape: (14, 3) | Properties head(1) shape: (1, 1)
┌────────────────────────────┐
│ Properties                 │
│ ---                        │
│ str                        │
╞════════════════════════════╡
│ Tensile strength, ultimate │
└──────────────────────────

# Filter for complete data, and select unit standard

Kpep only property and metric column from both sets of tables

Find which properties are present in all tables of each type

In [44]:
def remove_imperial(property_dict):
    new_dict = {}
    for key, table in property_dict.items():
        new_dict[key] = table.select(["Properties", "Metric"])
    return new_dict

dict_phys_props =  remove_imperial(dict_phys_props)
dict_mech_props = remove_imperial(dict_mech_props)


dict_phys_props['AISI-SAE 1050']

Properties,Metric
str,str
"""Density""","""7.85 g/cm3"""


# Properties to keep

Only properties present in all metals are the following:

Physical
* Density

Mechanical
* Elastic modulus
* Poisson's ratio

### Treat 2 values that do not have spaces seperating the units from the digits

In [45]:
print(dict_phys_props['AISI 1074 Carbon Steel'][0, "Metric"])
print(dict_mech_props['AISI-SAE 1065'][2, "Metric"])

7.7-8.03g/cm3
200GPa


In [46]:
#fix formatting in single cell. Did not incude a space 
# Could handle with regex, but onyl 2 errors are present so manual handling suffices
# The project data has fixed scope, so this is acceptable.
# arrives as '7.7-8.03g/cm3'
dict_phys_props['AISI 1074 Carbon Steel'][0, "Metric"] = '7.7-8.03 g/cm3'
# arrives as '200GPa'
dict_mech_props['AISI-SAE 1065'][2, "Metric"] = '200 GPa'
dict_mech_props['AISI-SAE 1065']

Properties,Metric
str,str
"""Tensile strength, ultimate""","""635 MPa"""
"""Tensile strength, yield""","""490 MPa"""
"""Modulus of elasticity""","""200 GPa"""
"""Bulk modulus (typical for stee…","""140 GPa"""
"""Shear modulus (typical for ste…","""80 GPa"""
…,…
"""Hardness, Knoop (converted fro…","""209"""
"""Hardness, Rockwell B (converte…","""90"""
"""Hardness, Rockwell C (converte…","""10"""
"""Hardness, Vickers (converted f…","""196"""


## Process Density into a df

In [47]:
s_steel = pl.Series(name = 'steel', dtype= pl.String)
s_density = pl.Series(name = 'density', dtype= pl.String)
s_unit = pl.Series(name = 'units_density', dtype= pl.String)

for steel, table in dict_phys_props.items():
    try:
        #select only the  cell of interest
        df_density = table.filter(pl.col('Properties').str.contains("Density"))['Metric'].str.split(' ')
        density = df_density.list.get(0)
        unit = df_density.list.get(1)
        # Make a series of all 3
        s_steel.extend(pl.Series(name = 'steel', values = [steel]))
        s_density = s_density.extend(density)
        s_unit = s_unit.extend(unit)
    except:
        print("Failed on", steel)

Failed on AISI-SAE 9254


In [48]:
df_steel_properties = pl.DataFrame([s_steel, s_density, s_unit])
df_steel_properties

steel,density,units_density
str,str,str
"""AISI-SAE 6150""","""7.85""","""g/cm3"""
"""AISI-SAE 1065""","""7.85""","""g/cm3"""
"""AISI-SAE 5140""","""7.85""","""g/cm3"""
"""AISI-SAE 1050""","""7.85""","""g/cm3"""
"""AISI-SAE 1035""","""7.85""","""g/cm3"""
…,…,…
"""AISI-SAE 4340""","""7.85""","""g/cm3"""
"""AISI-SAE 1030""","""7.85""","""g/cc"""
"""AISI-SAE 1040""","""7.845""","""g/cc"""
"""AISI 1030""","""7.85""","""g/cc"""


## Process mechanical properties

In [49]:
def grab_property(table, property):
    s_result = table.filter(pl.col('Properties')\
                            .str.to_lowercase()\
                                .str.contains(property))\
                                ['Metric']
    return s_result

In [50]:
def get_oobT(df, index):
    return df.list.get(index, null_on_oob=True)

In [51]:
# list of tuples if data was sucessfully scraped from AZoM
azom_sucessful = {'steel':[],'AZoM-Successful':[]}
list_of_scraped = [("steel", "pr", "em", "em_unit", "ys", "ys_unit", "uts", "uts_unit")]

for steel, table in dict_mech_props.items():
    pr = None
    em = None
    em_unit = None
    ys = None
    ys_unit = None
    uts = None
    uts_unit = None

    try:
        # Grab values and split into units where necessary
        #poisson's ratio
        pr = grab_property(table, "poisson")

        #elastic modulus
        df_em = grab_property(table, "elastic").str.split(' ')
        em = get_oobT(df_em, 0)
        em_unit = get_oobT(df_em, 1)
        # em = df_em.list.get(0, null_on_oob=True)
        # em_unit = df_em.list.get(1, null_on_oob=True)

        # grab yield strength
        df_ys = grab_property(table, "yield").str.split(' ')
        ys = get_oobT(df_ys, 0)
        ys_unit = get_oobT(df_ys, 1)

        # grab strength value that is not yield strength.
        # Tensile strength or ultimate yield strength or ultimate tensile strength. All the same measurement
        df_uts = table.filter(pl.col('Properties').str.to_lowercase().str.contains("strength") & 
                            pl.col('Properties').str.to_lowercase().str.contains("yield").not_())\
                                ['Metric'].str.split(' ')
        uts = get_oobT(df_uts, 0)
        uts_unit = get_oobT(df_uts, 1)
    
        # Make a tuple for each row
        # If this fails it will be an IndexError and go to the except
        row = (steel, pr[0], em[0], em_unit[0], ys[0], ys_unit[0], uts[0], uts_unit[0])

        list_of_scraped.append(row)
        azom_sucessful['steel'].append(steel)
        azom_sucessful['AZoM-Successful'].append(True)
        
    except (IndexError, InvalidOperationError):
        print("Failed on", steel)
        azom_sucessful['steel'].append(steel)
        azom_sucessful['AZoM-Successful'].append(False)

df_AZoM_successfull = pl.DataFrame(azom_sucessful)
df_AZoM_successfull.filter(pl.col('AZoM-Successful')==False)

Failed on AISI-SAE 1042
Failed on AISI-SAE 1335
Failed on AISI 1049
Failed on AISI 4047
Failed on AISI-SAE 4037
Failed on AISI 52100
Failed on AISI-SAE 1080
Failed on AISI-SAE 9254


steel,AZoM-Successful
str,bool
"""AISI-SAE 1042""",False
"""AISI-SAE 1335""",False
"""AISI 1049""",False
"""AISI 4047""",False
"""AISI-SAE 4037""",False
"""AISI 52100""",False
"""AISI-SAE 1080""",False
"""AISI-SAE 9254""",False


## Convert to dataframes and save

Checkpoint for continued work in this file so the scraping does not need to be done every time.

In [52]:
ttCdf_temp = pl.DataFrame(list_of_scraped, orient='row')
# rename columns using first row
# https://stackoverflow.com/questions/75187317/how-to-rename-column-names-with-first-row-in-polars
# [1:] removes the first row that replaced the column names 
ttCdf_temp = ttCdf_temp.rename(ttCdf_temp.head(1).to_dicts().pop())[1:]
ttCdf_temp.tail(3)

steel,pr,em,em_unit,ys,ys_unit,uts,uts_unit
str,str,str,str,str,str,str,str
"""AISI-SAE 1040""","""0.27-0.30""","""190-210""","""GPa""","""415""","""MPa""","""620""","""MPa"""
"""AISI 1030""","""0.27-0.30""","""190-210""","""GPa""","""440""","""MPa""","""525""","""MPa"""
"""AISI 1095""","""0.27-0.30""","""190-210""","""GPa""","""525""","""MPa""","""685""","""MPa"""


In [53]:
df_steel_properties = df_steel_properties.join(ttCdf_temp, on='steel', how='inner')
df_steel_properties.write_csv(f'{resources_path}/scraped_properties.csv')
df_steel_properties


steel,density,units_density,pr,em,em_unit,ys,ys_unit,uts,uts_unit
str,str,str,str,str,str,str,str,str,str
"""AISI-SAE 6150""","""7.85""","""g/cm3""","""0.27 – 0.30""","""190-210""","""GPa""","""415""","""MPa""","""670""","""MPa"""
"""AISI-SAE 1065""","""7.85""","""g/cm3""","""0.27-0.30""","""200""","""GPa""","""490""","""MPa""","""635""","""MPa"""
"""AISI-SAE 5140""","""7.85""","""g/cm3""","""0.27-0.30""","""190-210""","""GPa""","""295""","""MPa""","""570""","""MPa"""
"""AISI-SAE 1050""","""7.85""","""g/cm3""","""0.27-0.30""","""190-210""","""GPa""","""580""","""MPa""","""690""","""MPa"""
"""AISI-SAE 1035""","""7.85""","""g/cm3""","""0.27-0.30""","""190-210""","""GPa""","""370""","""MPa""","""585""","""MPa"""
…,…,…,…,…,…,…,…,…,…
"""AISI-SAE 4340""","""7.85""","""g/cm3""","""0.27-0.30""","""190-210""","""GPa""","""470""","""MPa""","""745""","""MPa"""
"""AISI-SAE 1030""","""7.85""","""g/cc""","""0.27-0.30""","""190-210""","""GPa""","""440""","""MPa""","""525""","""MPa"""
"""AISI-SAE 1040""","""7.845""","""g/cc""","""0.27-0.30""","""190-210""","""GPa""","""415""","""MPa""","""620""","""MPa"""
"""AISI 1030""","""7.85""","""g/cc""","""0.27-0.30""","""190-210""","""GPa""","""440""","""MPa""","""525""","""MPa"""


In [54]:
s_AZOM_unsuccessful = df_AZoM_successfull.filter(pl.col('AZoM-Successful')==False)['steel']
pl.DataFrame(s_AZOM_unsuccessful).write_csv(f'{resources_path}/MIF_search.csv')

# Process units from scraped AZoM data

Ensure all units are base units and the same down each column

In [55]:
data_path = f"{resources_path}/scraped_properties.csv"
df_scraped = pl.read_csv(data_path)
df_scraped.glimpse()

Rows: 21
Columns: 10
$ steel         <str> 'AISI-SAE 6150', 'AISI-SAE 1065', 'AISI-SAE 5140', 'AISI-SAE 1050', 'AISI-SAE 1035', 'AISI-SAE 4027', 'AISI-SAE 5160', 'AISI-SAE 1038', 'AISI-SAE 1026', 'AISI 4640'
$ density       <str> '7.85', '7.85', '7.85', '7.85', '7.85', '7.85', '7.85', '7.845', '7.858', '7.75'
$ units_density <str> 'g/cm3', 'g/cm3', 'g/cm3', 'g/cm3', 'g/cm3', 'g/cm3', 'g/cm3', 'g/cm3', 'g/', 'g/cm3'
$ pr            <str> '0.27 – 0.30', '0.27-0.30', '0.27-0.30', '0.27-0.30', '0.27-0.30', '0.27-0.30', '0.27 – 0.30', '0.27-0.30', '0.27-0.30', '0.27-030'
$ em            <str> '190-210', '200', '190-210', '190-210', '190-210', '190-210', '190-210', '190-210', '190-210', '201-209'
$ em_unit       <str> 'GPa', 'GPa', 'GPa', 'GPa', 'GPa', 'GPa', 'GPa', 'GPa', 'GPa', 'GPa'
$ ys            <i64> 415, 490, 295, 580, 370, 325, 275, 485, 415, 1103
$ ys_unit       <str> 'MPa', 'MPa', 'MPa', 'MPa', 'MPa', 'MPa', 'MPa', 'MPa', 'MPa', 'MPa'
$ uts           <i64> 670, 635, 570, 690, 585,

In [56]:
df_scraped.select(['units_density', 'em_unit', "ys_unit", "uts_unit"]).unique()

units_density,em_unit,ys_unit,uts_unit
str,str,str,str
"""g/""","""GPa""","""MPa""","""MPa"""
"""g/cm3""","""GPa""","""MPa""","""MPa"""
"""g/cc""","""GPa""","""MPa""","""MPa"""


## All units are valid and ready to be added to header and unit column removed

All units in units_density are a 1:1 conversion with g/cm3

g/mL (water) = g/cc = g/cm3 = grams per cubic centimeter

Converns:
* emdash and hyphens present
* AISI-SAE 1065 did not separate from it's decimal number.
* AISI-SAE 1026 indicatse g/ but was originally "g/ cm3"

In [57]:
df_scraped_unit_header = df_scraped.select(['steel', 
                                            'density', 
                                            'pr', 
                                            'em', 
                                            'ys',
                                            'uts']).rename({'density' : 'density_g_per_cm3',
                                                                        'em' : 'em_GPa',
                                                                        'ys' : 'ys_MPa',
                                                                        'uts' : 'uts_MPa'})

print(df_scraped_unit_header.shape)    
df_scraped_unit_header.head()

(21, 6)


steel,density_g_per_cm3,pr,em_GPa,ys_MPa,uts_MPa
str,str,str,str,i64,i64
"""AISI-SAE 6150""","""7.85""","""0.27 – 0.30""","""190-210""",415,670
"""AISI-SAE 1065""","""7.85""","""0.27-0.30""","""200""",490,635
"""AISI-SAE 5140""","""7.85""","""0.27-0.30""","""190-210""",295,570
"""AISI-SAE 1050""","""7.85""","""0.27-0.30""","""190-210""",580,690
"""AISI-SAE 1035""","""7.85""","""0.27-0.30""","""190-210""",370,585


In [58]:
df_scraped_unit_header.describe()

statistic,steel,density_g_per_cm3,pr,em_GPa,ys_MPa,uts_MPa
str,str,str,str,str,f64,f64
"""count""","""21""","""21""","""21""","""21""",21.0,21.0
"""null_count""","""0""","""0""","""0""","""0""",0.0,0.0
"""mean""",,,,,508.47619,680.190476
"""std""",,,,,223.223569,200.888183
"""min""","""AISI 1030""","""7.7-8.03""","""0.27 – 0.30""","""190-210""",275.0,490.0
"""25%""",,,,,415.0,570.0
"""50%""",,,,,450.0,650.0
"""75%""",,,,,525.0,690.0
"""max""","""AISI-SAE 6150""","""7.87""","""0.29""","""201-209""",1165.0,1276.0


## Process ranges to an average value

### Helper functions for mapping

In [59]:
df_scraped_clean = df_scraped_unit_header.clone()


In [60]:
# string to number conversion that is aware all values arefrom sklearn.cluster import KMeans, AgglomerativeClustering
def strip_string(string):
    return string.strip()

def if_no_decimal_point(num):
    #If leading 0 followed by number then replace leading 0 with "0."
    regex = r"^0\d"
    if re.search(regex, num):
        #replace the 
        num = re.sub("^0", "0.", num)
    return np.float64(num)

def process_ranged_val_cleaned(col, df=df_scraped_clean):
    # AISI-SAE 5160 and AISI-SAE 6150 use a different dash than the others for the pr range
    # Discovered when 2 nulls showed up after processing
        # mean could not process the string and resulted in a null
    s_same_dashes = df[col].str.replace('–', '-')
    # Split the range froma string into two strings of numbers
    s_split = s_same_dashes.str.split('-')
    # Convert these decimal points to numbers 
        #aware that some decimal values are formatted 0## with no .
    s_numeric_l = s_split.map_elements(lambda x: list(map(if_no_decimal_point, x)))
    # take middle of the two values
    return s_numeric_l.list.mean()

# Process Units 

In [61]:
# Split the range froma string into two strings of numbers
s_split = df_scraped_clean['density_g_per_cm3'].str.split('-')
# Convert these decimal points to numbers 
s_numeric_l = s_split.map_elements(lambda x: list(map(if_no_decimal_point, x)))
# take middle of the two values
s_mean_d = s_numeric_l.list.mean()

  s_numeric_l = s_split.map_elements(lambda x: list(map(if_no_decimal_point, x)))


In [62]:
s_mean_p = process_ranged_val_cleaned("pr")
s_mean_em = process_ranged_val_cleaned("em_GPa")

  s_numeric_l = s_split.map_elements(lambda x: list(map(if_no_decimal_point, x)))


In [63]:
df_AZoM_clean = df_scraped_clean.with_columns(poissons_ratio = s_mean_p, 
                                         density_g_per_cm3 = s_mean_d, 
                                         elastic_modulus_GPa = s_mean_em,
                                         yield_strength_MPa = df_scraped_unit_header['ys_MPa'],
                                         ultimate_strength_MPA = df_scraped_unit_header['uts_MPa'])\
                                            .select(["steel",
                                                     "density_g_per_cm3",
                                                     "elastic_modulus_GPa",
                                                     "yield_strength_MPa",
                                                     "ultimate_strength_MPA"])
df_AZoM_clean.describe()

statistic,steel,density_g_per_cm3,elastic_modulus_GPa,yield_strength_MPa,ultimate_strength_MPA
str,str,f64,f64,f64,f64
"""count""","""21""",21.0,21.0,21.0,21.0
"""null_count""","""0""",0.0,0.0,0.0,0.0
"""mean""",,7.842048,200.47619,508.47619,680.190476
"""std""",,0.031154,1.503963,223.223569,200.888183
"""min""","""AISI 1030""",7.75,200.0,275.0,490.0
"""25%""",,7.85,200.0,415.0,570.0
"""50%""",,7.85,200.0,450.0,650.0
"""75%""",,7.85,200.0,525.0,690.0
"""max""","""AISI-SAE 6150""",7.87,205.0,1165.0,1276.0


# Drop columns that do not add information

Poisson's Ratio, Density, and Elastic Modulus are singletons. 

They do not add information. 

They are effecively constant down the entire database

In [64]:
df_AZoM_clean = df_AZoM_clean.select(['steel',
                                    "yield_strength_MPa",
                                    "ultimate_strength_MPA"])
df_AZoM_clean.describe()

statistic,steel,yield_strength_MPa,ultimate_strength_MPA
str,str,f64,f64
"""count""","""21""",21.0,21.0
"""null_count""","""0""",0.0,0.0
"""mean""",,508.47619,680.190476
"""std""",,223.223569,200.888183
"""min""","""AISI 1030""",275.0,490.0
"""25%""",,415.0,570.0
"""50%""",,450.0,650.0
"""75%""",,525.0,690.0
"""max""","""AISI-SAE 6150""",1165.0,1276.0


In [65]:
df_AZoM_clean.write_csv(f'{resources_path}/scraped_properties.csv')

## Scraping MakeItFrom.com
DuckDuckGo search used since the steels of question are split in two categories and MakeItFrom does not have a search function.

In [66]:
# https://www.makeitfrom.com/material-group/Wrought-Alloy-Steel-SAE-AISI
# OR
# https://www.makeitfrom.com/material-group/Wrought-Carbon-Or-Non-Alloy-Steel
base_url = "https://www.makeitfrom.com"
search_url = "https://duckduckgo.com/?hps=1&q="
search_suffix = "+site%3Amakeitfrom.com&atb=v427-1&ia=web"

In [67]:
s_AZOM_unsuccessful

steel
str
"""AISI-SAE 1042"""
"""AISI-SAE 1335"""
"""AISI 1049"""
"""AISI 4047"""
"""AISI-SAE 4037"""
"""AISI 52100"""
"""AISI-SAE 1080"""
"""AISI-SAE 9254"""


In [68]:
def handle_ranges(values_list):
    if values_list[1] == 'to':
        # map the range of values to a list of integers, then take the mean of it.
        value = np.mean(list(map(int, values_list[0:3:2])))
        units = values_list[3]
    else:
        value = np.float64(values_list[0])
        units = values_list[1]
    return value, units

In [69]:
def get_row(soup):
    mech_props = soup.find_all('div', class_='mech')
    # -2 is a magic number
    # the last 2 values are always tensile strength
    uts = 'no_label'
    uts_units = 'no_units'
    ys = 'no_label'
    ys_units = 'no_units'

    for div in mech_props[-2:]:
        ps = div.find_all('p')
        # get the property name
        label = ps[0].text
        # get the string of values and units and split it
        values = ps[1].text.split(' ')
        # Grab the metric value 
        value, units = handle_ranges(values)
        if 'uts' in label.lower():
            uts = value
            uts_units = units
        elif 'yield' in label.lower():
            ys = value
            ys_units = units
    row = (steel, ys, ys_units, uts, uts_units)
    return row

In [70]:
def handle_cold_drawn():
    # Browser is already on steel page
    soup = BeautifulSoup(browser.html, 'html.parser')
    cold_drawn_present = True
    steel_page = soup.find('div', class_='split links break-mid')
    # https://stackoverflow.com/questions/33404049/navigation-with-beautifulsoup
    
    cold_drawn = steel_page.find('a', string= lambda text: text \
                            #cold drawn matches values from AZoM for steels that are on both
                                and "Cold Drawn" in text\
                                # not "and" removed other processing in addtional to cold drawn if applicable
                                and not "and" in text) 
    try: 
        cold_drawn_page = cold_drawn.get('href')
        time.sleep(0.15)
        browser.visit(f'{base_url}{cold_drawn_page}')
    except AttributeError:
        # do not go to the cold drawn page if not present
        pass    

In [71]:

list_of_scraped = [("steel", "ys", "ys_unit", "uts", "uts_unit")] 
for steel in s_AZOM_unsuccessful:
    print("Working on:", steel)
    #search MIF for the steel
    time.sleep(0.15)
    browser.visit(f"{search_url}{steel}{search_suffix}")
    soup_search = BeautifulSoup(browser.html, 'html.parser')
    steel_link = soup_search.find('a',{'data-testid': "result-extras-url-link"}).get('href')
    
    time.sleep(0.15)
    browser.visit(steel_link)
    # handle cold drawn will land us on the desired steel page
    handle_cold_drawn()
    soup_results = BeautifulSoup(browser.html, "html.parser")
    list_of_scraped.append(get_row(soup_results))

browser.quit()

list_of_scraped

Working on: AISI-SAE 1042
Working on: AISI-SAE 1335
Working on: AISI 1049
Working on: AISI 4047
Working on: AISI-SAE 4037
Working on: AISI 52100
Working on: AISI-SAE 1080
Working on: AISI-SAE 9254


[('steel', 'ys', 'ys_unit', 'uts', 'uts_unit'),
 ('AISI-SAE 1042', 580.0, 'MPa', 700.0, 'MPa'),
 ('AISI-SAE 1335', 300.0, 'MPa', 550.0, 'MPa'),
 ('AISI 1049', 640.0, 'MPa', 750.0, 'MPa'),
 ('AISI 4047', 310.0, 'MPa', 580.0, 'MPa'),
 ('AISI-SAE 4037', 290.0, 'MPa', 540.0, 'MPa'),
 ('AISI 52100', 460.0, 'MPa', 1300.0, 'MPa'),
 ('AISI-SAE 1080', 535.0, 'MPa', 820.0, 'MPa'),
 ('AISI-SAE 9254', 410.0, 'MPa', 660.0, 'MPa')]

In [72]:
df_MIF = pl.DataFrame(list_of_scraped, orient='row')
df_MIF = df_MIF.rename(df_MIF.head(1).to_dicts().pop())[1:]
df_MIF

steel,ys,ys_unit,uts,uts_unit
str,str,str,str,str
"""AISI-SAE 1042""","""580""","""MPa""","""700""","""MPa"""
"""AISI-SAE 1335""","""300""","""MPa""","""550""","""MPa"""
"""AISI 1049""","""640""","""MPa""","""750""","""MPa"""
"""AISI 4047""","""310""","""MPa""","""580""","""MPa"""
"""AISI-SAE 4037""","""290""","""MPa""","""540""","""MPa"""
"""AISI 52100""","""460""","""MPa""","""1300""","""MPa"""
"""AISI-SAE 1080""","""535""","""MPa""","""820""","""MPa"""
"""AISI-SAE 9254""","""410""","""MPa""","""660""","""MPa"""


In [73]:
df_MIF_clean = df_MIF.select(['steel', 'ys', 'uts']).rename({'ys' : 'yield_strength_MPa',
                                                             'uts' : 'ultimate_strength_MPA'})
df_MIF_clean

steel,yield_strength_MPa,ultimate_strength_MPA
str,str,str
"""AISI-SAE 1042""","""580""","""700"""
"""AISI-SAE 1335""","""300""","""550"""
"""AISI 1049""","""640""","""750"""
"""AISI 4047""","""310""","""580"""
"""AISI-SAE 4037""","""290""","""540"""
"""AISI 52100""","""460""","""1300"""
"""AISI-SAE 1080""","""535""","""820"""
"""AISI-SAE 9254""","""410""","""660"""


In [74]:
df_AZoM_clean

steel,yield_strength_MPa,ultimate_strength_MPA
str,i64,i64
"""AISI-SAE 6150""",415,670
"""AISI-SAE 1065""",490,635
"""AISI-SAE 5140""",295,570
"""AISI-SAE 1050""",580,690
"""AISI-SAE 1035""",370,585
…,…,…
"""AISI-SAE 4340""",470,745
"""AISI-SAE 1030""",440,525
"""AISI-SAE 1040""",415,620
"""AISI 1030""",440,525


In [75]:
df_complete_props = pl.concat([df_AZoM_clean, df_MIF_clean], how='vertical_relaxed')
df_complete_props = df_complete_props.with_columns(pl.col('yield_strength_MPa').cast(pl.Int64),
                                       pl.col('ultimate_strength_MPA').cast(pl.Int64))
df_complete_props

steel,yield_strength_MPa,ultimate_strength_MPA
str,i64,i64
"""AISI-SAE 6150""",415,670
"""AISI-SAE 1065""",490,635
"""AISI-SAE 5140""",295,570
"""AISI-SAE 1050""",580,690
"""AISI-SAE 1035""",370,585
…,…,…
"""AISI 4047""",310,580
"""AISI-SAE 4037""",290,540
"""AISI 52100""",460,1300
"""AISI-SAE 1080""",535,820


In [76]:
df_complete_props.write_csv(f'{resources_path}/scraped_properties.csv')

# Join properties data with tempering data
Inner join to exclude the steels that are not searchable with the given information, or that do not conform to any known and accessable standards.

1286 rows after join

In [77]:
df_complete_props = df_complete_props.rename({'steel':'searchable'})
data = series_searchable_steel.join(df_complete_props, how="inner", on='searchable')

In [78]:
data.shape

(1286, 20)

# Select columns that have relevant data for the analysis

Exclude:
* Source data - does not provide information about the problem
* initial hardness - very incomplete data and no accurate way to fill it
* searchable -  contains more information than searchable, but is likely relipcated in elemental
  * Searchable contains fewer values, this grouping was used to scrape the strength values and contains no additional information
  * steel_type contains steel denotation from the papers. The informationa in this column is replicated in the elemental composition columns
  * PCA will handle this


In [79]:
df_relevent = data.drop(['source', 'searchable', 'initial_hardness_post_quenching_HRC'])
df_relevent.columns

['steel_type',
 'tempering_time_s',
 'tempering_temperature_C',
 'C',
 'Mn',
 'P',
 'S',
 'Si',
 'Ni',
 'Cr',
 'Mo',
 'V',
 'Al',
 'Cu',
 'final_hardness_post_tempering_HRC',
 'yield_strength_MPa',
 'ultimate_strength_MPA']

# Preprocessing the data for machine learning

In [80]:
target_columns = ['final_hardness_post_tempering_HRC']

X_prepre = df_relevent.drop(target_columns)

y_prepre = df_relevent.select(target_columns)

In [81]:
y_prepre.head(3)

final_hardness_post_tempering_HRC
f64
50.6
48.3
43.7


In [82]:
y_prepre.describe()

statistic,final_hardness_post_tempering_HRC
str,f64
"""count""",1286.0
"""null_count""",0.0
"""mean""",41.810653
"""std""",14.373455
"""min""",0.9
"""25%""",32.5
"""50%""",43.6
"""75%""",52.3
"""max""",68.5


# Preprocess target values

Target values are continuious.

TODO: determine preprocessing, if any. Test effects of scaling vs not.

In [83]:
y_scaler = StandardScaler()

y_scaler.fit(y_prepre)

y = y_scaler.transform(y_prepre)

# Preprocess data values

In [84]:
print(X_prepre.shape)
X_prepre.head(3)

(1286, 16)


steel_type,tempering_time_s,tempering_temperature_C,C,Mn,P,S,Si,Ni,Cr,Mo,V,Al,Cu,yield_strength_MPa,ultimate_strength_MPA
str,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64
"""AISI-SAE 1026""",600,204.4,0.25,0.79,0.012,0.026,0.11,0.0,0.0,0.0,0.0,0.0,0.0,415,490
"""AISI-SAE 1026""",600,260.0,0.25,0.79,0.012,0.026,0.11,0.0,0.0,0.0,0.0,0.0,0.0,415,490
"""AISI-SAE 1026""",600,315.6,0.25,0.79,0.012,0.026,0.11,0.0,0.0,0.0,0.0,0.0,0.0,415,490


# Encode and scale variables
Most columns are numeric and range from 0.0# to 1000 so scaling is necessary
steel names are categorical and would benefit from one-hot encoding.

In [85]:
to_scale = X_prepre.drop('steel_type')
check_onehot = X_prepre.select('steel_type')

## One hot encode steel types

Encoding chosen because this is a categorical for a given material and testing suite

### Unify steel names so there is only 1 column per steel code

In [86]:
print(len(check_onehot.unique()))
# correct duplicate name 
dict_rename.update({'AISI-SAE 1030': "AISI 1030"})
to_onehot = check_onehot['steel_type']\
    .replace(dict_rename)\
    .str.strip_chars()
[print(val) for val in to_onehot.unique()]
print(len(to_onehot.unique()))

30
AISI 1074 Carbon Steel
AISI-SAE 1050
AISI-SAE 1045
AISI 6145
AISI-SAE 4140
AISI-SAE 1040
AISI-SAE 5140
AISI-SAE 6150
AISI-SAE 9254
AISI 4640
AISI-SAE 1042
AISI 52100
AISI-SAE 4037
AISI 1095
AISI 1090
AISI-SAE 4027
AISI 1055
AISI 1030
AISI-SAE 1026
AISI-SAE 4340
AISI-SAE 1038
AISI-SAE 1335
AISI 1049
AISI-SAE 1065
AISI-SAE 5160
AISI-SAE 1080
AISI 4047
AISI-SAE 1035
28


In [87]:
to_onehot = pl.DataFrame(to_onehot)
to_onehot.shape

(1286, 1)

In [88]:
# create the transformers
ohe = OneHotEncoder(handle_unknown='ignore')

oh_transformed = ohe.fit_transform(to_onehot)

# get the transformed column names
ohe_names = ohe.get_feature_names_out()

# 4 stack exchange methods tried. Then I made this one up ¯\_(ツ)_/¯
# Combinated of recenly changed API and polars I think, no way to rename all columns like in pandas.
columns = [f'column_{x}' for x in range(0, 30)]
rename_dict = dict(zip(columns, ohe_names))

df_ohe_encoded = pl.DataFrame(oh_transformed.toarray()).rename(rename_dict)
df_ohe_encoded.tail(3)

steel_type_AISI 1030,steel_type_AISI 1049,steel_type_AISI 1055,steel_type_AISI 1074 Carbon Steel,steel_type_AISI 1090,steel_type_AISI 1095,steel_type_AISI 4047,steel_type_AISI 4640,steel_type_AISI 52100,steel_type_AISI 6145,steel_type_AISI-SAE 1026,steel_type_AISI-SAE 1035,steel_type_AISI-SAE 1038,steel_type_AISI-SAE 1040,steel_type_AISI-SAE 1042,steel_type_AISI-SAE 1045,steel_type_AISI-SAE 1050,steel_type_AISI-SAE 1065,steel_type_AISI-SAE 1080,steel_type_AISI-SAE 1335,steel_type_AISI-SAE 4027,steel_type_AISI-SAE 4037,steel_type_AISI-SAE 4140,steel_type_AISI-SAE 4340,steel_type_AISI-SAE 5140,steel_type_AISI-SAE 5160,steel_type_AISI-SAE 6150,steel_type_AISI-SAE 9254
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Scale numerical columns 
This will ensure the small elemental composition values are not overshadowed by the large strength properties 

In [89]:
print(type(to_scale))
to_scale.tail(3)

<class 'polars.dataframe.frame.DataFrame'>


tempering_time_s,tempering_temperature_C,C,Mn,P,S,Si,Ni,Cr,Mo,V,Al,Cu,yield_strength_MPa,ultimate_strength_MPA
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64
86400,500.0,1.15,0.58,0.012,0.021,0.09,0.0,0.01,0.0,0.0,0.0,0.0,525,685
86400,600.0,1.15,0.58,0.012,0.021,0.09,0.0,0.01,0.0,0.0,0.0,0.0,525,685
86400,700.0,1.15,0.58,0.012,0.021,0.09,0.0,0.01,0.0,0.0,0.0,0.0,525,685


In [90]:
scaler = StandardScaler()

X_scaler = scaler.fit(to_scale)

scaled = scaler.transform(to_scale)

scaler_names = scaler.get_feature_names_out()
columns = [f'column_{x}' for x in range(0, 14)]
rename_dict = dict(zip(columns, scaler_names))

df_scaled = pl.DataFrame(scaled).rename(rename_dict)
df_scaled.tail(3)


tempering_time_s,tempering_temperature_C,C,Mn,P,S,Si,Ni,Cr,Mo,V,Al,Cu,yield_strength_MPa,column_14
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1.89775,0.467987,2.69532,-0.618793,-0.60703,-0.394811,-0.591857,-0.392117,-0.791743,-0.638219,-0.201129,0.0,-0.337311,0.188703,-0.105424
1.89775,1.028689,2.69532,-0.618793,-0.60703,-0.394811,-0.591857,-0.392117,-0.791743,-0.638219,-0.201129,0.0,-0.337311,0.188703,-0.105424
1.89775,1.589391,2.69532,-0.618793,-0.60703,-0.394811,-0.591857,-0.392117,-0.791743,-0.638219,-0.201129,0.0,-0.337311,0.188703,-0.105424


## Merge one hot encoded variables with scaled numeric variables

In [91]:
X = pl.concat([df_ohe_encoded, df_scaled], how='horizontal')
X

steel_type_AISI 1030,steel_type_AISI 1049,steel_type_AISI 1055,steel_type_AISI 1074 Carbon Steel,steel_type_AISI 1090,steel_type_AISI 1095,steel_type_AISI 4047,steel_type_AISI 4640,steel_type_AISI 52100,steel_type_AISI 6145,steel_type_AISI-SAE 1026,steel_type_AISI-SAE 1035,steel_type_AISI-SAE 1038,steel_type_AISI-SAE 1040,steel_type_AISI-SAE 1042,steel_type_AISI-SAE 1045,steel_type_AISI-SAE 1050,steel_type_AISI-SAE 1065,steel_type_AISI-SAE 1080,steel_type_AISI-SAE 1335,steel_type_AISI-SAE 4027,steel_type_AISI-SAE 4037,steel_type_AISI-SAE 4140,steel_type_AISI-SAE 4340,steel_type_AISI-SAE 5140,steel_type_AISI-SAE 5160,steel_type_AISI-SAE 6150,steel_type_AISI-SAE 9254,tempering_time_s,tempering_temperature_C,C,Mn,P,S,Si,Ni,Cr,Mo,V,Al,Cu,yield_strength_MPa,column_14
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.617861,-1.189448,-1.155898,0.167872,-0.60703,0.268364,-0.513467,-0.392117,-0.813526,-0.638219,-0.201129,0.0,-0.337311,-0.342255,-1.009596
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.617861,-0.877697,-1.155898,0.167872,-0.60703,0.268364,-0.513467,-0.392117,-0.813526,-0.638219,-0.201129,0.0,-0.337311,-0.342255,-1.009596
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.617861,-0.565947,-1.155898,0.167872,-0.60703,0.268364,-0.513467,-0.392117,-0.813526,-0.638219,-0.201129,0.0,-0.337311,-0.342255,-1.009596
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.617861,-0.254758,-1.155898,0.167872,-0.60703,0.268364,-0.513467,-0.392117,-0.813526,-0.638219,-0.201129,0.0,-0.337311,-0.342255,-1.009596
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.617861,0.056993,-1.155898,0.167872,-0.60703,0.268364,-0.513467,-0.392117,-0.813526,-0.638219,-0.201129,0.0,-0.337311,-0.342255,-1.009596
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.89775,-0.653417,2.69532,-0.618793,-0.60703,-0.394811,-0.591857,-0.392117,-0.791743,-0.638219,-0.201129,0.0,-0.337311,0.188703,-0.105424
0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.89775,-0.092715,2.69532,-0.618793,-0.60703,-0.394811,-0.591857,-0.392117,-0.791743,-0.638219,-0.201129,0.0,-0.337311,0.188703,-0.105424
0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.89775,0.467987,2.69532,-0.618793,-0.60703,-0.394811,-0.591857,-0.392117,-0.791743,-0.638219,-0.201129,0.0,-0.337311,0.188703,-0.105424
0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.89775,1.028689,2.69532,-0.618793,-0.60703,-0.394811,-0.591857,-0.392117,-0.791743,-0.638219,-0.201129,0.0,-0.337311,0.188703,-0.105424


In [92]:
X.write_csv(f"{resources_path}/X_nopreproc.csv")

pl.DataFrame(y).write_csv(f"{resources_path}/y_nopreproc.csv")


## Modelling for data preprocess

PCA modelling used to reduce X dimensions from 44 to 12 maintaining 95.7% of the explained variance.

KBinsDiscretizer used to bin imbalanced Y target. 15 bins, where half have less than 5 support reduced to 5 similarly supported bins

### PCA analysis

Introduction of two strength metrics certainly introduced multcolinearity, with eachother and with the elemental composision, some of whicha re indicators of strength.

PCA analysis will be done to alleiviate these factors.

In [93]:
X = pl.read_csv(f"{resources_path}/X_nopreproc.csv")

In [94]:
pca_explained_sum = {'x': [], 'pca_explained_var': []}
pca_explained_min = {'x': [], 'pca_explained_min_component': []}
for x in range(3,30):
    pca = PCA(n_components=x ,random_state=random_state)

    pca.fit(X)

    pca_explained_sum['x'].append(x)
    sum_explained = np.sum(pca.explained_variance_ratio_)
    pca_explained_sum['pca_explained_var'].append(sum_explained)

    pca_explained_min['x'].append(x)
    min_explained = np.min(pca.explained_variance_ratio_)
    pca_explained_min['pca_explained_min_component'].append(min_explained)

In [95]:
df_pca_explained = pl.DataFrame(pca_explained_sum)

### PCA component number determination

In [96]:
alt.Chart(df_pca_explained,
          title= "Total Explained Variance vs Number of PCA Features").mark_line().encode(
    alt.X('x:Q').title("PCA X"),
    alt.Y('pca_explained_var:Q')\
        .scale(domain=(0.5,1))\
            .title("Total explained variance")
)

### Value of increasing PCA components decreases between 10 and 15

In [97]:
df_pca_explained_min = pl.DataFrame(pca_explained_min)
df_pca_explained_min.filter(pl.col('x').le(15) & pl.col('x').ge(10))

x,pca_explained_min_component
i64,f64
10,0.037895
11,0.025717
12,0.015481
13,0.01162
14,0.006842
15,0.003192


### PCA component number 13 selected
Component 13 has half the explained ratio contribution as component 13, and component 14 decreased by a factor of 10

13 components has explains 96.2% of the variance.

In [98]:
df_pca_explained.filter(pl.col("x").eq(13))

x,pca_explained_var
i64,f64
13,0.961645


In [99]:
pca = PCA(n_components= 13, random_state=random_state)

pca.fit(X)

X_pca = pca.transform(X)

df_pca = pl.DataFrame(X_pca)
df_pca.head(3)

column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,column_10,column_11,column_12
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
-1.396549,0.357322,0.11312,-0.35176,-0.475294,-0.085418,-1.457699,0.792878,-0.123649,-0.967397,-0.402453,0.138054,0.316879
-1.408114,0.303587,0.136813,-0.349753,-0.573852,-0.23484,-1.270245,0.633357,-0.123784,-0.940428,-0.405776,0.141606,0.304695
-1.419679,0.249852,0.160507,-0.347746,-0.67241,-0.384263,-1.082791,0.473836,-0.123919,-0.913459,-0.409099,0.145159,0.292511


In [100]:
df_pca.describe()

statistic,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,column_10,column_11,column_12
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",1286.0,1286.0,1286.0,1286.0,1286.0,1286.0,1286.0,1286.0,1286.0,1286.0,1286.0,1286.0,1286.0
"""null_count""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",-3.8677e-17,1.9338e-17,-8.9785e-17,8.5641e-17,3.3151e-17,2.7626e-18,3.177e-17,5.5252e-18,-4.4202e-17,-4.282e-17,-1.2432e-16,1.6576e-17,-1.6576e-17
"""std""",1.653448,1.572193,1.326167,1.210351,1.076369,1.003431,0.975282,0.911914,0.811145,0.753117,0.620412,0.481368,0.417029
"""min""",-2.755651,-2.749125,-2.631238,-2.526038,-3.325919,-1.580944,-2.390954,-2.977647,-1.634804,-1.47535,-1.006469,-0.940166,-0.97989
"""25%""",-1.107186,-1.236004,-0.652254,-0.596507,-0.673487,-0.753635,-0.706409,-0.567779,-0.509937,-0.442693,-0.411362,-0.270313,-0.360951
"""50%""",-0.188674,-0.121004,-0.113746,-0.187207,-0.016828,-0.195364,-0.037295,-0.023571,0.066648,-0.027211,-0.168751,0.012771,0.094007
"""75%""",0.473879,1.091142,0.54757,0.852972,0.593213,0.34491,0.575031,0.474041,0.529772,0.249033,0.422331,0.249639,0.301865
"""max""",4.895881,3.739467,4.450363,3.646042,4.063784,3.125609,4.122047,3.998631,1.510213,2.563479,1.49625,1.012016,0.760051


## Check data balance before train-test-split
Balancing the target is unnecessary as it is a continuous variable. 

It is not effected by imbalance in the same way a categorical target is.

In [101]:
df_pca.write_csv(f'{resources_path}/df_X.csv')
pl.DataFrame(y).write_csv(f'{resources_path}/df_y.csv')


# Modelling

In [121]:
X = pl.read_csv(f'{resources_path}/df_X.csv')
y = pl.read_csv(f'{resources_path}/df_y.csv')
# Polars does not always work well with Keras
X = X.to_numpy()
y = y.to_numpy()

## Test train split

In [122]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=random_state)

## Gradient boost regression

The final hardness is a continuos varaible, that may be non-linear with respect to the supporting features. 

### This model does perform well.

I will attmpt to use neural network modelling to see if it can be improved, and to get history.


In [123]:
gb_regressor = GradientBoostingRegressor(random_state=random_state)

gb_regressor.fit(X_train, y_train)


  y = column_or_1d(y, warn=True)  # TODO: Is this still required?


In [124]:
y_pred = gb_regressor.predict(X_test)
y_pred.shape

(322,)

In [126]:
# match y_test shape to model output shape
def reshape_y_for_eval(y_test):
    return y_test.reshape(1,-1)[0]

In [127]:
print("R2 score:", round(gb_regressor.score(X_test, reshape_y_for_eval(y_test)),3))

R2 score: 0.928


### Invert the y_scaling done prior to modelling for the plotting 

In [128]:
def y_inverse_transform(y_data):
    result = y_scaler.inverse_transform(y_data.reshape(-1, 1)).reshape(1,-1)[0]
    return result

In [129]:
y_test_invscal = y_inverse_transform(y_test)
y_pred_invscal = y_inverse_transform(y_pred)

In [130]:
test_df = pl.DataFrame({'Real Final Hardness (HRC)' : y_test_invscal,
                        'Predicted Final Hardness (HRC)' : y_pred_invscal})
test_df.head(3)

Real Final Hardness (HRC),Predicted Final Hardness (HRC)
f64,f64
57.5,56.934392
60.4,60.633832
47.5,45.07313


In [245]:
base = alt.Chart(test_df).mark_point().encode(
    alt.X('Real Final Hardness (HRC)'),
    alt.Y('Predicted Final Hardness (HRC)')
)

fit = base.transform_regression(
    'Real Final Hardness (HRC)','Predicted Final Hardness (HRC)', method='linear'
).mark_line(color='red').encode()

plot = (base + fit).properties(title='GBRegression Fit')
plot.save(f"{images_path}/GBR_fit.png")
plot

## Gradient boost regression as a good fit

R2 score is 0.92 without scaling. This is quite good.

R2 score is 0.93 with y-scaling. Marginally better.

Moving to a Deep Neural Network model to try and improve performance. 

# Deep Neural Network Model

In [132]:
dim_input = X_test.shape[1]
dim_output = y.shape[1]
print("Xdim", dim_input,"| ydim", dim_output)

Xdim 13 | ydim 1


In [186]:
# https://machinelearningmastery.com/multi-label-classification-with-deep-learning/

# input
layers = [Dense(16, activation='relu'),
          # fork model to predict hardness
          Dense(units=16, activation='relu'),
        #Dense(units=64, activation='relu'),
        #Dense(units=64, activation='relu'),
          # Output layers for each label
          Dense(dim_output, name='Hardness')
]

nn_model = Sequential(layers)

nn_model.compile(optimizer=tf.keras.optimizers.Adam(0.0005),
                 loss='mean_absolute_error',
                 metrics=['mse'])

nn_history = nn_model.fit(x=X_train, y=y_train, 
                        verbose=0, 
                        epochs=100,
                        validation_split = 0.2)


In [187]:
nn_model.summary()

## Evaluate NN model

Fit and model done 5 times to get stability from the accuracies generated per model.

In [188]:
result = nn_model.evaluate(X_test, y_test, return_dict=True)
result

[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 736us/step - loss: 0.1967 - mse: 0.0803


{'loss': 0.1886691302061081, 'mse': 0.07480241358280182}

In [282]:
history_df = pl.DataFrame(nn_history.history)
history_df = history_df.with_row_index()
history_df.head(3)

index,loss,mse,val_loss,val_mse
u32,f64,f64,f64,f64
0,0.824628,1.051932,0.794238,0.971708
1,0.751501,0.917164,0.731762,0.876947
2,0.696253,0.825915,0.680474,0.80577


In [283]:
def plot_model_history(df, name='DNN', feature='mse', tuner=False):
    title = f'{feature}'.title() if feature != 'mse' else "Mean Squared Error"
    selector = ['index', feature]
    if tuner: selector.append(f'val_{feature}')

    df_reduced = df.select(selector)

    plot = alt.Chart(df_reduced).mark_line().encode(
        alt.X('index').title('Epochs'),
        alt.Y(alt.repeat('layer'), type='quantitative').\
            title(title),
        color = alt.datum(alt.repeat('layer'))
    ).repeat(
        layer = selector[1:]
    ).properties(title=f"{title} by Epoch")

    plot.show()

    plot.save(f"{images_path}/{name}_{feature}.png")


In [284]:
plot_model_history(history_df, feature='mse', name="initial_NN", tuner=True)
plot_model_history(history_df, feature='loss', name="initial_NN", tuner=True)

# Results

NN model pushes R2 down to 0.5 with very few epochs (<25).

# Build a Tuner

In [258]:
name = "DNN_tuner"
def create_hp_model(hp):
    #Activation for all hidden layers
      activation = hp.Choice('activation',['relu','tanh','sigmoid'])

      layers = [Dense(units=hp.Int('time_units1',
                             min_value=8,
                             max_value=512,
                             sampling='linear'), 
                             activation=activation),
                             # fork model to predict hardness
                  Dense(units=hp.Int('time_units2',
                             min_value=8,
                             max_value=512,
                             sampling='linear'), 
                             activation=activation),
                  Dense(units=hp.Int('time_units3',
                             min_value=8,
                             max_value=512,
                             sampling='linear'), 
                             activation=activation),
                  Dense(units=hp.Int('time_units4',
                             min_value=8,
                             max_value=512,
                             sampling='linear'), 
                             activation=activation),
                  Dense(dim_output, name='Hardness')
          ]
    
      nn_model = Sequential(layers)

      nn_model.compile(optimizer=tf.keras.optimizers.Adam(0.01),
                 loss='mean_absolute_error',
                 metrics=['mse'])
    
      return nn_model

In [259]:
tuner = kt.Hyperband(
    create_hp_model,
    objective=["val_loss","val_loss"],
    max_epochs=20,
    factor=5,
    hyperband_iterations=2,
    project_name=name,
    executions_per_trial=5,
    overwrite= True)# TODO: remove when done tuning this set and have saved DNN to github after model freeze


In [260]:
tuner.search(x=X_train, 
             y=y_train,
             epochs=250,
             validation_data=(X_test,y_test))

Trial 26 Complete [00h 00m 32s]
multi_objective: 0.20622342228889465

Best multi_objective So Far: 0.15602429509162902
Total elapsed time: 00h 10m 50s


In [266]:
best_model = tuner.get_best_models(1)[0]

In [267]:
fit_history = best_model.fit(x=X_train, 
               y=y_train, 
               verbose=0,
               epochs=50)

In [268]:
tuner.get_best_hyperparameters(1)[0].values

{'activation': 'relu',
 'time_units1': 163,
 'time_units2': 24,
 'time_units3': 471,
 'time_units4': 464,
 'tuner/epochs': 20,
 'tuner/initial_epoch': 4,
 'tuner/bracket': 1,
 'tuner/round': 1,
 'tuner/trial_id': '0017'}

In [269]:
plot_model_history(history_df, feature='mse', name=name, tuner=True)
plot_model_history(history_df, feature='loss', name=name, tuner=True)

# Optimizing optimizer

In [273]:

def create_opt_model(opt):
    #Activation for all hidden layers

    layers = [Dense(units=164, 
                    activation='relu'),
                Dense(units=24, 
                      activation='relu'),
                Dense(dim_output, name='Hardness')
          ]
    
    nn_model = Sequential(layers)

    nn_model.compile(optimizer=opt,
                 loss='mean_absolute_error',
                 metrics=['mse'])
    
    return nn_model

In [274]:
optimizers = [name.lower() for name in ["Adadelta", 
                                        "Adafactor", 
                                        "Adagrad", 
                                        "Adam", 
                                        "AdamW", 
                                        "Adamax", 
                                        "Ftrl", 
                                        "Lion", 
                                        "Nadam", 
                                        "RMSprop", 
                                        "SGD"]]

In [275]:
for opt in optimizers:
    best_opt = 'starter'
    best_mse = 4

    model = create_opt_model(opt)
    model.fit(x=X_train, 
            y=y_train,
            verbose=0,
            epochs=150)
    
    loss, mse = nn_model.evaluate(X_test, y_test)
    
    if mse < best_mse:
        best_mse = mse
        best_opt = opt

print("Best Optimizer:", best_opt, "with MSE", best_mse)

[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.1967 - mse: 0.08033
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 611us/step - loss: 0.1954 - mse: 0.0796
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 446us/step - loss: 0.1967 - mse: 0.0803
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 489us/step - loss: 0.1967 - mse: 0.0803
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 525us/step - loss: 0.1960 - mse: 0.0797
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 491us/step - loss: 0.1967 - mse: 0.0803
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 599us/step - loss: 0.1967 - mse: 0.0803
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 563us/step - loss: 0.1967 - mse: 0.0803
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 539us/step - loss: 0.1967 - mse: 0.0803
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6

## Epoch optimization

In [304]:
model = create_opt_model(tf.keras.optimizers.SGD(0.01))
fit = model.fit(x=X_train, 
         y=y_train, 
         verbose=0,
         epochs=100)



In [305]:
name= 'epoch_test'
history_df = pl.DataFrame(fit.history)
history_df = history_df.with_row_index()
history_df
plot_model_history(history_df, feature='mse', name=name)
plot_model_history(history_df, feature='loss', name=name)

# Model Selected, performance is poor

In [306]:
model = create_opt_model(opt)
fit = model.fit(x=X_train, 
        y=y_train,
        verbose=0,
        epochs=25)
name = 'final NN'
history_df = pl.DataFrame(fit.history)
history_df = history_df.with_row_index()
history_df
plot_model_history(history_df, feature='mse', name=name)
plot_model_history(history_df, feature='loss', name=name)