In [2]:
import polars as pl
import pathlib
import re

import altair as alt

# Loading Data

In [3]:
cwd = pathlib.Path.cwd()

# Tell the reader whether the kernel was started from the project root directory.
print(
    "Path is project root"
    if cwd.name == 'Mild-Steel-Tempering'
    else "Please correct current working directory to the project root"
)


Please correct current working directory to the project root


In [4]:
# Locate the project root by name among cwd and its ancestors, so the path is
# right whether the kernel runs from the root itself or from a sub-folder.
# If no directory with that name is found, fall back to the parent of cwd,
# which is what this cell always used before.
project_root = next(
    (candidate for candidate in (cwd, *cwd.parents) if candidate.name == 'Mild-Steel-Tempering'),
    cwd.parent,
)
resources_path = pathlib.PurePath(project_root, 'resources')
resources_path

PurePosixPath('/home/mox/Documents/coding_projects/bootcamp_local/Homeworks/Mild-Steel-Tempering/resources')

### Many alloy composition columns were parsed incorrectly and failed to load

Several weight-percent columns were automatically parsed as int because many of their initial rows contain "0".

All weight-percent columns should be parsed as float.

In [5]:
data_path = f"{resources_path}/Raiipa-tempering-data.csv"

# Force every weight-percent column to Float64; automatic inference picks an
# integer dtype for columns whose initial rows are "0".
schema_overrides = {
    f"{element} (%wt)": pl.Float64
    for element in ("C", "Mn", "P", "S", "Si", "Ni", "Cr", "Mo", "V", "Al", "Cu")
}

df_data = pl.read_csv(data_path, schema_overrides=schema_overrides)
df_data.glimpse()

Rows: 1466
Columns: 17
$ Source                                  <str> 'Grange and Baughman, 1956', 'Grange and Baughman, 1956', 'Grange and Baughman, 1956', 'Grange and Baughman, 1956', 'Grange and Baughman, 1956', 'Grange and Baughman, 1956', 'Grange and Baughman, 1956', 'Grange and Baughman, 1956', 'Grange and Baughman, 1956', 'Grange and Baughman, 1956'
$ Steel type                              <str> 'AISI-SAE 1026', 'AISI-SAE 1026', 'AISI-SAE 1026', 'AISI-SAE 1026', 'AISI-SAE 1026', 'AISI-SAE 1026', 'AISI-SAE 1026', 'AISI-SAE 1026', 'AISI-SAE 1026', 'AISI-SAE 1026'
$ Initial hardness (HRC) - post quenching <str> '?', '?', '?', '?', '?', '?', '?', '?', '?', '?'
$ Tempering time (s)                      <i64> 600, 600, 600, 600, 600, 600, 600, 600, 600, 600
$ Tempering temperature (ºC)              <f64> 204.4, 260.0, 315.6, 371.1, 426.7, 482.2, 537.8, 593.3, 648.9, 704.4
$ C (%wt)                                 <f64> 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25
$ Mn 

In [6]:
# Keep the original column headers (they carry the units and full descriptions)
# before renaming; the rename mapping built later is keyed by these names.
initial_column_names = df_data.columns
initial_column_names

['Source',
 'Steel type',
 'Initial hardness (HRC) - post quenching',
 'Tempering time (s)',
 'Tempering temperature (ºC)',
 'C (%wt)',
 'Mn (%wt)',
 'P (%wt)',
 'S (%wt)',
 'Si (%wt)',
 'Ni (%wt)',
 'Cr (%wt)',
 'Mo (%wt)',
 'V (%wt)',
 'Al (%wt)',
 'Cu (%wt)',
 'Final hardness (HRC) - post tempering']

# Cleaning
Many columns need renaming for ease of manipulation.

Some columns need their data types corrected:
 'Initial hardness (HRC) - post quenching' is read as a string because "?" is used as the NA value; it needs to be converted to a numeric type.


In [7]:
def _clean_column_name(og_name):
    """Turn one original CSV header into a short, snake_case-style column name.

    Rules:
    - Alloy composition headers such as "C (%wt)" become the bare element
      symbol ("C"), case preserved; no unit suffix is added.
    - Any other header is lower-cased and has the " - " filler removed. If it
      carries a unit in parentheses, e.g. "(HRC)", the parenthesised part is
      removed and the unit is appended as "_<unit>" in its original case.
    - Non-ASCII characters are dropped and spaces become underscores.

    Args:
        og_name: Original column header.

    Returns:
        The cleaned column name.
    """
    if re.search(r"\(%wt\)", og_name):
        # Weight-percent column: keep only the element symbol and skip unit handling.
        new_name = og_name.split(' ')[0]
    else:
        # Remove the " - " filler used in the hardness headers, then lowercase.
        new_name = og_name.replace(' - ', '').lower()
        # Read the unit from the original header so its case is preserved.
        unit_match = re.search(r'\((\w+)\)', og_name)
        if unit_match is not None:
            # Remove the "(unit)" text and the whitespace it leaves at the end,
            # then append the unit as a suffix.
            new_name = re.sub(r"\(.+\)", "", new_name).rstrip()
            new_name = f"{new_name}_{unit_match.group(1)}"

    # Strip away special (non-ASCII) characters, e.g. the degree-like sign in "ºC".
    new_name = new_name.encode("ascii", errors="ignore").decode()
    # Replace spaces with underscores.
    return new_name.replace(' ', '_')


# Mapping of original header -> cleaned name, in the original column order.
dict_new_cnames = {og_name: _clean_column_name(og_name) for og_name in initial_column_names}
dict_new_cnames


{'Source': 'source',
 'Steel type': 'steel_type',
 'Initial hardness (HRC) - post quenching': 'initial_hardness_post_quenching_HRC',
 'Tempering time (s)': 'tempering_time_s',
 'Tempering temperature (ºC)': 'tempering_temperature_C',
 'C (%wt)': 'C',
 'Mn (%wt)': 'Mn',
 'P (%wt)': 'P',
 'S (%wt)': 'S',
 'Si (%wt)': 'Si',
 'Ni (%wt)': 'Ni',
 'Cr (%wt)': 'Cr',
 'Mo (%wt)': 'Mo',
 'V (%wt)': 'V',
 'Al (%wt)': 'Al',
 'Cu (%wt)': 'Cu',
 'Final hardness (HRC) - post tempering': 'final_hardness_post_tempering_HRC'}

In [8]:
# Apply the original-name -> cleaned-name mapping built in the previous cell.
df_clean_cnames = df_data.rename(dict_new_cnames)

## Clean data types and column values

In [9]:
# Number of rows whose initial hardness is the "?" placeholder (unknown value).
# Summing the boolean mask gives 0 when no "?" is present, whereas calling
# .item() on a filtered value_counts() frame raises when that frame is empty.
count_of_qmark = (df_clean_cnames['initial_hardness_post_quenching_HRC'] == "?").sum()


In [10]:
# Fraction (0-1) of rows with an unknown initial hardness, printed as a whole percent.
percent_intial_hardness_unknown = count_of_qmark / len(df_clean_cnames['initial_hardness_post_quenching_HRC'])
print("Unknown initial hardness: {:.0f}%".format(round(percent_intial_hardness_unknown, 2) * 100))


Unknown initial hardness: 65%


In [11]:
# Preview the frame with the cleaned column names; initial hardness is still a string here.
df_clean_cnames

source,steel_type,initial_hardness_post_quenching_HRC,tempering_time_s,tempering_temperature_C,C,Mn,P,S,Si,Ni,Cr,Mo,V,Al,Cu,final_hardness_post_tempering_HRC
str,str,str,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""Grange and Baughman, 1956""","""AISI-SAE 1026""","""?""",600,204.4,0.25,0.79,0.012,0.026,0.11,0.0,0.0,0.0,0.0,0.0,0.0,50.6
"""Grange and Baughman, 1956""","""AISI-SAE 1026""","""?""",600,260.0,0.25,0.79,0.012,0.026,0.11,0.0,0.0,0.0,0.0,0.0,0.0,48.3
"""Grange and Baughman, 1956""","""AISI-SAE 1026""","""?""",600,315.6,0.25,0.79,0.012,0.026,0.11,0.0,0.0,0.0,0.0,0.0,0.0,43.7
"""Grange and Baughman, 1956""","""AISI-SAE 1026""","""?""",600,371.1,0.25,0.79,0.012,0.026,0.11,0.0,0.0,0.0,0.0,0.0,0.0,40.5
"""Grange and Baughman, 1956""","""AISI-SAE 1026""","""?""",600,426.7,0.25,0.79,0.012,0.026,0.11,0.0,0.0,0.0,0.0,0.0,0.0,37.3
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Hollomon and Jaffe, 1945""","""1,15%C - plain carbon steel""","""64.5""",86400,300.0,1.15,0.58,0.012,0.021,0.09,0.0,0.01,0.0,0.0,0.0,0.0,50.5
"""Hollomon and Jaffe, 1945""","""1,15%C - plain carbon steel""","""64.5""",86400,400.0,1.15,0.58,0.012,0.021,0.09,0.0,0.01,0.0,0.0,0.0,0.0,42.0
"""Hollomon and Jaffe, 1945""","""1,15%C - plain carbon steel""","""64.5""",86400,500.0,1.15,0.58,0.012,0.021,0.09,0.0,0.01,0.0,0.0,0.0,0.0,32.0
"""Hollomon and Jaffe, 1945""","""1,15%C - plain carbon steel""","""64.5""",86400,600.0,1.15,0.58,0.012,0.021,0.09,0.0,0.01,0.0,0.0,0.0,0.0,23.0


In [12]:
# Three most frequent raw (string) values of the initial hardness column.
# NOTE(review): rows with equal counts appear not to be ordered deterministically,
# so the third row may differ between runs.
(
    df_clean_cnames["initial_hardness_post_quenching_HRC"]
    .value_counts()
    .sort('count', descending=True)
    .head(3)
)

initial_hardness_post_quenching_HRC,count
str,u32
"""?""",949
"""66.5""",90
"""61.6""",51


In [13]:
# Convert initial hardness to Float64. The cast is non-strict, so "?" (and any
# other value that cannot be parsed as a number) becomes null instead of raising.
df_clean = df_clean_cnames.with_columns(
    pl.col('initial_hardness_post_quenching_HRC').cast(pl.Float64, strict=False)
)

In [14]:
# Same top-3 check after the cast: the null count should equal the earlier "?" count.
(
    df_clean["initial_hardness_post_quenching_HRC"]
    .value_counts()
    .sort('count', descending=True)
    .head(3)
)

initial_hardness_post_quenching_HRC,count
f64,u32
,949
66.5,90
63.3,51


In [15]:
# Preview after the cast: initial hardness is now f64, with nulls where "?" was.
df_clean

source,steel_type,initial_hardness_post_quenching_HRC,tempering_time_s,tempering_temperature_C,C,Mn,P,S,Si,Ni,Cr,Mo,V,Al,Cu,final_hardness_post_tempering_HRC
str,str,f64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""Grange and Baughman, 1956""","""AISI-SAE 1026""",,600,204.4,0.25,0.79,0.012,0.026,0.11,0.0,0.0,0.0,0.0,0.0,0.0,50.6
"""Grange and Baughman, 1956""","""AISI-SAE 1026""",,600,260.0,0.25,0.79,0.012,0.026,0.11,0.0,0.0,0.0,0.0,0.0,0.0,48.3
"""Grange and Baughman, 1956""","""AISI-SAE 1026""",,600,315.6,0.25,0.79,0.012,0.026,0.11,0.0,0.0,0.0,0.0,0.0,0.0,43.7
"""Grange and Baughman, 1956""","""AISI-SAE 1026""",,600,371.1,0.25,0.79,0.012,0.026,0.11,0.0,0.0,0.0,0.0,0.0,0.0,40.5
"""Grange and Baughman, 1956""","""AISI-SAE 1026""",,600,426.7,0.25,0.79,0.012,0.026,0.11,0.0,0.0,0.0,0.0,0.0,0.0,37.3
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Hollomon and Jaffe, 1945""","""1,15%C - plain carbon steel""",64.5,86400,300.0,1.15,0.58,0.012,0.021,0.09,0.0,0.01,0.0,0.0,0.0,0.0,50.5
"""Hollomon and Jaffe, 1945""","""1,15%C - plain carbon steel""",64.5,86400,400.0,1.15,0.58,0.012,0.021,0.09,0.0,0.01,0.0,0.0,0.0,0.0,42.0
"""Hollomon and Jaffe, 1945""","""1,15%C - plain carbon steel""",64.5,86400,500.0,1.15,0.58,0.012,0.021,0.09,0.0,0.01,0.0,0.0,0.0,0.0,32.0
"""Hollomon and Jaffe, 1945""","""1,15%C - plain carbon steel""",64.5,86400,600.0,1.15,0.58,0.012,0.021,0.09,0.0,0.01,0.0,0.0,0.0,0.0,23.0


# Making steel types values searchable
Steel types that do not include "AISI" and a grade code are not searchable in steel databases and need manual renaming.
Example: "0,74%C - plain carbon steel" is an AISI 1074 steel, confirmed via elemental composition on https://www.azom.com/article.aspx?ArticleID=6558


In [16]:
# A steel is identified by its label together with the publication it came from.
steel_identifiers = ["steel_type", "source"]
# Shape of the distinct (steel_type, source) combinations.
df_clean.select(steel_identifiers).unique().shape

(36, 2)

In [17]:
# Distinct (steel_type, source) pairs whose label does not contain "AISI".
not_searchable = (
    df_clean.select(steel_identifiers)
    .unique()
    .filter(~pl.col('steel_type').str.contains("AISI"))
)


In [18]:
# Keep only the rows whose (steel_type, source) pair is in not_searchable.
# The right-hand frame holds just the join keys, so no extra columns are added.
df_not_searchable = df_clean.join(not_searchable, on=steel_identifiers, how="inner")


In [19]:
# One row per distinct composition of the non-searchable steels, for manual
# matching against published grade compositions.
df_not_searchable_alloy_ele = df_not_searchable.select(
    ["source", "steel_type", "C", "Mn", "P", "S", "Si", "Ni", "Cr", "Mo", "V", "Al", "Cu"]
).unique()
df_not_searchable_alloy_ele

source,steel_type,C,Mn,P,S,Si,Ni,Cr,Mo,V,Al,Cu
str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""Hollomon and Jaffe, 1945""","""0,56%C - plain carbon steel""",0.56,0.54,0.012,0.015,0.18,0.0,0.02,0.0,0.0,0.0,0.0
"""Hollomon and Jaffe, 1945""","""0,98%C - plain carbon steel""",0.98,0.3,0.007,0.021,0.3,0.0,0.03,0.0,0.0,0.0,0.055
"""Hollomon and Jaffe, 1945""","""1,15%C - plain carbon steel""",1.15,0.58,0.012,0.021,0.09,0.0,0.01,0.0,0.0,0.0,0.0
"""Hollomon and Jaffe, 1945""","""0,74%C - plain carbon steel""",0.74,0.66,0.009,0.021,0.18,0.0,0.01,0.0,0.0,0.0,0.0
"""Hollomon and Jaffe, 1945""","""0,89%C - plain carbon steel""",0.89,0.55,0.012,0.02,0.06,0.0,0.01,0.0,0.0,0.0,0.08
"""Grange and Baughman, 1956""","""Nitriding Steel """,0.41,0.57,0.017,0.005,0.24,0.17,1.57,0.36,0.0,1.26,0.0
"""Hollomon and Jaffe, 1945""","""0,31%C - plain carbon steel""",0.31,0.52,0.007,0.026,0.1,0.0,0.015,0.0,0.0,0.0,0.06


In [20]:
# Manual mapping from the dataset's steel labels to searchable grade names.
# 'Non-searchable' marks steels for which no usable database entry was found.
#
# 1945 source:
# I cannot access the 1945 paper for these steels.
# I assume the alloying elements are a bit unusual due to wartime shortages
# and potential contamination between alloys due to the push to increase production.
dict_rename = {'0,98%C - plain carbon steel': 'AISI 1095', # copper and Cr, probably intentional?
                   '1,15%C - plain carbon steel': 'AISI 1095', # small chromium impurity?
                   '0,74%C - plain carbon steel': 'AISI 1074 Carbon Steel', 
                   '0,56%C - plain carbon steel': 'AISI 1055', # small chromium impurity?
                   '0,89%C - plain carbon steel': 'AISI 1090', # Cu discounted as impurity
                   'Nitriding Steel ': 'Non-searchable', # note: the key ends with a trailing space, as in the data
                   '0,31%C - plain carbon steel': 'AISI 1030', # AISI 1030, Cr impurity
                   "AISI-SAE 9264" : 'AISI-SAE 9254', # access limited, not in AZoM. NOTE(review): key is 9264 but value is 9254 - confirm this substitution is intended
                   "AISI-SAE 2340" : 'Non-searchable', # access limited, may be SAE J2340, not in AZoM
                   "AISI-SAE 3140" : 'Non-searchable', # Not in AZoM; elemental match to SAE 3140: https://www.steel-grades.com/metals/18/5155/-SAE-3140.html
                   "AISI-SAE 4068" : 'Non-searchable', # access limited, not in AZoM; elemental match to SAE 4068: https://www.steel-grades.com/Steel-Grades/Carbon-Steel/SAE-4068-.html
                   "AISI-SAE 4640" : "AISI 4640",
                   "AISI-SAE 4047" : "AISI 4047",
                   "AISI-SAE 1049" : "AISI 1049",
                   "AISI-SAE 6145" : "AISI 6145",
                   "AISI-SAE E52100" : "AISI 52100"}

In [21]:
# dict_rename_bad.keys()

In [22]:
# Apply the manual renames, keep each resulting name once, then drop the
# 'Non-searchable' placeholder and name the series 'searchable'.
series_searchable_steel = df_clean['steel_type'].replace(dict_rename).unique()
series_searchable_steel = (
    series_searchable_steel
    .filter(series_searchable_steel != 'Non-searchable')
    .rename('searchable')
)
series_searchable_steel

searchable
str
"""AISI-SAE 1026"""
"""AISI 52100"""
"""AISI-SAE 5140"""
"""AISI-SAE 1080"""
"""AISI 1055"""
…
"""AISI 1030"""
"""AISI 1090"""
"""AISI-SAE 1335"""
"""AISI 1049"""


In [23]:
# Distinct (time, temperature) tempering conditions present in the data.
# TODO: tempering temperatures may need binning.
# Modelling plan: random forest multi-output classifier as the baseline,
# then a multi-class neural network as the improvement.
df_clean.select(["tempering_time_s", "tempering_temperature_C"]).unique()

tempering_time_s,tempering_temperature_C
i64,f64
600,648.9
300,648.9
9000,406.0
14400,704.4
3600,315.6
…,…
3600,704.4
600,371.1
79200,260.0
14400,648.9


In [24]:
# path_save_searchable = f'{resources_path}/searchable_steels.csv'
# pl.DataFrame(series_searchable_steel).write_csv(path_save_searchable)

In [25]:
# Spot-check the rows whose steel_type contains "4640".
df_clean.filter(pl.col('steel_type').str.contains('4640'))

source,steel_type,initial_hardness_post_quenching_HRC,tempering_time_s,tempering_temperature_C,C,Mn,P,S,Si,Ni,Cr,Mo,V,Al,Cu,final_hardness_post_tempering_HRC
str,str,f64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""Grange and Baughman, 1956""","""AISI-SAE 4640""",,40,426.7,0.36,0.63,0.018,0.021,0.19,1.84,0.06,0.23,0.0,0.0,0.0,45.2
"""Grange and Baughman, 1956""","""AISI-SAE 4640""",,40,537.8,0.36,0.63,0.018,0.021,0.19,1.84,0.06,0.23,0.0,0.0,0.0,39.2
"""Grange and Baughman, 1956""","""AISI-SAE 4640""",,40,593.3,0.36,0.63,0.018,0.021,0.19,1.84,0.06,0.23,0.0,0.0,0.0,36.7
"""Grange and Baughman, 1956""","""AISI-SAE 4640""",,40,648.9,0.36,0.63,0.018,0.021,0.19,1.84,0.06,0.23,0.0,0.0,0.0,33.5
"""Grange and Baughman, 1956""","""AISI-SAE 4640""",,120,426.7,0.36,0.63,0.018,0.021,0.19,1.84,0.06,0.23,0.0,0.0,0.0,43.3
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Grange and Baughman, 1956""","""AISI-SAE 4640""",,115200,204.4,0.36,0.63,0.018,0.021,0.19,1.84,0.06,0.23,0.0,0.0,0.0,50.6
"""Grange and Baughman, 1956""","""AISI-SAE 4640""",,115200,315.6,0.36,0.63,0.018,0.021,0.19,1.84,0.06,0.23,0.0,0.0,0.0,45.4
"""Grange and Baughman, 1956""","""AISI-SAE 4640""",,115200,426.7,0.36,0.63,0.018,0.021,0.19,1.84,0.06,0.23,0.0,0.0,0.0,38.6
"""Grange and Baughman, 1956""","""AISI-SAE 4640""",,115200,537.8,0.36,0.63,0.018,0.021,0.19,1.84,0.06,0.23,0.0,0.0,0.0,30.9


In [26]:
# Rows tempered for 115200 s (= 32 h).
df_clean.filter(pl.col('tempering_time_s') == 115200)

source,steel_type,initial_hardness_post_quenching_HRC,tempering_time_s,tempering_temperature_C,C,Mn,P,S,Si,Ni,Cr,Mo,V,Al,Cu,final_hardness_post_tempering_HRC
str,str,f64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""Grange and Baughman, 1956""","""AISI-SAE 1335""",,115200,204.4,0.35,1.85,0.02,0.026,0.19,0.01,0.03,0.0,0.0,0.0,0.0,50.3
"""Grange and Baughman, 1956""","""AISI-SAE 1335""",,115200,315.6,0.35,1.85,0.02,0.026,0.19,0.01,0.03,0.0,0.0,0.0,0.0,46.5
"""Grange and Baughman, 1956""","""AISI-SAE 1335""",,115200,426.7,0.35,1.85,0.02,0.026,0.19,0.01,0.03,0.0,0.0,0.0,0.0,36.4
"""Grange and Baughman, 1956""","""AISI-SAE 1335""",,115200,537.8,0.35,1.85,0.02,0.026,0.19,0.01,0.03,0.0,0.0,0.0,0.0,24.2
"""Grange and Baughman, 1956""","""AISI-SAE 1335""",,115200,648.9,0.35,1.85,0.02,0.026,0.19,0.01,0.03,0.0,0.0,0.0,0.0,12.4
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Grange and Baughman, 1956""","""AISI-SAE 6145""",,115200,204.4,0.43,0.74,0.019,0.024,0.23,0.06,0.92,0.0,0.16,0.0,0.0,52.9
"""Grange and Baughman, 1956""","""AISI-SAE 6145""",,115200,315.6,0.43,0.74,0.019,0.024,0.23,0.06,0.92,0.0,0.16,0.0,0.0,49.3
"""Grange and Baughman, 1956""","""AISI-SAE 6145""",,115200,426.7,0.43,0.74,0.019,0.024,0.23,0.06,0.92,0.0,0.16,0.0,0.0,42.5
"""Grange and Baughman, 1956""","""AISI-SAE 6145""",,115200,537.8,0.43,0.74,0.019,0.024,0.23,0.06,0.92,0.0,0.16,0.0,0.0,35.1


In [27]:
# Reference list of the cleaned column names (a subset is selected in the next cell).
df_clean.columns

['source',
 'steel_type',
 'initial_hardness_post_quenching_HRC',
 'tempering_time_s',
 'tempering_temperature_C',
 'C',
 'Mn',
 'P',
 'S',
 'Si',
 'Ni',
 'Cr',
 'Mo',
 'V',
 'Al',
 'Cu',
 'final_hardness_post_tempering_HRC']

In [28]:
# Keep the steel label, the eleven composition columns and the final hardness.
# Column order matters: the next cell slices the element names by position.
df = df_clean.select(
    [
        'steel_type',
        'C', 'Mn', 'P', 'S', 'Si', 'Ni', 'Cr', 'Mo', 'V', 'Al', 'Cu',
        'final_hardness_post_tempering_HRC',
    ]
)

In [29]:
# Element columns = everything except the first column (steel_type) and the
# last one (final hardness). This relies on the column order used to build df.
alloying_elements = df.columns[1:-1]
alloying_elements

['C', 'Mn', 'P', 'S', 'Si', 'Ni', 'Cr', 'Mo', 'V', 'Al', 'Cu']

In [31]:
# One box-plot row per alloying element: the element's weight percent on x,
# final (post-tempering) hardness on y.
# NOTE(review): the title mentions "Temp", but df holds no temperature column -
# confirm the intended title or add the temperature encoding.
# NOTE(review): fields given via alt.repeat presumably get no inferred encoding
# type; consider passing type='quantitative' to alt.X - confirm against the
# rendered chart.
alt.Chart(df).mark_boxplot().encode(
    alt.X(alt.repeat('row')),
    alt.Y('final_hardness_post_tempering_HRC')
    # alt.Color('steel_type')
).repeat(
        row = alloying_elements,
    ).properties(title="Alloying element effect on hardness by Temp")