# Continusing processing without Pint

In [245]:
import polars as pl
import numpy as np
import pathlib
import re

In [246]:
cwd = pathlib.Path.cwd()

if cwd.name == 'Mild-Steel-Tempering':
    print("Path is projregexct current working directory to the project root")

Path is projregexct current working directory to the project root


In [247]:
resources_path = pathlib.PurePath(pathlib.PurePath(cwd), 'resources')
resources_path

PurePosixPath('/home/mox/Documents/coding_projects/bootcamp_local/Homeworks/Mild-Steel-Tempering/resources')

In [248]:
data_path = f"{resources_path}/scraped_properties.csv"
df_scraped = pl.read_csv(data_path)
df_scraped.glimpse()

Rows: 28
Columns: 6
$ steel           <str> 'AISI 6145', 'AISI-SAE 4027', 'AISI 1090', 'AISI 4640', 'AISI-SAE 1080', 'AISI 4047', 'AISI-SAE 1030', 'AISI-SAE 5140', 'AISI 1095', 'AISI-SAE 1050'
$ density         <str> '7.75', '7.85', '7.85', '7.75', '7.7-8.03', '7.85', '7.85', '7.85', '7.85', '7.85'
$ units_density   <str> 'g/cm3', 'g/cm3', 'g/cm3', 'g/cm3', 'g/cm3', 'g/cm3', 'g/cc', 'g/cm3', 'g/cm3', 'g/cm3'
$ poissons_ratio  <str> '0.27-030', '0.27-0.30', '0.27-0.30', '0.27-030', '0.27-0.30', '0.27-0.30', '0.27-0.30', '0.27-0.30', '0.27-0.30', '0.27-0.30'
$ elastic_modulus <f64> 7.85, 7.85, 7.85, 7.85, 7.85, 7.85, 7.85, 7.85, 7.85, 7.85
$ units_em        <str> 'g/cm3', 'g/cm3', 'g/cm3', 'g/cm3', 'g/cm3', 'g/cm3', 'g/cm3', 'g/cm3', 'g/cm3', 'g/cm3'



In [249]:
df_scraped.select(['units_density', 'units_em']).unique()

units_density,units_em
str,str
"""g/cc""","""g/cm3"""
"""g/cm3""","""g/cm3"""
"""g/""","""g/cm3"""


All units in units_density are a 1:1 conversion with g/cm3

g/mL (water) = g/cc = g/cm3 = grams per cubic centimeter

In [250]:
df_scraped_unit_header = df_scraped.select(['steel', 
                                            'density', 
                                            'poissons_ratio', 
                                            'elastic_modulus']).rename({'density' : 'density_g_per_cm3',
                                                                        'elastic_modulus' : 'elastic_modulus_g_per_cm3'})

print(df_scraped_unit_header.shape)    
df_scraped_unit_header.head()

(28, 4)


steel,density_g_per_cm3,poissons_ratio,elastic_modulus_g_per_cm3
str,str,str,f64
"""AISI 6145""","""7.75""","""0.27-030""",7.85
"""AISI-SAE 4027""","""7.85""","""0.27-0.30""",7.85
"""AISI 1090""","""7.85""","""0.27-0.30""",7.85
"""AISI 4640""","""7.75""","""0.27-030""",7.85
"""AISI-SAE 1080""","""7.7-8.03""","""0.27-0.30""",7.85


In [251]:
df_scraped_unit_header.describe()

statistic,steel,density_g_per_cm3,poissons_ratio,elastic_modulus_g_per_cm3
str,str,str,str,f64
"""count""","""28""","""28""","""28""",28.0
"""null_count""","""0""","""0""","""0""",0.0
"""mean""",,,,7.85
"""std""",,,,3.6179e-15
"""min""","""AISI 1030""","""7.7-8.03""","""0.27 – 0.30""",7.85
"""25%""",,,,7.85
"""50%""",,,,7.85
"""75%""",,,,7.85
"""max""","""AISI-SAE 6150""","""7.87""","""0.29""",7.85


### Many values have leading zero but no decimal resulting in parsing as whole number not decimal value

In [252]:
def if_no_decimal_point(num):
    #If leading 0 followed by number then replace leading 0 with "0."
    regex = r"^0\d"
    if re.search(regex, num):
        #replace the 
        num = re.sub("^0", "0.", num)
    return np.float64(num)

def strip_string(string):
    return string.strip()


In [253]:
df_scraped_clean = df_scraped_unit_header.clone()
s_split = df_scraped_clean['density_g_per_cm3'].str.split('-')
s_numeric_l = s_split.map_elements(lambda x: list(map(if_no_decimal_point, x)))
s_meand = s_numeric_l.list.mean()

df_scraped_clean.with_columns(density_g_per_cm3 = s_meand)
# AISI-SAE 5160 and AISI-SAE 6150 use a different dash than the others for the pr range
# Discovered when 2 nulls showed up after processing
s_same_dashes = df_scraped_clean["poissons_ratio"].str.replace('–', '-')
s_split = s_same_dashes.str.split('-')
s_numeric_l = s_split.map_elements(lambda x: list(map(if_no_decimal_point, x)))
s_meanp = s_numeric_l.list.mean()

df_scraped_clean = df_scraped_clean.with_columns(poissons_ratio = s_meanp, density_g_per_cm3 = s_meand)
df_scraped_clean.describe()

  s_numeric_l = s_split.map_elements(lambda x: list(map(if_no_decimal_point, x)))
  s_numeric_l = s_split.map_elements(lambda x: list(map(if_no_decimal_point, x)))


statistic,steel,density_g_per_cm3,poissons_ratio,elastic_modulus_g_per_cm3
str,str,f64,f64,f64
"""count""","""28""",28.0,28.0,28.0
"""null_count""","""0""",0.0,0.0,0.0
"""mean""",,7.844179,0.285179,7.85
"""std""",,0.028724,0.000945,3.6179e-15
"""min""","""AISI 1030""",7.75,0.285,7.85
"""25%""",,7.85,0.285,7.85
"""50%""",,7.85,0.285,7.85
"""75%""",,7.85,0.285,7.85
"""max""","""AISI-SAE 6150""",7.87,0.29,7.85


# Iterations on regex

In [187]:
s_split = df_scraped_unit_header['poissons_ratio'].str.split('-')

def if_no_decimal_point(num):
    #If leading 0 followed by number then replace leading 0 with "0."
    regex = r"^0\d"
    if re.search(regex, num):
        #replace the 
        num = re.sub("^0", "0.", num)
    return np.float64(num)

s_numeric_l = s_split.map_elements(lambda x: list(map(if_no_decimal_point, x)))


  s_numeric_l = s_split.map_elements(lambda x: list(map(if_no_decimal_point, x)))


In [174]:
def if_no_decimal(num):
    #If leading 0 followed by number then replace leading 0 with "0."
    regex = r"^0\d"
    if re.search(regex, num):
        #replace the 
        num = re.sub("^0", "0.", num)
    return np.float64(num)

lst = ["030","0.40",".5","13"]

list(map(if_no_decimal, lst))

[np.float64(0.3), np.float64(0.4), np.float64(0.5), np.float64(13.0)]

In [168]:
regex = r"^0\d"

for string in ["040","0.40","40","04",".4"]:
    print(re.search(regex, string))


<re.Match object; span=(0, 2), match='04'>
None
None
<re.Match object; span=(0, 2), match='04'>
None


# Pint does not functionn as desired for units with a denominator and the documentation is out of date. 

Scrapping that section of the work. Units can be done manually

In [45]:
!pip install pandas pint_pandas

Collecting pint_pandas
  Downloading Pint_Pandas-0.6.2-py3-none-any.whl.metadata (3.5 kB)
Downloading Pint_Pandas-0.6.2-py3-none-any.whl (27 kB)
Installing collected packages: pint_pandas
Successfully installed pint_pandas-0.6.2


In [10]:
import pathlib

In [92]:
# New imports
import pandas as pd
import pint
import pint_pandas
pint_pandas.show_versions()
ureg = pint.UnitRegistry()
pint.UnitRegistry().default_system = 'cgs'

{'numpy': '2.1.1', 'pandas': '2.2.3', 'pint': '0.24.3', 'pint_pandas': '0.6.2'}


In [11]:
cwd = pathlib.Path.cwd()

if cwd.name == 'Mild-Steel-Tempering':
    print("Path is project root")
else:
    print("Please correct current working directory to the project root")

Path is project root


In [12]:
resources_path = pathlib.PurePath(pathlib.PurePath(cwd), 'resources')
resources_path

PurePosixPath('/home/mox/Documents/coding_projects/bootcamp_local/Homeworks/Mild-Steel-Tempering/resources')

In [16]:
data_path = f"{resources_path}/scraped_properties.csv"
df_data = pd.read_csv(data_path)
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 0 to 27
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   steel            28 non-null     object 
 1   density          28 non-null     object 
 2   units_density    28 non-null     object 
 3   poissons_ratio   28 non-null     object 
 4   elastic_modulus  28 non-null     float64
 5   units_em         28 non-null     object 
dtypes: float64(1), object(5)
memory usage: 1.4+ KB


In [19]:
df_data['units_density'].unique()

array(['g/cm3', 'g/cc', 'g/'], dtype=object)

In [20]:
df_data['units_em'].unique()

array(['g/cm3'], dtype=object)

# Clean units

Unnecessary step all units are variations of grams per cubic centimeter

This step is included to provide a guideline on how to unify units if there were different scales or to conversions to be done

## pint can not be used for unit type conversion in polars. 

Pandas must be used. 

All units in units_density are a 1:1 conversion with g/cm3

g/mL (water) = g/cc = g/cm3 = grams per cubic centimeter

In [156]:
# pint does not recognize cm3 a a unit. cc is used. could define cm3 as cc
df_data['units_density'] = "g/cm3"

In [157]:
df_data['units_density'].unique()

array(['g/cm3'], dtype=object)

In [158]:
g_per_cc = ureg.g / ureg.cc
g_per_cc

In [99]:
ureg.define('density = g / cc')

In [100]:
pint_density = df_data['density'].astype('pint[density]')
pint_density

UndefinedUnitError: 'density' is not defined in the unit registry

In [93]:
pint_vol = 500 * ureg.cc
pint_vol

In [94]:
pint_mass = pint_density * pint_vol

ValueError: Cannot operate with Quantity and Quantity of different registries.

## Demonstrate a unit conversion using pint

g to kg should divide the values by 1000

then cc to L should multiply by 1000

In [84]:
pint_density.pint.to("kg / cc")

TypeError: can't multiply sequence by non-int of type 'float'

# Experimenting with pint and pandas


In [55]:

df = pd.DataFrame({
    "density": pd.Series([1, 2, 2, 3], dtype="pint[g / cc]"),
    "angular_velocity": pd.Series([1, 2, 2, 3], dtype="pint[cc]"),
})
df



Unnamed: 0,density,angular_velocity
0,1,1
1,2,2
2,2,2
3,3,3


In [65]:

df = pd.DataFrame({
    "density": pd.Series([1, 2, 2, 3], dtype="pint[g / cc]"),
    "vol": pd.Series([1, 2, 2, 3], dtype="pint[cc]"),
})
df


Unnamed: 0,density,vol
0,1,1
1,2,2
2,2,2
3,3,3


In [61]:
df['grams'] = df['density'] * df['vol']
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype                        
---  ------   --------------  -----                        
 0   density  4 non-null      pint[gram / cubic_centimeter]
 1   vol      4 non-null      pint[cubic_centimeter]       
 2   grams    4 non-null      pint[gram]                   
dtypes: pint[cubic_centimeter](1), pint[gram / cubic_centimeter](1), pint[gram](1)
memory usage: 240.0 bytes
