In [76]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib
import xgboost as xgb



In [77]:
# Load the dataset
  # Update with your file path
# filepath = "C:\\Users\\Admin\\Downloads\\RMIPRMODEL\\SAMPLE_DATASET.xlsx" 

filepath = "C:\\Users\\Admin\\Downloads\\RMIPRMODEL\\finaldata.xlsx"

df = pd.read_excel(filepath)

# Display the first few rows
print(df.head())


       Environment    Material_Group Material_Family Material    Rate     UNS  \
0  Phosphoric Acid  Stainless steels      Austenitic      316  0.0025  S31600   
1  Phosphoric Acid  Stainless steels      Austenitic      316     NaN  S31600   
2  Phosphoric Acid  Stainless steels      Austenitic      316  0.0050  S31600   
3  Phosphoric Acid  Stainless steels      Austenitic      316  0.0020  S31600   
4  Phosphoric Acid  Stainless steels      Austenitic      316  0.0050  S31600   

   Condition/Comment  Concentration  Temperature_degC  Temperature_degF  \
0                1.0           10.0                93               200   
1                1.0           10.0               100               212   
2                1.0           10.0                93               200   
3                1.0           10.0                93               200   
4                0.0           10.0                93               200   

   Duration_days  
0          16.00  
1           0.08  
2    

In [78]:
!pip install openpyxl

Defaulting to user installation because normal site-packages is not writeable


In [79]:
# Overview of the dataset
print(df.info())
# Descriptive statistics
print(df.describe())

# Check for missing values
print(df.isnull().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Environment        175 non-null    object 
 1   Material_Group     175 non-null    object 
 2   Material_Family    175 non-null    object 
 3   Material           175 non-null    object 
 4   Rate               155 non-null    float64
 5   UNS                172 non-null    object 
 6   Condition/Comment  142 non-null    float64
 7   Concentration      167 non-null    float64
 8   Temperature_degC   175 non-null    int64  
 9   Temperature_degF   175 non-null    int64  
 10  Duration_days      159 non-null    float64
dtypes: float64(4), int64(2), object(5)
memory usage: 15.2+ KB
None
             Rate  Condition/Comment  Concentration  Temperature_degC  \
count  155.000000         142.000000     167.000000        175.000000   
mean     5.649330           1.284507      64.976048      

In [80]:
# Ensure correct column names are used
df.columns = df.columns.str.strip()  # Remove any extra spaces around column names

# Verify all required columns are present
required_columns = ['Environment', 'Material_Group', 'Material_Family', 'Material', 
                    'Rate', 'UNS', 'Condition/Comment', 
                    'Concentration', 'Temperature_degC', 
                    'Temperature_degF', 'Duration_days']

missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    print(f"Missing columns in dataset: {missing_columns}")
else:
    print("All required columns are present.")

# Drop rows with missing target values
df = df.dropna(subset=['Rate'])

# Impute missing predictor values
imputer = SimpleImputer(strategy='most_frequent')
df[required_columns] = imputer.fit_transform(df[required_columns])

# Encoding categorical columns
categorical_columns = ['Environment', 'Material_Group', 'Material_Family', 'Material', 'UNS']
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Encode 'Condition/Comment' as ordinal categories
df['Condition/Comment'] = df['Condition/Comment'].map({1: 'not aerated', 0: 'aerated', 2: 'welded', 3: 'other', 4: 'max corrosion rate'}).fillna('other').replace({'not aerated': 1, 'aerated': 0, 'welded': 2, 'other': 3, 'max corrosion rate': 4}).astype(int)

# Defining numerical columns
numerical_columns = ['Concentration', 'Temperature_degC', 'Temperature_degF', 'Duration_days']

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
    ],
    remainder='passthrough'
)

# Split the data
X = df.drop('Rate', axis=1)
y = df['Rate']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Train the model
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test, y_pred))
print("Mean Squared Error (MSE):", mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error (RMSE):", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R^2 Score:", r2_score(y_test, y_pred))

# Feature importances
if hasattr(model.named_steps['regressor'], 'feature_importances_'):
    feature_importances = model.named_steps['regressor'].feature_importances_
    features = X_train.columns
    sorted_indices = np.argsort(feature_importances)[::-1]
    print("Feature importances:")
    for i in sorted_indices:
        print(f"{features[i]}: {feature_importances[i]}")
else:
    print("The regressor does not have feature importances attribute.")

# Save the model
joblib.dump(model, 'corrosion_rate_prediction_model.pkl')


model = joblib.load('corrosion_rate_prediction_model.pkl')

All required columns are present.


  df['Condition/Comment'] = df['Condition/Comment'].map({1: 'not aerated', 0: 'aerated', 2: 'welded', 3: 'other', 4: 'max corrosion rate'}).fillna('other').replace({'not aerated': 1, 'aerated': 0, 'welded': 2, 'other': 3, 'max corrosion rate': 4}).astype(int)


Mean Absolute Error (MAE): 1.646062887224245
Mean Squared Error (MSE): 21.642396030745928
Root Mean Squared Error (RMSE): 4.652138866236253
R^2 Score: 0.45404997744960474
Feature importances:
Temperature_degC: 0.3873521906709474
Concentration: 0.3311665232012413
Temperature_degF: 0.18539006214024323
UNS_S31603: 0.022148276900123057
Material_316L: 0.019537195272207975
Condition/Comment: 0.017237057061973884
UNS_N08904: 0.009126734556965069
Material_Carpenter 20Cb3 Carpenter Technology Corp.: 0.007140040800003216
Material_316: 0.006745447149387293
Material_Alloy 904L: 0.004427528461924312
UNS_N08020: 0.004296037862638962
UNS_S31600: 0.0025643240361199198
Duration_days: 0.0015685421626353772
Material_Alloy 20: 0.0005921738047638129
UNS_N08777: 0.0003013889396229615
Material_JS777: 0.0002245249845169697
Material_Jessop JS700 Jessop Steel Co.: 7.767518624424789e-05
UNS_N08700: 6.638125032549055e-05
Material_Carpenter 20Mo-6 Carpenter Technology Corp.: 3.2883674492302725e-05
UNS_N08026: 2.90

['corrosion_rate_prediction_model.pkl']

In [90]:
import pandas as pd
import joblib

# Load the saved model
model = joblib.load('corrosion_rate_prediction_model.pkl')

# Define new data for prediction
new_data = pd.DataFrame({
    'Environment': ['Phosphoric Acid'],
    'Material_Group': ['Stainless steels'],
    'Material_Family': ['Austenitic'],
    'Material': ['316L'],
    'UNS': ['S31603'],
    'Condition/Comment': [2],
    'Concentration': [53],
    'Temperature_degC': [93],
    'Temperature_degF': [200],
    'Duration_days': [30]
})

# Apply preprocessing to the new data
# The model includes preprocessing, so we don't need to preprocess separately

# Predict the corrosion rate using the loaded model
predicted_rate = model.predict(new_data)
print(f"Predicted Corrosion Rate: {predicted_rate[0]}")


ValueError: columns are missing: {'Material_316', 'Material_Jessop JS700 Jessop Steel Co.', 'Material_Uddeholm alloy 904L', 'UNS_S31700', 'UNS_N08700', 'UNS_S31703', 'Material_317', 'Material_317L', 'Material_Carpenter 20Cb3 Carpenter Technology Corp.', 'UNS_N08777', 'UNS_S31600', 'Material_Alloy 20', 'UNS_S31603', 'Material_Alloy 904L', 'Environment_Phosphorus', 'UNS_N08904', 'UNS_S30400', 'Material_316L', 'UNS_N08020', 'Material_Carpenter 20 Carpenter Technology Corp.', 'Material_Carpenter 20Mo-6 Carpenter Technology Corp.', 'UNS_N08026', 'Material_Durimet 20', 'Material_JS777'}