In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Train an XGBoost regressor that predicts log(selling_price) from order
# attributes, then persist the fitted model and its feature scaler.

# Load dataset.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-dtype columns; every column
# used below is coerced with pd.to_numeric anyway.
df = pd.read_csv("data/Copper_Set.csv", low_memory=False)

# Map categorical fields to integer codes. Values missing from a mapping
# become NaN and those rows are removed by the final dropna below.
item_type_mapping = {'W': 1, 'WI': 2, 'S': 3, 'Others': 4, 'PL': 5, 'IPL': 6, 'SLAWR': 7}
status_mapping = {'Lost': 0, 'Won': 1}
df['item type'] = df['item type'].map(item_type_mapping)
df['status'] = df['status'].map(status_mapping)

# Convert to numeric safely: unparseable entries become NaN.
numeric_cols = ['quantity tons', 'selling_price', 'application', 'thickness', 'width',
                'country', 'customer', 'product_ref']
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Drop rows with missing values before transformation
df = df.dropna(subset=numeric_cols)

# Keep only rows where every log input is strictly positive (log is undefined
# for values <= 0). .copy() gives an independent frame so the column
# assignments below do not write into a slice of the original.
positive_mask = (
    (df['selling_price'] > 0)
    & (df['thickness'] > 0)
    & (df['quantity tons'] > 0)
)
df = df[positive_mask].copy()

# Apply log transformation
df['quantity_log'] = np.log(df['quantity tons'])
df['selling_price_log'] = np.log(df['selling_price'])
df['thickness_log'] = np.log(df['thickness'])

# Columns the model actually consumes.
feature_cols = ['quantity_log', 'status', 'item type', 'application', 'thickness_log',
                'width', 'country', 'customer', 'product_ref']
target_col = 'selling_price_log'
model_cols = feature_cols + [target_col]

# Drop rows with NaN/inf in the model columns only. A bare dropna() would also
# discard rows whose only gaps are in columns the model never reads.
df[model_cols] = df[model_cols].replace([np.inf, -np.inf], np.nan)
df = df.dropna(subset=model_cols)

# Define X and y
X = df[feature_cols]
y = df[target_col]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling: fit on the training rows only, then apply the same transform to the
# test rows so no test statistics leak into the scaler.
scaler_reg = StandardScaler()
X_train_scaled = scaler_reg.fit_transform(X_train)
X_test_scaled = scaler_reg.transform(X_test)

# Model training
model_reg = XGBRegressor(n_estimators=150, max_depth=6, random_state=42)
model_reg.fit(X_train_scaled, y_train)

# Evaluation. The target is log(selling_price), so RMSE is in log units.
y_pred = model_reg.predict(X_test_scaled)
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")
print(f"R2 Score: {r2_score(y_test, y_pred):.4f}")

# Save model and scaler; inference must apply the same scaler before predict.
with open("Regression_Model.pkl", "wb") as f:
    pickle.dump(model_reg, f)
with open("scaler_reg.pkl", "wb") as f:
    pickle.dump(scaler_reg, f)


  df = pd.read_csv("models/Copper_Set.csv")


RMSE: 0.1481
R2 Score: 0.6113
