<a href="https://colab.research.google.com/github/NadeeraSilvaa/Admin_Panel/blob/main/03_price_estimator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Price Estimation Model
## CIS 6035 Final Project - Blockchain-Based AI Marketplace

Per proposal: "Regression model trained on historical data (e.g., Linear Regression)"

This notebook trains a Ridge (L2-regularized linear) regression model to estimate dataset prices based on:
- Dataset size (rows, columns, file size)
- Category
- Uniqueness and quality scores

In [1]:
# Install required packages
# `%pip` (rather than `!pip`) targets the environment of the running kernel.
# NOTE(review): versions are unpinned, so a re-run may pick up newer releases
# than the ones the saved outputs were produced with — consider pinning.
%pip install pandas numpy scikit-learn joblib



In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import os

In [3]:
# Attach Google Drive to the Colab runtime's filesystem.
# NOTE(review): the data-loading cell reads from a relative 'data/' path, not
# from this mount point — confirm the working directory is set as intended.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [4]:
# Load the training set from disk. When the CSV is absent, synthesise a small
# demonstration set instead so the remaining cells can still run.
try:
    df = pd.read_csv('data/training_data.csv')
except FileNotFoundError:
    print("Training data not found. Generating inline...")

    # Fixed seed: the synthetic set is identical on every run. The order of
    # the random draws below must stay as-is to keep it that way.
    np.random.seed(42)
    n_samples = 1000
    categories = ['medical', 'finance', 'retail', 'automotive', 'other']

    synthetic_columns = {}
    synthetic_columns['category'] = np.random.choice(categories, n_samples)
    synthetic_columns['row_count'] = np.random.randint(1000, 1000000, n_samples)
    synthetic_columns['column_count'] = np.random.randint(5, 100, n_samples)
    synthetic_columns['file_size_mb'] = np.random.uniform(1, 500, n_samples)
    synthetic_columns['uniqueness_score'] = np.random.randint(50, 100, n_samples)
    synthetic_columns['quality_score'] = np.random.randint(60, 100, n_samples)
    df = pd.DataFrame(synthetic_columns)

    # Price = (log-scaled row count + weighted linear terms)
    #         * per-category multiplier * uniform noise in [0.8, 1.2)
    cat_mult = {'medical': 1.5, 'automotive': 1.4, 'finance': 1.3, 'retail': 1.0, 'other': 0.8}
    linear_weights = {
        'column_count': 10,
        'file_size_mb': 2,
        'uniqueness_score': 5,
        'quality_score': 3,
    }
    base_price = np.log1p(df['row_count']) * 50
    for column, weight in linear_weights.items():
        base_price = base_price + df[column] * weight

    category_factor = df['category'].map(cat_mult)
    noise = np.random.uniform(0.8, 1.2, n_samples)
    df['price'] = base_price * category_factor * noise
else:
    print(f"Loaded {len(df)} samples")

# Preview the first rows as the cell's rich output.
df.head()

Loaded 5000 samples


Unnamed: 0,category,row_count,column_count,file_size_mb,columns,uniqueness_score,quality_score,price
0,medical,2729796,20,264.54,doctor|hospital|heart_rate|patient_id|medicati...,76,60,648.07
1,retail,2929719,67,1359.02,store|order_id|category|brand|review|cart|pric...,85,73,1585.93
2,retail,1591390,46,753.5,inventory|discount|quantity|price|rating|brand...,75,80,1236.3
3,medical,523084,96,494.79,hospital|health_score|gender|heart_rate|doctor...,82,76,1748.61
4,other,3492214,28,812.33,value|name|source|id|timestamp|score|target|st...,52,67,557.39


In [5]:
# Feature Engineering
print("Preparing features...")

# Integer-code the category labels; the fitted encoder stays available in
# `category_encoder` so the same label -> code mapping can be reapplied later.
# NOTE(review): integer codes impose an arbitrary ordering on the categories
# for a linear model — one-hot encoding may fit better; left as-is here.
category_encoder = LabelEncoder()
df['category_encoded'] = category_encoder.fit_transform(df['category'])

# Derived columns: log-compressed size measures and cells per megabyte.
# The 0.001 offset keeps the division finite when file_size_mb is 0.
cell_count = df['row_count'] * df['column_count']
df['log_row_count'] = np.log1p(df['row_count'])
df['log_file_size'] = np.log1p(df['file_size_mb'])
df['data_density'] = cell_count / (df['file_size_mb'] + 0.001)

# The order of these names fixes the column order of the feature matrix.
feature_cols = ['row_count', 'column_count', 'file_size_mb', 'category_encoded',
                'uniqueness_score', 'quality_score']
derived_cols = ['log_row_count', 'log_file_size', 'data_density']
extended_features = [*feature_cols, *derived_cols]

# Plain NumPy arrays for scikit-learn: features and target.
X = df[extended_features].values
y = df['price'].values

# Standardise every feature column using statistics from all rows of X.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

print(f"Feature matrix shape: {X_scaled.shape}")
print(f"Features: {extended_features}")

Preparing features...
Feature matrix shape: (5000, 9)
Features: ['row_count', 'column_count', 'file_size_mb', 'category_encoded', 'uniqueness_score', 'quality_score', 'log_row_count', 'log_file_size', 'data_density']


In [6]:
# Train/Test Split
# Split the *unscaled* feature matrix so that scaling statistics can be learned
# from the training rows only. Splitting an array that was standardised with
# the mean/std of every row lets test-set information leak into preprocessing.
# The same random_state yields the same row partition as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Refit the scaler on the training partition and apply it to both partitions.
# Rebinding `scaler` means the object persisted later is the train-only fit.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")

Training samples: 4000
Test samples: 1000


In [7]:
# Fit the price model on the training split, then score it on the held-out split.
print("Training Linear Regression model...")

# Ridge = linear regression with an L2 penalty; alpha is the penalty strength.
model = Ridge(alpha=1.0)
model.fit(X_train, y_train)

print("Training complete!")

# Held-out predictions and the three reported metrics.
y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # RMSE is in price units

# Report block framed by a 50-character rule.
rule = '=' * 50
print(f"\n{rule}")
print("MODEL EVALUATION")
print(rule)
print(f"\nR² Score: {r2:.4f}")
print(f"Mean Absolute Error: ${mae:.2f}")
print(f"Root Mean Squared Error: ${rmse:.2f}")

# An R² of 0.75 is the bar this notebook uses for a "good" fit.
verdict = (
    "\n✅ Good model fit (R² >= 0.75)"
    if r2 >= 0.75
    else "\n⚠️ Model could be improved (R² < 0.75)"
)
print(verdict)

Training Linear Regression model...
Training complete!

MODEL EVALUATION

R² Score: 0.7346
Mean Absolute Error: $387.53
Root Mean Squared Error: $510.19

⚠️ Model could be improved (R² < 0.75)


In [8]:
# Persist the fitted model and both preprocessors under ./models
os.makedirs('models', exist_ok=True)

# Destination path -> fitted object; insertion order is the write order and
# the order in which the paths are listed below.
artifacts = {
    'models/price_estimator.pkl': model,
    'models/price_scaler.pkl': scaler,
    'models/price_category_encoder.pkl': category_encoder,
}
for artifact_path, fitted_obj in artifacts.items():
    joblib.dump(fitted_obj, artifact_path)

print("\nModels saved:")
for artifact_path in artifacts:
    print(f"  - {artifact_path}")
print("\n✅ Price estimator training complete!")


Models saved:
  - models/price_estimator.pkl
  - models/price_scaler.pkl
  - models/price_category_encoder.pkl

✅ Price estimator training complete!
