In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from datetime import datetime
import pickle
from google.colab import drive
import os

# Mount Google Drive
drive.mount('/content/drive')

# Define paths
input_path = '/content/drive/My Drive/StockX-Data-Contest-2019-3.csv'
output_dir = '/content/drive/My Drive/SneakerPricePrediction/'
os.makedirs(output_dir, exist_ok=True)
output_csv = os.path.join(output_dir, 'processed_sneaker_data.csv')

# Load dataset
df = pd.read_csv(input_path)

# Clean 'Sale Price' and 'Retail Price' (remove $ and convert to float)
df['Sale Price'] = df['Sale Price'].replace('[\$,]', '', regex=True).astype(float)
df['Retail Price'] = df['Retail Price'].replace('[\$,]', '', regex=True).astype(float)

# Handle missing values
df = df.dropna(subset=['Order Date', 'Release Date', 'Sale Price', 'Retail Price', 'Shoe Size', 'Buyer Region', 'Sneaker Name', 'Brand'])

# Convert date columns to datetime
df['Order Date'] = pd.to_datetime(df['Order Date'], format='%m/%d/%y', errors='coerce')
df['Release Date'] = pd.to_datetime(df['Release Date'], format='%m/%d/%y', errors='coerce')

# Drop rows with invalid dates
df = df.dropna(subset=['Order Date', 'Release Date'])

# Validate dates (ensure Order Date >= Release Date)
df = df[df['Order Date'] >= df['Release Date']]

# Sort by Order Date for time-based split
df = df.sort_values('Order Date')

# Feature engineering
# Days since release
df['Days_Since_Release'] = (df['Order Date'] - df['Release Date']).dt.days

# Extract year and month from Release Date
df['Release_Year'] = df['Release Date'].dt.year
df['Release_Month'] = df['Release Date'].dt.month

# New features
# Regional sales volume (count of sales per region)
df['Regional_Sales_Volume'] = df.groupby('Buyer Region')['Buyer Region'].transform('count')

# Sneaker rarity (inverse of sales volume per sneaker)
df['Sneaker_Rarity'] = 1 / df.groupby('Sneaker Name')['Sneaker Name'].transform('count')

# Monthly average sale price and volatility (by order year-month)
df['Order_YearMonth'] = df['Order Date'].dt.to_period('M')
df['Monthly_Avg_Price'] = df.groupby('Order_YearMonth')['Sale Price'].transform('mean')
df['Market_Volatility'] = df.groupby('Order_YearMonth')['Sale Price'].transform('std').fillna(df['Sale Price'].std())

# Price trend (ratio of Monthly_Avg_Price to historical average)
historical_avg_price = df['Sale Price'].mean()
df['Price_Trend'] = df['Monthly_Avg_Price'] / historical_avg_price
df = df.drop('Order_YearMonth', axis=1)

# Order year
df['Order_Year'] = df['Order Date'].dt.year

# Sneaker age (days since release normalized, avoid division by zero)
df['Sneaker_Age'] = df['Days_Since_Release'] / np.maximum(df['Order_Year'] - df['Release_Year'] + 1, 1)
df['Sneaker_Age'] = df['Sneaker_Age'].clip(lower=0, upper=1000)

# Sneaker hype (sales volume per sneaker relative to average)
avg_sneaker_sales = df.groupby('Sneaker Name')['Sneaker Name'].transform('count').mean()
df['Sneaker_Hype'] = df.groupby('Sneaker Name')['Sneaker Name'].transform('count') / avg_sneaker_sales

# Validate for inf or NaN
for col in ['Sneaker_Age', 'Price_Trend', 'Sneaker_Hype', 'Market_Volatility']:
    if np.isinf(df[col]).any() or pd.isna(df[col]).any():
        print(f"Warning: {col} contains inf or NaN values")
        df = df[~np.isinf(df[col]) & ~pd.isna(df[col])]

# Encode categorical variables
le_brand = LabelEncoder()
le_sneaker = LabelEncoder()
le_region = LabelEncoder()

df['Brand_Encoded'] = le_brand.fit_transform(df['Brand'])
df['Sneaker_Name_Encoded'] = le_sneaker.fit_transform(df['Sneaker Name'])
df['Buyer_Region_Encoded'] = le_region.fit_transform(df['Buyer Region'])

# Select features for modeling, include additional columns for output
features = [
    'Retail Price', 'Shoe Size', 'Days_Since_Release', 'Release_Year', 'Release_Month',
    'Brand_Encoded', 'Sneaker_Name_Encoded', 'Buyer_Region_Encoded',
    'Regional_Sales_Volume', 'Sneaker_Rarity', 'Monthly_Avg_Price', 'Order_Year', 'Sneaker_Age', 'Price_Trend', 'Sneaker_Hype', 'Market_Volatility'
]
target = 'Sale Price'
additional_columns = ['Order Date', 'Sneaker Name', 'Brand', 'Release Date', 'Shoe Size']

# Create processed dataset
processed_df = df[features + additional_columns + [target]]

# Save processed dataset
processed_df.to_csv(output_csv, index=False)

# Save encoders for app usage
with open(os.path.join(output_dir, 'le_brand.pkl'), 'wb') as f:
    pickle.dump(le_brand, f)
with open(os.path.join(output_dir, 'le_sneaker.pkl'), 'wb') as f:
    pickle.dump(le_sneaker, f)
with open(os.path.join(output_dir, 'le_region.pkl'), 'wb') as f:
    pickle.dump(le_region, f)

print(f"Preprocessing complete. Processed data saved to {output_csv}.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Preprocessing complete. Processed data saved to /content/drive/My Drive/SneakerPricePrediction/processed_sneaker_data.csv.


In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import shap
import matplotlib.pyplot as plt
import pickle
from google.colab import drive
import os

# Mount Google Drive
drive.mount('/content/drive')

# Define paths
input_csv = '/content/drive/My Drive/SneakerPricePrediction/processed_sneaker_data.csv'
output_dir = '/content/drive/My Drive/SneakerPricePrediction/'
os.makedirs(output_dir, exist_ok=True)
model_path = os.path.join(output_dir, 'xgboost_sneaker_model.json')
model_random_path = os.path.join(output_dir, 'xgboost_sneaker_model_random.json')
shap_summary_path = os.path.join(output_dir, 'shap_summary.png')
shap_importance_path = os.path.join(output_dir, 'shap_importance.png')
actual_vs_predicted_path = os.path.join(output_dir, 'actual_vs_predicted.png')
error_over_time_path = os.path.join(output_dir, 'error_over_time.png')
error_histogram_path = os.path.join(output_dir, 'error_histogram.png')
residual_plot_path = os.path.join(output_dir, 'residual_plot.png')
error_boxplot_path = os.path.join(output_dir, 'error_boxplot.png')
predicted_vs_actual_line_path = os.path.join(output_dir, 'predicted_vs_actual_line.png')
predicted_prices_path = os.path.join(output_dir, 'predicted_sneaker_prices.csv')

# Load processed dataset
df = pd.read_csv(input_csv)

# Clean Brand column (remove leading/trailing spaces)
df['Brand'] = df['Brand'].str.strip()

# Define features and target
features = [
    'Retail Price', 'Shoe Size', 'Days_Since_Release', 'Release_Year', 'Release_Month',
    'Brand_Encoded', 'Sneaker_Name_Encoded', 'Buyer_Region_Encoded',
    'Regional_Sales_Volume', 'Sneaker_Rarity', 'Monthly_Avg_Price', 'Order_Year', 'Sneaker_Age', 'Price_Trend', 'Sneaker_Hype', 'Market_Volatility'
]
target = 'Sale Price'

# Validate for inf or NaN in features
for col in features:
    if np.isinf(df[col]).any() or pd.isna(df[col]).any():
        print(f"Warning: {col} contains inf or NaN values")
        df = df[~np.isinf(df[col]) & ~pd.isna(df[col])]

# Time-based train-test split (80% train, 20% test)
train_size = int(0.8 * len(df))
X_train = df[features].iloc[:train_size]
y_train = df[target].iloc[:train_size]
X_test = df[features].iloc[train_size:]
y_test = df[target].iloc[train_size:]

# Hyperparameter tuning with GridSearchCV
param_grid = {
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300]
}
base_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    tree_method='hist',
    device='cuda',
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=4.0,  # L1 regularization
    reg_lambda=4.0,  # L2 regularization
    random_state=42
)
grid_search = GridSearchCV(
    estimator=base_model,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_absolute_error',
    n_jobs=1
)
grid_search.fit(X_train, y_train)
model = grid_search.best_estimator_

# Cross-validation (5-fold)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
mae_scores, rmse_scores, r2_scores = [], [], []
for train_idx, val_idx in kf.split(X_train):
    X_cv_train, X_cv_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_cv_train, y_cv_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    model.fit(X_cv_train, y_cv_train)
    y_cv_pred = model.predict(X_cv_val)
    mae_scores.append(mean_absolute_error(y_cv_val, y_cv_pred))
    rmse_scores.append(np.sqrt(mean_squared_error(y_cv_val, y_cv_pred)))
    r2_scores.append(r2_score(y_cv_val, y_cv_pred))

print("Cross-Validation Results (Time-Based Split):")
print(f"Mean MAE: {np.mean(mae_scores):.2f} ± {np.std(mae_scores):.2f}")
print(f"Mean RMSE: {np.mean(rmse_scores):.2f} ± {np.std(rmse_scores):.2f}")
print(f"Mean R²: {np.mean(r2_scores):.2f} ± {np.std(r2_scores):.2f}")

# Predictions on test set
y_pred = model.predict(X_test)

# Evaluation metrics on test set
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("\nTest Set Results (Time-Based Split):")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2:.2f}")
print(f"Best Parameters: {grid_search.best_params_}")

# Random split for comparison
X_train_rand, X_test_rand, y_train_rand, y_test_rand = train_test_split(
    df[features], df[target], test_size=0.2, random_state=42
)
grid_search.fit(X_train_rand, y_train_rand)
model_rand = grid_search.best_estimator_
y_pred_rand = model_rand.predict(X_test_rand)
mae_rand = mean_absolute_error(y_test_rand, y_pred_rand)
rmse_rand = np.sqrt(mean_squared_error(y_test_rand, y_pred_rand))
r2_rand = r2_score(y_test_rand, y_pred_rand)

print("\nTest Set Results (Random Split):")
print(f"MAE: {mae_rand:.2f}")
print(f"RMSE: {rmse_rand:.2f}")
print(f"R²: {r2_rand:.2f}")
print(f"Best Parameters (Random Split): {grid_search.best_params_}")

# Actual vs. Predicted Scatter Plot (Time-Based Split)
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.5, s=20)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.xlabel('Actual Sale Price ($)')
plt.ylabel('Predicted Sale Price ($)')
plt.title('Actual vs. Predicted Sneaker Sale Prices (Time-Based Split)')
plt.tight_layout()
plt.savefig(actual_vs_predicted_path)
plt.close()

# Predicted vs. Actual Line Plot (Time-Based Split)
plt.figure(figsize=(12, 6))
plt.plot(y_test[:100].values, label="Actual", color="blue")
plt.plot(y_pred[:100], label="Predicted", color="orange", linestyle="--")
plt.title("Actual vs Predicted Sale Prices (First 100 Samples)")
plt.xlabel("Sample")
plt.ylabel("Price (Dollars)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig(predicted_vs_actual_line_path)
plt.close()

# Error over Time Plot (Time-Based Split)
errors = np.abs(y_test - y_pred)
order_dates = pd.to_datetime(df['Order Date'].iloc[train_size:])
plt.figure(figsize=(10, 6))
plt.scatter(order_dates, errors, alpha=0.5, s=20)
plt.xlabel('Order Date')
plt.ylabel('Absolute Prediction Error ($)')
plt.title('Prediction Error Over Time (Time-Based Split)')
plt.tight_layout()
plt.savefig(error_over_time_path)
plt.close()

# Error Histogram (Time-Based Split)
plt.figure(figsize=(8, 6))
plt.hist(errors, bins=50, edgecolor='k')
plt.xlabel('Absolute Prediction Error ($)')
plt.ylabel('Frequency')
plt.title('Histogram of Prediction Errors (Time-Based Split)')
plt.tight_layout()
plt.savefig(error_histogram_path)
plt.close()

# Residual Plot (Time-Based Split)
residuals = y_pred - y_test
plt.figure(figsize=(8, 6))
plt.scatter(y_pred, residuals, alpha=0.5, s=20)
plt.axhline(0, color='r', linestyle='--', lw=2)
plt.xlabel('Predicted Sale Price ($)')
plt.ylabel('Residuals ($)')
plt.title('Residual Plot (Time-Based Split)')
plt.tight_layout()
plt.savefig(residual_plot_path)
plt.close()

# Error Boxplot by Year (Time-Based Split)
plt.figure(figsize=(10, 6))
test_df = pd.DataFrame({'Order_Year': df['Order_Year'].iloc[train_size:], 'Error': errors})
test_df.boxplot(column='Error', by='Order_Year')
plt.xlabel('Order Year')
plt.ylabel('Absolute Prediction Error ($)')
plt.title('Prediction Error by Year (Time-Based Split)')
plt.suptitle('')
plt.tight_layout()
plt.savefig(error_boxplot_path)
plt.close()

# SHAP feature importance (Time-Based Split)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# SHAP summary plot
plt.figure(figsize=(10, 6))
shap.summary_plot(shap_values, X_test, show=False)
plt.tight_layout()
plt.savefig(shap_summary_path)
plt.close()

# SHAP bar plot for feature importance
plt.figure(figsize=(10, 6))
shap.summary_plot(shap_values, X_test, plot_type='bar', show=False)
plt.tight_layout()
plt.savefig(shap_importance_path)
plt.close()

# Predictions and SHAP values for entire dataset (Random Split Model for App)
y_full_pred = model_rand.predict(df[features])
explainer_rand = shap.TreeExplainer(model_rand)
shap_values_full = explainer_rand.shap_values(df[features])

# Map feature names to app terminology
feature_mapping = {
    'Sneaker_Rarity': 'Rarity',
    'Sneaker_Hype': 'Demand',
    'Days_Since_Release': 'time',
    'Retail Price': 'Retail Price',
    'Shoe Size': 'Shoe Size',
    'Release_Year': 'Release Year',
    'Release_Month': 'Release Month',
    'Brand_Encoded': 'Brand',
    'Sneaker_Name_Encoded': 'Sneaker Name',
    'Buyer_Region_Encoded': 'Buyer Region',
    'Regional_Sales_Volume': 'Regional Sales Volume',
    'Monthly_Avg_Price': 'Monthly Avg Price',
    'Order_Year': 'Order Year',
    'Sneaker_Age': 'Sneaker Age',
    'Price_Trend': 'Price Trend',
    'Market_Volatility': 'Market Volatility'
}

# Prioritize Rarity, Demand, and time for SHAP features
priority_features = ['Sneaker_Rarity', 'Sneaker_Hype', 'Days_Since_Release']
shap_df = pd.DataFrame(shap_values_full, columns=features)
top_features = []
top_impacts = []
for i in range(len(shap_df)):
    shap_row = shap_df.iloc[i].copy()
    # Get absolute SHAP values for sorting
    abs_shap_row = shap_row.abs()
    # Initialize top 3 with priority features if they exist
    selected_features = []
    selected_impacts = []
    for feat in priority_features:
        if feat in features:
            selected_features.append(feature_mapping[feat])
            selected_impacts.append(shap_row[feat])
    # Fill remaining slots with highest absolute SHAP values
    remaining_features = [f for f in features if f not in priority_features]
    remaining_shap = abs_shap_row[remaining_features].sort_values(ascending=False)
    for feat in remaining_shap.index:
        if len(selected_features) < 3:
            selected_features.append(feature_mapping[feat])
            selected_impacts.append(shap_row[feat])
    top_features.append(selected_features[:3])
    top_impacts.append(selected_impacts[:3])

# Create output DataFrame
df_pred = pd.DataFrame({
    'Sneaker Name': df['Sneaker Name'],
    'Brand': df['Brand'],
    'Retail Price': df['Retail Price'].round(2),
    'Sale Price': df[target].round(2),
    'Predicted Price': y_full_pred.round(2),
    'Order Date': df['Order Date'],
    'Release Date': df['Release Date'],
    'Shoe Size': df['Shoe Size'].round(1),
    'SHAP Feature 1': [f[0] for f in top_features],
    'SHAP Impact 1': [round(i[0], 4) for i in top_impacts],
    'SHAP Feature 2': [f[1] for f in top_features],
    'SHAP Impact 2': [round(i[1], 4) for i in top_impacts],
    'SHAP Feature 3': [f[2] for f in top_features],
    'SHAP Impact 3': [round(i[2], 4) for i in top_impacts]
})

# Save predicted prices with SHAP data
df_pred.to_csv(predicted_prices_path, index=False)

# Save models
model.save_model(model_path)  # Time-based model
model_rand.save_model(model_random_path)  # Random split model

print(f"Training complete. Time-based model saved to {model_path}. Random split model saved to {model_random_path}.")
print(f"Plots saved to {shap_summary_path}, {shap_importance_path}, {actual_vs_predicted_path}, {error_over_time_path}, {error_histogram_path}, {residual_plot_path}, {error_boxplot_path}, {predicted_vs_actual_line_path}.")
print(f"Predicted prices saved to {predicted_prices_path}.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Cross-Validation Results (Time-Based Split):
Mean MAE: 20.82 ± 0.34
Mean RMSE: 40.58 ± 1.25
Mean R²: 0.98 ± 0.00

Test Set Results (Time-Based Split):
MAE: 90.77
RMSE: 149.79
R²: 0.43
Best Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}

Test Set Results (Random Split):
MAE: 16.37
RMSE: 31.23
R²: 0.98
Best Parameters (Random Split): {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 300}
Training complete. Time-based model saved to /content/drive/My Drive/SneakerPricePrediction/xgboost_sneaker_model.json. Random split model saved to /content/drive/My Drive/SneakerPricePrediction/xgboost_sneaker_model_random.json.
Plots saved to /content/drive/My Drive/SneakerPricePrediction/shap_summary.png, /content/drive/My Drive/SneakerPricePrediction/shap_importance.png, /content/drive/My Drive/SneakerPricePrediction/actual_vs_predicted.png, /c

<Figure size 1000x600 with 0 Axes>

In [None]:
import pandas as pd
import json
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Define paths
input_csv = '/content/drive/My Drive/SneakerPricePrediction/predicted_sneaker_prices.csv'
output_json = '/content/drive/My Drive/SneakerPricePrediction/predicted_sneaker_prices.json'

# Load CSV
df = pd.read_csv(input_csv)

# Convert dates to string format (YYYY-MM-DD)
df['Order Date'] = pd.to_datetime(df['Order Date']).dt.strftime('%Y-%m-%d')
df['Release Date'] = pd.to_datetime(df['Release Date']).dt.strftime('%Y-%m-%d')

# Ensure numeric columns are properly formatted
numeric_columns = ['Retail Price', 'Sale Price', 'Predicted Price', 'Shoe Size', 'SHAP Impact 1', 'SHAP Impact 2', 'SHAP Impact 3']
for col in numeric_columns:
    df[col] = df[col].astype(float)

# Convert DataFrame to list of dictionaries
json_data = df.to_dict(orient='records')

# Save to JSON file
with open(output_json, 'w') as f:
    json.dump(json_data, f, indent=2)

print(f"JSON file saved to {output_json}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
JSON file saved to /content/drive/My Drive/SneakerPricePrediction/predicted_sneaker_prices.json


In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive
import os

# Mount Google Drive
drive.mount('/content/drive')

# Define paths
input_csv = '/content/drive/My Drive/StockX-Data-Contest-2019-3.csv'
output_dir = '/content/drive/My Drive/SneakerPricePrediction/'
os.makedirs(output_dir, exist_ok=True)
sale_price_hist_path = os.path.join(output_dir, 'sale_price_histogram.png')
brand_boxplot_path = os.path.join(output_dir, 'brand_sale_price_boxplot.png')
correlation_matrix_path = os.path.join(output_dir, 'correlation_matrix.png')

# Load dataset
df = pd.read_csv(input_csv)

# Clean 'Sale Price' and 'Retail Price' (remove $ and convert to float)
df['Sale Price'] = df['Sale Price'].replace('[\$,]', '', regex=True).astype(float)
df['Retail Price'] = df['Retail Price'].replace('[\$,]', '', regex=True).astype(float)

# Convert date columns to datetime
df['Order Date'] = pd.to_datetime(df['Order Date'], format='%m/%d/%y', errors='coerce')
df['Release Date'] = pd.to_datetime(df['Release Date'], format='%m/%d/%y', errors='coerce')

# Drop rows with invalid dates
df = df.dropna(subset=['Order Date', 'Release Date'])

# Days since release
df['Days_Since_Release'] = (df['Order Date'] - df['Release Date']).dt.days

# Histogram of Sale Price
plt.figure(figsize=(10, 6))
plt.hist(df['Sale Price'], bins=50, edgecolor='k')
plt.xlabel('Sale Price ($)')
plt.ylabel('Frequency')
plt.title('Distribution of Sneaker Sale Prices')
plt.tight_layout()
plt.savefig(sale_price_hist_path)
plt.close()

# Box Plot of Sale Price by Brand
plt.figure(figsize=(12, 6))
sns.boxplot(x='Brand', y='Sale Price', data=df)
plt.xlabel('Brand')
plt.ylabel('Sale Price ($)')
plt.title('Sale Price Distribution by Brand')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig(brand_boxplot_path)
plt.close()

# Correlation Matrix of Numeric Features
numeric_cols = ['Sale Price', 'Retail Price', 'Shoe Size', 'Days_Since_Release']
correlation_matrix = df[numeric_cols].corr()
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix of Numeric Features')
plt.tight_layout()
plt.savefig(correlation_matrix_path)
plt.close()

print(f"Charts saved to {sale_price_hist_path}, {brand_boxplot_path}, {correlation_matrix_path}")

Mounted at /content/drive
Charts saved to /content/drive/My Drive/SneakerPricePrediction/sale_price_histogram.png, /content/drive/My Drive/SneakerPricePrediction/brand_sale_price_boxplot.png, /content/drive/My Drive/SneakerPricePrediction/correlation_matrix.png


In [1]:
import pandas as pd
from google.colab import drive
import os

# Mount Google Drive
drive.mount('/content/drive')

# Define paths
input_csv = '/content/drive/My Drive/StockX-Data-Contest-2019-3.csv'
output_dir = '/content/drive/My Drive/SneakerPricePrediction/'
os.makedirs(output_dir, exist_ok=True)
sorted_csv = os.path.join(output_dir, 'sorted_sale_prices.csv')

# Load dataset
df = pd.read_csv(input_csv)

# Clean 'Sale Price' (remove $ and convert to float)
df['Sale Price'] = df['Sale Price'].replace('[\$,]', '', regex=True).astype(float)

# Handle missing values (if any)
df = df.dropna(subset=['Sale Price'])

# Find min and max Sale Price
min_sale_price = df['Sale Price'].min()
max_sale_price = df['Sale Price'].max()

# Sort the dataset by Sale Price (ascending)
df_sorted = df.sort_values(by='Sale Price', ascending=True)

# Save the sorted dataset to a new CSV
df_sorted.to_csv(sorted_csv, index=False)

# Print results
print(f"Minimum Sale Price: ${min_sale_price:.2f}")
print(f"Maximum Sale Price: ${max_sale_price:.2f}")
print(f"Sorted dataset saved to {sorted_csv}")

Mounted at /content/drive
Minimum Sale Price: $186.00
Maximum Sale Price: $4050.00
Sorted dataset saved to /content/drive/My Drive/SneakerPricePrediction/sorted_sale_prices.csv
