# In [1]:
# Simple implementation for processing a 129 MB parquet file


import pandas as pd
import numpy as np
import xgboost as xgb
import joblib
from sklearn.model_selection import train_test_split
from imblearn.combine import SMOTETomek
import time
import os

# SAS URL of the parquet file to process.
#
# SECURITY NOTE(review): a SAS URL embeds a signed access token, so it is a
# credential. Supply it through the VESTIAIRE_SAS_URL environment variable.
# The literal below is kept only as a backward-compatible fallback; its token
# should be rotated and the literal removed from source control.
sas_url = os.environ.get("VESTIAIRE_SAS_URL") or (
    "https://vestiairedata1.blob.core.windows.net/vestiairecontainer/cleaned_data.parquet?sp=racwdli&st=2025-05-05T01:36:02Z&se=2025-05-15T09:36:02Z&spr=https&sv=2024-11-04&sr=c&sig=n32zqbxDDp%2FRehKIM88ScyLA87jJsJ%2FEmjloSPJ%2BIJc%3D"
)

def get_important_features():
    """Return the column names selected as model inputs.

    A new list is built on every call, so callers may mutate the result
    without affecting later calls.
    """
    # Whitespace-separated spec; order is significant and preserved by split().
    feature_spec = """
        seller_price seller_badge_encoded should_be_gone seller_pass_rate
        price_to_earning_ratio seller_products_sold price_per_like brand_id
        product_type product_material product_like_count
        seller_num_products_listed seller_community_rank seller_activity_ratio
        product_color_encoded seller_num_followers available seller_country
        in_stock product_season_encoded usually_ships_within_encoded
        product_condition_encoded warehouse_name_encoded
    """
    return feature_spec.split()

# Step 1: Read the data
import requests
import pyarrow.parquet as pq
import pyarrow.fs
from io import BytesIO

from urllib.parse import quote, urlsplit

print("Reading data from SAS URL...")

# A SAS URL may address a single blob or a whole container, so three loading
# strategies are tried in order. Every failure is recorded rather than
# swallowed, and if all of them fail the script stops here with an error that
# lists each attempt (instead of failing later with an unrelated NameError
# because `df` was never assigned).


def _split_sas_url(url):
    """Split a SAS URL into ``(account_url, container_name, sas_token)``.

    The container is taken from the FIRST path segment, so the result is
    correct both for container-level URLs and for URLs that address a blob
    inside the container.

    Raises:
        ValueError: if the URL lacks a scheme, host, container segment or
            query string (the SAS token).
    """
    parsed = urlsplit(url)
    segments = [segment for segment in parsed.path.split('/') if segment]
    if not (parsed.scheme and parsed.netloc and parsed.query and segments):
        raise ValueError("Invalid SAS URL format")
    return f"{parsed.scheme}://{parsed.netloc}", segments[0], parsed.query


def _read_first_parquet_from_container(url):
    """List the container behind *url* and load its first parquet blob.

    Prints up to five blob names for diagnostics, then reads the first blob
    whose name ends in ``.parquet``.

    Returns:
        ``(dataframe, blob_name)`` for the blob that was loaded.

    Raises:
        ImportError: if ``azure-storage-blob`` is not installed.
        ValueError: if *url* is not a usable SAS URL.
        FileNotFoundError: if the container holds no blobs, or none of them
            is a parquet file.
    """
    # Imported lazily: the Azure SDK is only needed for this last-resort path.
    from azure.storage.blob import BlobServiceClient

    account_url, container_name, sas_token = _split_sas_url(url)

    print("Attempting to list files in the container...")
    service_client = BlobServiceClient(account_url, credential=sas_token)
    container_client = service_client.get_container_client(container_name)

    files = []
    parquet_name = None
    for blob in container_client.list_blobs():
        if len(files) < 5:
            files.append(blob.name)
        if parquet_name is None and blob.name.lower().endswith('.parquet'):
            parquet_name = blob.name
        # Stop as soon as there is enough to print and something to read.
        if len(files) >= 5 and parquet_name is not None:
            break

    print("First 5 files in container:")
    for f in files:
        print(f"  - {f}")

    if not files:
        raise FileNotFoundError("No files found in container")
    if parquet_name is None:
        raise FileNotFoundError("No parquet files found in container")

    # Blob names may contain characters that must be percent-encoded in a URL.
    full_url = f"{account_url}/{container_name}/{quote(parquet_name)}?{sas_token}"
    return pd.read_parquet(full_url), parquet_name


df = None
load_errors = []

# Method 1: let pandas read the URL directly.
try:
    df = pd.read_parquet(sas_url)
except Exception as exc:  # any reader/network failure falls through to method 2
    load_errors.append(f"direct read: {exc!r}")

# Method 2: download the bytes with requests and parse them in memory.
if df is None:
    try:
        # (connect, read) timeouts so a stalled connection cannot hang forever.
        response = requests.get(sas_url, timeout=(10, 300))
        response.raise_for_status()
        df = pd.read_parquet(BytesIO(response.content))
    except Exception as exc:  # falls through to method 3
        load_errors.append(f"HTTP download: {exc!r}")

if df is not None:
    print(f"Data loaded successfully. Shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")
else:
    # Method 3: treat the URL as a container and read its first parquet blob.
    try:
        df, first_file = _read_first_parquet_from_container(sas_url)
        print(f"Data loaded successfully from {first_file}. Shape: {df.shape}")
        print(f"Columns: {df.columns.tolist()}")
    except Exception as e2:
        load_errors.append(f"container listing: {e2!r}")
        print(f"Error: {e2}")
        print("Please provide a valid SAS URL to a specific parquet file")
        # Try to give more context about the error
        print("\nYour SAS URL should point to:")
        print("1. A specific parquet file (preferred)")
        print("2. Or a container with read permissions")
        raise RuntimeError(
            "Could not load the dataset. Attempts: " + "; ".join(load_errors)
        ) from e2

# Step 2: Train a model when the target column ('sold') is present; otherwise
# score the data with the previously saved model.
MODEL_PATH = 'models/model.pkl'
FEATURE_NAMES_PATH = 'models/feature_names.pkl'

if 'sold' in df.columns:
    print("\nTraining a model...")

    from sklearn.metrics import roc_auc_score

    # Get features
    features = get_important_features()

    # Check which features exist
    available_features = [f for f in features if f in df.columns]
    missing_features = [f for f in features if f not in df.columns]

    print(f"Available features: {len(available_features)}/{len(features)}")
    if missing_features:
        print(f"Missing features: {missing_features}")
    if not available_features:
        # Fail here with a clear message instead of deep inside the split/resampling.
        raise ValueError(
            "None of the expected feature columns are present in the data"
        )

    # Prepare data with available features
    X = df[available_features]
    y = df['sold']

    # Stratified split keeps the class ratio of 'sold' equal in both parts.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    # Resample the training split only, so the test split keeps the original
    # class distribution.
    smote_tomek = SMOTETomek(random_state=42)
    X_train_resampled, y_train_resampled = smote_tomek.fit_resample(X_train, y_train)

    # Train XGBoost model
    model = xgb.XGBClassifier(
        eval_metric='auc',
        random_state=42,
        n_estimators=100,
        max_depth=5,
        n_jobs=-1
    )

    print("Training model...")
    model.fit(X_train_resampled, y_train_resampled)

    # Evaluate on the untouched test split using the positive-class probability.
    y_pred_prob = model.predict_proba(X_test)[:, 1]
    roc_auc = roc_auc_score(y_test, y_pred_prob)
    print(f"ROC-AUC Score: {roc_auc:.4f}")

    # Save the model together with the exact feature order it was trained on.
    os.makedirs('models', exist_ok=True)
    joblib.dump(model, MODEL_PATH)
    joblib.dump(available_features, FEATURE_NAMES_PATH)
    print("Model saved to 'models/' directory")

else:
    print("\nNo 'sold' column found. Loading model for prediction...")

    # Both artifacts are required: the model and the feature order it expects.
    if os.path.exists(MODEL_PATH) and os.path.exists(FEATURE_NAMES_PATH):
        # NOTE: joblib files are pickles; only load artifacts from a trusted source.
        model = joblib.load(MODEL_PATH)
        feature_names = joblib.load(FEATURE_NAMES_PATH)

        # Report absent columns explicitly rather than via a raw pandas KeyError.
        absent_columns = [name for name in feature_names if name not in df.columns]
        if absent_columns:
            raise KeyError(
                f"Input data is missing columns the model was trained on: {absent_columns}"
            )

        # Prepare features
        X = df[feature_names]

        # Make predictions
        print("Making predictions...")
        predictions = model.predict_proba(X)[:, 1]

        # Create results dataframe
        results_df = pd.DataFrame({
            'id': df.index,
            'prediction': predictions
        })

        # Save results
        results_df.to_csv('predictions.csv', index=False)
        print("Predictions saved to 'predictions.csv'")
        print(f"Number of predictions: {len(results_df)}")
        print("\nSample predictions:")
        print(results_df.head())

    else:
        print("No model found. Please train a model first.")



# Recorded cell output: FileNotFoundError: [Errno 2] No such file or directory: 'null/Users/berly.biju'