In [None]:
# Sample data cleaning code
import pandas as pd

def clean_data(orders_path, returns_path):
    """Load the orders and returns CSVs, join them, and derive return features.

    Args:
        orders_path: Location of the orders CSV (anything ``pd.read_csv``
            accepts). It is joined on its ``order_id`` column.
        returns_path: Location of the returns CSV (anything ``pd.read_csv``
            accepts). It is joined on its ``order_id`` column.

    Returns:
        A DataFrame with one row per order/return pair produced by a left
        join of orders onto returns, plus:

        * ``is_returned`` -- 1 when the row has a non-null ``return_id``,
          otherwise 0.
        * ``return_reason`` -- missing values replaced by ``'no_return'``.
        * ``order_date`` / ``return_date`` -- parsed with ``pd.to_datetime``.

    Raises:
        KeyError: If the merged frame lacks ``return_id``, ``order_date`` or
            ``return_date``.
    """
    # Read from the caller-supplied locations. The arguments used to be
    # ignored in favour of absolute paths that only existed on one machine.
    orders = pd.read_csv(orders_path)
    returns = pd.read_csv(returns_path)

    # Left join keeps every order, including those that were never returned.
    merged = pd.merge(orders, returns, on='order_id', how='left')

    # Orders without a matching return have a null return_id after the join.
    merged['is_returned'] = merged['return_id'].notna().astype(int)

    # Give non-returned orders an explicit label instead of NaN.
    merged = merged.fillna({'return_reason': 'no_return'})

    # Parse date columns; rows without a return end up as NaT in return_date.
    merged['order_date'] = pd.to_datetime(merged['order_date'])
    merged['return_date'] = pd.to_datetime(merged['return_date'])

    return merged

In [7]:
# Sample analysis code
def analyze_returns(data, min_supplier_orders=100):
    """Summarise return rates by category, supplier and customer state.

    Args:
        data: DataFrame with ``category``, ``supplier_id``,
            ``customer_state`` and ``is_returned`` columns.
        min_supplier_orders: Suppliers are kept only when their count of
            non-null ``is_returned`` rows is strictly greater than this
            value. Defaults to 100.

    Returns:
        A dict with three entries:

        * ``'category_returns'`` -- mean of ``is_returned`` per category,
          sorted descending.
        * ``'supplier_returns'`` -- ``mean`` and ``count`` of
          ``is_returned`` per supplier, restricted to suppliers above the
          volume threshold and sorted by ``mean`` descending.
        * ``'geo_returns'`` -- mean of ``is_returned`` per customer state,
          sorted descending.
    """
    # Category analysis
    category_returns = (
        data.groupby('category')['is_returned']
        .mean()
        .sort_values(ascending=False)
    )

    # Supplier analysis: drop low-volume suppliers so that a handful of
    # orders cannot dominate the ranking.
    supplier_stats = data.groupby('supplier_id')['is_returned'].agg(['mean', 'count'])
    high_volume = supplier_stats['count'] > min_supplier_orders
    supplier_returns = supplier_stats[high_volume].sort_values('mean', ascending=False)

    # Geographic analysis
    geo_returns = (
        data.groupby('customer_state')['is_returned']
        .mean()
        .sort_values(ascending=False)
    )

    return {
        'category_returns': category_returns,
        'supplier_returns': supplier_returns,
        'geo_returns': geo_returns
    }

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

def build_model(data, test_size=0.2, random_state=42):
    """Fit a logistic-regression classifier that predicts ``is_returned``.

    Args:
        data: DataFrame with the feature columns ``category``, ``price``,
            ``supplier_rating``, ``customer_tenure``, ``marketing_channel``
            and ``delivery_days``, plus the target column ``is_returned``.
        test_size: Fraction of rows held out for evaluation; passed to
            ``train_test_split``. Defaults to 0.2.
        random_state: Seed passed to ``train_test_split`` so the split, and
            therefore the printed metrics, are reproducible between runs.
            Pass ``None`` for a different random split on every call.

    Returns:
        The fitted ``LogisticRegression`` model. A classification report for
        the held-out rows is printed as a side effect.
    """
    # Feature engineering: one-hot encode the two categorical columns and
    # leave the remaining feature columns as they are.
    feature_columns = ['category', 'price', 'supplier_rating', 'customer_tenure',
                       'marketing_channel', 'delivery_days']
    X = pd.get_dummies(data[feature_columns], columns=['category', 'marketing_channel'])

    y = data['is_returned']

    # Train-test split, seeded so that repeated runs evaluate on the same rows.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Model training
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Evaluation on the held-out split
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))

    return model