In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline



In [8]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"Return rate: {train_df['ReturnFlag'].mean():.2%}")

Train shape: (8000, 25)
Test shape: (2000, 24)
Return rate: 50.46%


In [9]:
def add_aggregation_features(df, train_df=None):
    """
    - CustomerReturnRate: Serial returners vs loyal customers
    - ProductReturnRate: Defective products vs quality products
    - CategoryReturnRate: Category-level patterns
    """
    df = df.copy()
    reference = train_df if train_df is not None else df

    # Customer return rate 
    customer_rates = reference.groupby('CustomerID')['ReturnFlag'].mean().to_dict()
    df['CustomerReturnRate'] = df['CustomerID'].map(customer_rates).fillna(0.5)

    # Product return rate 
    product_rates = reference.groupby('ProductID')['ReturnFlag'].mean().to_dict()
    df['ProductReturnRate'] = df['ProductID'].map(product_rates).fillna(0.5)

    # Category return rate
    category_rates = reference.groupby('Category')['ReturnFlag'].mean().to_dict()
    df['CategoryReturnRate'] = df['Category'].map(category_rates).fillna(0.5)

    return df

def add_temporal_features(df):
    """Add time-based features"""
    df = df.copy()
    df['Date'] = pd.to_datetime(df['Date'])
    df['DayOfWeek'] = df['Date'].dt.dayofweek
    df['Day'] = df['Date'].dt.day
    df['Month'] = df['Date'].dt.month
    df['IsWeekend'] = (df['DayOfWeek'] >= 5).astype(int)
    df['IsPostChristmas'] = ((df['Month'] == 12) & (df['Day'] >= 26)).astype(int)
    df['IsChristmasWeek'] = ((df['Month'] == 12) & (df['Day'] >= 18)).astype(int)
    return df

def preprocess(df):
    """Select final 17 features"""
    features = [
        # Aggregations (3)
        'CustomerReturnRate', 'ProductReturnRate', 'CategoryReturnRate',
        # Original features (8)
        'Age', 'Quantity', 'TotalPrice', 'CustomerSatisfaction', 'DiscountAmount',
        'OnlineOrderFlag', 'PromotionApplied', 'GiftWrap',
        # Temporal features (6)
        'DayOfWeek', 'Day', 'Month', 'IsWeekend', 'IsPostChristmas', 'IsChristmasWeek'
    ]
    X = df[features].fillna(0)
    return X


In [10]:
# Apply feature engineering
train_enhanced = add_aggregation_features(train_df)
train_enhanced = add_temporal_features(train_enhanced)

print(f"  Training samples: {len(train_enhanced)}")
print(f"  Total features: {len(preprocess(train_enhanced).columns)}")

  Training samples: 8000
  Total features: 17


In [11]:
print("final model, training on all 8000 samples")

# Prepare training data
X_train = preprocess(train_enhanced)
y_train = train_enhanced['ReturnFlag'].values

# Train final model
final_model = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(max_iter=3000, random_state=42))
])

final_model.fit(X_train, y_train)
print(f" model trained on {len(X_train)} samples")

# Prepare test data 
test_enhanced = add_aggregation_features(test_df, train_df=train_df)
test_enhanced = add_temporal_features(test_enhanced)
X_test = preprocess(test_enhanced)

print(f" test data prepared: {X_test.shape}")

# Generate predictions
predictions = final_model.predict(X_test)
print(f"predictions generated")

final model, training on all 8000 samples
 model trained on 8000 samples
 test data prepared: (2000, 17)
predictions generated


In [12]:
# Create submission
submission = pd.DataFrame({
    'TransactionID': test_df['TransactionID'],
    'ReturnFlag': predictions.astype(int)
})

# Save to CSV
submission.to_csv('submission_mohamed_final.csv', index=False)

print("\nFirst 10 predictions:")
print(submission.head(10))


First 10 predictions:
   TransactionID  ReturnFlag
0           8001           0
1           8002           0
2           8003           1
3           8004           1
4           8005           1
5           8006           0
6           8007           0
7           8008           0
8           8009           1
9           8010           1
