BASE_LINE MODEL

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, roc_auc_score

In [3]:

# Load datasets (assuming they are in a 'data' directory)
orders = pd.read_csv(r'data/olist_orders_dataset.csv')
reviews = pd.read_csv(r'data/olist_order_reviews_dataset.csv')
items = pd.read_csv(r'data/olist_order_items_dataset.csv')
products = pd.read_csv(r'data/olist_products_dataset.csv')
payments = pd.read_csv(r'data/olist_order_payments_dataset.csv')
customers = pd.read_csv(r'data/olist_customers_dataset.csv')

# Classify Review Scores (1–5 stars)

# Merge reviews with orders
df = pd.merge(reviews[['order_id', 'review_score']], orders[['order_id', 'order_purchase_timestamp', 'order_delivered_customer_date']], on='order_id')

# Filter orders with only one item
item_counts = items.groupby('order_id').size().reset_index(name='item_count')
single_item_orders = item_counts[item_counts['item_count'] == 1]['order_id']
df = df[df['order_id'].isin(single_item_orders)]

# Filter orders with only one payment
payment_counts = payments.groupby('order_id').size().reset_index(name='payment_count')
single_payment_orders = payment_counts[payment_counts['payment_count'] == 1]['order_id']
df = df[df['order_id'].isin(single_payment_orders)]

# Merge with items and products
df = pd.merge(df, items[['order_id', 'product_id', 'price', 'freight_value']], on='order_id')
df = pd.merge(df, products[['product_id', 'product_category_name']], on='product_id')
df = pd.merge(df, payments[['order_id', 'payment_type', 'payment_installments']], on='order_id')

# Calculate delivery time
df['order_purchase_timestamp'] = pd.to_datetime(df['order_purchase_timestamp'])
df['order_delivered_customer_date'] = pd.to_datetime(df['order_delivered_customer_date'])
df['delivery_time'] = (df['order_delivered_customer_date'] - df['order_purchase_timestamp']).dt.days
df = df.dropna(subset=['delivery_time'])

# Encode categorical variables
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_categories = encoder.fit_transform(df[['product_category_name', 'payment_type']])
encoded_df = pd.DataFrame(encoded_categories, columns=encoder.get_feature_names_out())
df = pd.concat([df.reset_index(drop=True), encoded_df], axis=1)
df = df.drop(['product_category_name', 'payment_type', 'order_id', 'product_id', 'order_purchase_timestamp', 'order_delivered_customer_date'], axis=1)

# Select features and target
features = ['payment_installments', 'price', 'freight_value', 'delivery_time'] + list(encoded_df.columns)
X = df[features]
y = df['review_score']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train multinomial logistic regression model
model_task1 = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
model_task1.fit(X_train_scaled, y_train)

# Predict and evaluate
y_pred_task1 = model_task1.predict(X_test_scaled)
print("Task 1: Classify Review Scores")
print("Accuracy:", accuracy_score(y_test, y_pred_task1))
print("\nClassification Report:\n", classification_report(y_test, y_pred_task1))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_task1))

Task 1: Classify Review Scores
Accuracy: 0.6156359531871918

Classification Report:
               precision    recall  f1-score   support

           1       0.56      0.27      0.36      1394
           2       0.00      0.00      0.00       451
           3       0.00      0.00      0.00      1417
           4       1.00      0.00      0.00      3482
           5       0.62      0.99      0.76     10089

    accuracy                           0.62     16833
   macro avg       0.44      0.25      0.23     16833
weighted avg       0.62      0.62      0.49     16833


Confusion Matrix:
 [[ 377    0    0    0 1017]
 [  54    0    0    0  397]
 [  73    0    0    0 1344]
 [  67    0    0    1 3414]
 [ 104    0    0    0 9985]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [4]:
# Predict Likelihood of Repeat Purchases
# Merge orders with customers to get customer_unique_id
df_orders_customers = pd.merge(orders[['order_id', 'customer_id', 'order_purchase_timestamp']], customers[['customer_id', 'customer_unique_id', 'customer_state']], on='customer_id')

# Sort by customer_unique_id and order_purchase_timestamp to identify first purchases
df_orders_customers = df_orders_customers.sort_values(['customer_unique_id', 'order_purchase_timestamp'])

# Get first orders for each customer
first_orders = df_orders_customers.groupby('customer_unique_id').first().reset_index()

# Calculate number of orders per customer to determine repeat purchases
order_counts = df_orders_customers.groupby('customer_unique_id').size().reset_index(name='order_count')
first_orders = pd.merge(first_orders, order_counts, on='customer_unique_id')

# Define target: 1 if repeat purchase (order_count > 1), 0 otherwise
first_orders['repeat_purchase'] = (first_orders['order_count'] > 1).astype(int)

# Filter first orders with one item and one payment
first_order_ids = first_orders['order_id']
item_counts_first = items[items['order_id'].isin(first_order_ids)].groupby('order_id').size().reset_index(name='item_count')
payment_counts_first = payments[payments['order_id'].isin(first_order_ids)].groupby('order_id').size().reset_index(name='payment_count')
single_item_first = item_counts_first[item_counts_first['item_count'] == 1]['order_id']
single_payment_first = payment_counts_first[payment_counts_first['payment_count'] == 1]['order_id']
single_first_orders = set(single_item_first) & set(single_payment_first)
first_orders = first_orders[first_orders['order_id'].isin(single_first_orders)]

# Merge with items, products, and payments
first_orders = pd.merge(first_orders, items[['order_id', 'product_id', 'price', 'freight_value']], on='order_id')
first_orders = pd.merge(first_orders, products[['product_id', 'product_category_name']], on='product_id')
first_orders = pd.merge(first_orders, payments[['order_id', 'payment_type', 'payment_installments']], on='order_id')

# Encode categorical variables
encoder_task2 = OneHotEncoder(sparse_output=False, drop='first')
encoded_categories_task2 = encoder_task2.fit_transform(first_orders[['customer_state', 'product_category_name', 'payment_type']])
encoded_df_task2 = pd.DataFrame(encoded_categories_task2, columns=encoder_task2.get_feature_names_out())
first_orders = pd.concat([first_orders.reset_index(drop=True), encoded_df_task2], axis=1)
first_orders = first_orders.drop(['customer_state', 'product_category_name', 'payment_type', 'order_id', 'customer_id', 'customer_unique_id', 'order_purchase_timestamp', 'product_id'], axis=1)

# Select features and target
features_task2 = ['price', 'freight_value', 'payment_installments'] + list(encoded_df_task2.columns)
X_task2 = first_orders[features_task2]
y_task2 = first_orders['repeat_purchase']

# Split data into training and testing sets
X_train_task2, X_test_task2, y_train_task2, y_test_task2 = train_test_split(X_task2, y_task2, test_size=0.2, random_state=42)

# Scale features
scaler_task2 = StandardScaler()
X_train_scaled_task2 = scaler_task2.fit_transform(X_train_task2)
X_test_scaled_task2 = scaler_task2.transform(X_test_task2)

# Train binary logistic regression model
model_task2 = LogisticRegression(solver='lbfgs', max_iter=1000)
model_task2.fit(X_train_scaled_task2, y_train_task2)

# Predict and evaluate
y_pred_task2 = model_task2.predict(X_test_scaled_task2)
y_prob_task2 = model_task2.predict_proba(X_test_scaled_task2)[:, 1]
print("\nTask 2: Predict Repeat Purchases")
print("Accuracy:", accuracy_score(y_test_task2, y_pred_task2))
print("Precision:", precision_score(y_test_task2, y_pred_task2))
print("Recall:", recall_score(y_test_task2, y_pred_task2))
print("ROC-AUC:", roc_auc_score(y_test_task2, y_prob_task2))


Task 2: Predict Repeat Purchases
Accuracy: 0.9714952991197078
Precision: 0.0
Recall: 0.0
ROC-AUC: 0.6229213037615958


  _warn_prf(average, modifier, msg_start, len(result))
