In [37]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression

In [38]:
# Step 1: Load data
df = pd.read_csv('/content/shipping.csv')
df

Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N
0,1,D,Flight,4,2,177,3,low,F,44,1233,1
1,2,F,Flight,4,5,216,2,low,M,59,3088,1
2,3,A,Flight,2,2,183,4,low,M,48,3374,1
3,4,B,Flight,3,3,176,4,medium,M,10,1177,1
4,5,C,Flight,2,2,184,3,medium,F,46,2484,1
...,...,...,...,...,...,...,...,...,...,...,...,...
10994,10995,A,Ship,4,1,252,5,medium,F,1,1538,1
10995,10996,B,Ship,4,1,232,5,medium,F,6,1247,0
10996,10997,C,Ship,5,4,242,5,low,F,4,1155,0
10997,10998,F,Ship,5,2,223,6,medium,M,2,1210,0


In [39]:
# Step 2: Handle missing values (if any)
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10999 entries, 0 to 10998
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   10999 non-null  int64 
 1   Warehouse_block      10999 non-null  object
 2   Mode_of_Shipment     10999 non-null  object
 3   Customer_care_calls  10999 non-null  int64 
 4   Customer_rating      10999 non-null  int64 
 5   Cost_of_the_Product  10999 non-null  int64 
 6   Prior_purchases      10999 non-null  int64 
 7   Product_importance   10999 non-null  object
 8   Gender               10999 non-null  object
 9   Discount_offered     10999 non-null  int64 
 10  Weight_in_gms        10999 non-null  int64 
 11  Reached.on.Time_Y.N  10999 non-null  int64 
dtypes: int64(8), object(4)
memory usage: 3.2 MB


In [40]:
# Renaming the Reached.on.Time_Y.N to be more readable
df.rename(columns={"Reached.on.Time_Y.N":"on_time"}, inplace=True)

# Lowercase all column names
df = df.rename(columns={col: col.lower() for col in df.columns})

In [47]:
for column in df.columns:
    unique_values = df[column].nunique()
    print(f"Unique values in {column}: {unique_values}")

Unique values in id: 10999
Unique values in warehouse_block: 5
Unique values in mode_of_shipment: 3
Unique values in customer_care_calls: 6
Unique values in customer_rating: 5
Unique values in cost_of_the_product: 215
Unique values in prior_purchases: 8
Unique values in product_importance: 3
Unique values in gender: 2
Unique values in discount_offered: 65
Unique values in weight_in_gms: 4034
Unique values in on_time: 2


In [41]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,10999.0,5500.0,3175.28214,1.0,2750.5,5500.0,8249.5,10999.0
customer_care_calls,10999.0,4.054459,1.14149,2.0,3.0,4.0,5.0,7.0
customer_rating,10999.0,2.990545,1.413603,1.0,2.0,3.0,4.0,5.0
cost_of_the_product,10999.0,210.196836,48.063272,96.0,169.0,214.0,251.0,310.0
prior_purchases,10999.0,3.567597,1.52286,2.0,3.0,3.0,4.0,10.0
discount_offered,10999.0,13.373216,16.205527,1.0,4.0,7.0,10.0,65.0
weight_in_gms,10999.0,3634.016729,1635.377251,1001.0,1839.5,4149.0,5050.0,7846.0
on_time,10999.0,0.596691,0.490584,0.0,0.0,1.0,1.0,1.0


In [43]:
# Step 3: Normalize numerical features
scaler = MinMaxScaler()
numerical_features = df.select_dtypes(include=np.number).columns
df[numerical_features] = scaler.fit_transform(df[numerical_features])

In [44]:
df

Unnamed: 0,id,warehouse_block,mode_of_shipment,customer_care_calls,customer_rating,cost_of_the_product,prior_purchases,product_importance,gender,discount_offered,weight_in_gms,on_time
0,0.000000,D,Flight,0.4,0.25,0.378505,0.125,low,F,0.671875,0.033893,1.0
1,0.000091,F,Flight,0.4,1.00,0.560748,0.000,low,M,0.906250,0.304894,1.0
2,0.000182,A,Flight,0.0,0.25,0.406542,0.250,low,M,0.734375,0.346676,1.0
3,0.000273,B,Flight,0.2,0.50,0.373832,0.250,medium,M,0.140625,0.025712,1.0
4,0.000364,C,Flight,0.0,0.25,0.411215,0.125,medium,F,0.703125,0.216654,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
10994,0.999636,A,Ship,0.4,0.00,0.728972,0.375,medium,F,0.000000,0.078451,1.0
10995,0.999727,B,Ship,0.4,0.00,0.635514,0.375,medium,F,0.078125,0.035939,0.0
10996,0.999818,C,Ship,0.6,0.75,0.682243,0.375,low,F,0.046875,0.022498,0.0
10997,0.999909,F,Ship,0.6,0.25,0.593458,0.500,medium,M,0.015625,0.030533,0.0


In [51]:
# Step 4: Label encode non-numerical features
encoder = LabelEncoder()
categorical_feature = df.select_dtypes(exclude=np.number).columns

for col in categorical_feature:
  df[col] = encoder.fit_transform(df[col])

In [52]:
df

Unnamed: 0,id,warehouse_block,mode_of_shipment,customer_care_calls,customer_rating,cost_of_the_product,prior_purchases,product_importance,gender,discount_offered,weight_in_gms,on_time
0,0.000000,3,0,0.4,0.25,0.378505,0.125,1,0,0.671875,0.033893,1.0
1,0.000091,4,0,0.4,1.00,0.560748,0.000,1,1,0.906250,0.304894,1.0
2,0.000182,0,0,0.0,0.25,0.406542,0.250,1,1,0.734375,0.346676,1.0
3,0.000273,1,0,0.2,0.50,0.373832,0.250,2,1,0.140625,0.025712,1.0
4,0.000364,2,0,0.0,0.25,0.411215,0.125,2,0,0.703125,0.216654,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
10994,0.999636,0,2,0.4,0.00,0.728972,0.375,2,0,0.000000,0.078451,1.0
10995,0.999727,1,2,0.4,0.00,0.635514,0.375,2,0,0.078125,0.035939,0.0
10996,0.999818,2,2,0.6,0.75,0.682243,0.375,1,0,0.046875,0.022498,0.0
10997,0.999909,4,2,0.6,0.25,0.593458,0.500,2,1,0.015625,0.030533,0.0


In [57]:
# Step 5: Split the data
X = df.drop('on_time', axis=1)
y = df['on_time']
X_train, X_rem, y_train, y_rem = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state=42)

In [60]:
print(
    f'X_train: {X_train.shape = }'
    f'\nX_val: {X_val.shape = }'
    f'\nX_test: {X_test.shape = }'
    f'\ny_train: {y_train.shape = }'
    f'\ny_val: {y_val.shape = }'
    f'\ny_test: {y_test.shape = }'
)

X_train: X_train.shape = (7699, 11)
X_val: X_val.shape = (1650, 11)
X_test: X_test.shape = (1650, 11)
y_train: y_train.shape = (7699,)
y_val: y_val.shape = (1650,)
y_test: y_test.shape = (1650,)


In [61]:
# Step 6: Build the linear regression model
model = LogisticRegression()
model.fit(X_train, y_train)

In [62]:
# Make predictions
y_pred = model.predict(X_test)

In [63]:
y_pred

array([0., 1., 0., ..., 1., 1., 1.])

In [64]:
# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.6496969696969697


In [65]:
# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

Confusion Matrix:
[[389 244]
 [334 683]]


In [66]:
# Classification report
class_report = classification_report(y_test, y_pred)
print('Classification Report:')
print(class_report)

Classification Report:
              precision    recall  f1-score   support

         0.0       0.54      0.61      0.57       633
         1.0       0.74      0.67      0.70      1017

    accuracy                           0.65      1650
   macro avg       0.64      0.64      0.64      1650
weighted avg       0.66      0.65      0.65      1650

