In [27]:
import numpy as np
import sklearn
import pandas as pd

In [28]:
# Fix the seed of NumPy's global random state so that every run of the
# notebook generates the same simulated data.
SEED = 42
np.random.seed(SEED)

In [29]:
# Number of synthetic shipment rows to generate.
records = 1_000

In [30]:
# Category pools that the synthetic shipment columns are sampled from.
# NOTE(review): 'Kerela' is presumably a misspelling of 'Kerala'; it is kept
# verbatim because the value ends up in the generated dataset.
cities = [
    'Delhi', 'Bangalore', 'Pune', 'Chennai', 'Kashmir',
    'Punjab', 'Dehradun', 'Gurugram', 'Agra', 'Kerela',
]
modes = ["Truck", "Air", "Sea", "Train"]
weather_conditions = ["Clear", "Bad", "Foggy", "Rain"]
traffic_conditions = ["Low", "Moderate", "High"]
past_performance = ["Good", "Poor"]


In [31]:
# Build the synthetic shipment columns one at a time.  The statements stay in
# the original column order because every draw advances NumPy's global random
# state; reordering them would change the generated values.
data = {}
data["Shipment_ID"] = range(1, records + 1)  # sequential ids 1..records
data["Mode"] = np.random.choice(modes, size=records)
# NOTE(review): Origin and Destination are drawn independently, so a row can
# have the same place for both.
data["Origin"] = np.random.choice(cities, size=records)
data["Destination"] = np.random.choice(cities, size=records)
data["Distance"] = np.random.randint(low=100, high=5001, size=records)  # upper bound is exclusive
# 0 is drawn with probability 0.7 and 1 with probability 0.3.
data["Delay_Status"] = np.random.choice([0, 1], size=records, p=[0.7, 0.3])
# NOTE(review): the range starts at 0, so a row with Delay_Status 1 can still
# receive 0 delay hours.
data["Delay_Hours"] = np.random.randint(low=0, high=13, size=records)
data["Weather_Conditions"] = np.random.choice(weather_conditions, size=records)
data["Traffic_Condition"] = np.random.choice(traffic_conditions, size=records)
data["Past_Performance"] = np.random.choice(past_performance, size=records)

In [32]:
# Turn the column dict into a DataFrame (one column per key); the bare name on
# the last line makes the notebook display it.
simulated_data=pd.DataFrame(data)
simulated_data

Unnamed: 0,Shipment_ID,Mode,Origin,Destination,Distance,Delay_Status,Delay_Hours,Weather_Conditions,Traffic_Condition,Past_Performance
0,1,Sea,Punjab,Dehradun,1954,0,4,Clear,Moderate,Good
1,2,Train,Delhi,Bangalore,2708,0,9,Clear,Moderate,Poor
2,3,Truck,Agra,Punjab,2179,0,3,Clear,Low,Good
3,4,Sea,Delhi,Dehradun,2424,0,5,Rain,High,Good
4,5,Sea,Kashmir,Kerela,4727,0,7,Bad,High,Poor
...,...,...,...,...,...,...,...,...,...,...
995,996,Truck,Agra,Gurugram,799,1,7,Clear,Low,Poor
996,997,Truck,Delhi,Punjab,1480,0,12,Rain,Low,Poor
997,998,Train,Chennai,Kerela,3996,1,2,Clear,Moderate,Good
998,999,Train,Kerela,Pune,1773,0,0,Clear,Low,Poor


In [33]:
# Count how many rows fall into each Past_Performance category.
simulated_data["Past_Performance"].value_counts()

Past_Performance
Good    500
Poor    500
Name: count, dtype: int64

In [34]:
# Rows flagged as not delayed (Delay_Status == 0) must carry zero delay hours.
not_delayed = simulated_data["Delay_Status"].eq(0)
simulated_data.loc[not_delayed, "Delay_Hours"] = 0
# Renumber the row index in place, discarding the old index.
simulated_data.reset_index(inplace=True, drop=True)

In [38]:
# Export the shipment table to CSV without the index column.
# `sim` is a second name bound to the same DataFrame object, not a copy.
sim = simulated_data
sim.to_csv("dataset_sql.csv", index=False, encoding='utf-8')

In [11]:
# Attach four random "noise" columns to practise feature engineering on.
# The draws stay in the original order because each one advances NumPy's
# global random state.
simulated_data['Random_Number'] = np.random.randint(low=1, high=100, size=records)  # upper bound is exclusive
simulated_data['Random_Category'] = np.random.choice(list('ABCD'), size=records)
# One calendar day per record starting at 2025-01-01, shuffled; `.values`
# strips the date index so the assignment is positional.
daily_dates = pd.date_range(start='2025-01-01', periods=records).to_series()
simulated_data['Random_Timestamp'] = daily_dates.sample(frac=1).values
simulated_data['Random_Text'] = np.random.choice(['Lorem', 'Ipsum', 'Dolor', 'Sit'], size=records)


In [12]:
# Preview the first five rows, now including the Random_* columns.
simulated_data.head()

Unnamed: 0,Shipment_ID,Mode,Origin,Destination,Distance,Delay_Status,Delay_Hours,Weather_Conditions,Traffic_Condition,Past_Performance,Random_Number,Random_Category,Random_Timestamp,Random_Text
0,1,Sea,Punjab,Dehradun,1954,0,0,Clear,Moderate,Good,12,A,2025-12-18,Ipsum
1,2,Train,Delhi,Bangalore,2708,0,0,Clear,Moderate,Poor,66,C,2026-03-22,Dolor
2,3,Truck,Agra,Punjab,2179,0,0,Clear,Low,Good,2,A,2025-07-09,Ipsum
3,4,Sea,Delhi,Dehradun,2424,0,0,Rain,High,Good,15,D,2026-11-23,Sit
4,5,Sea,Kashmir,Kerela,4727,0,0,Bad,High,Poor,23,D,2025-02-27,Dolor


In [13]:
# One-hot encode the nominal (unordered) columns Mode and Weather_Conditions.
# drop_first=True omits one indicator per encoded column so the remaining
# indicators are not perfectly collinear.
#
# This cell deliberately binds neither `data` nor `df`: `data` is the column
# dict that `simulated_data` is built from, and rebinding it to something else
# here would make a later re-run of the DataFrame-construction cell silently
# build from the wrong object.
encoded_data = pd.get_dummies(
    simulated_data,
    columns=['Mode', 'Weather_Conditions'],
    drop_first=True,
)
encoded_data.head()


Unnamed: 0,Shipment_ID,Origin,Destination,Distance,Delay_Status,Delay_Hours,Traffic_Condition,Past_Performance,Random_Number,Random_Category,Random_Timestamp,Random_Text,Mode_Sea,Mode_Train,Mode_Truck,Weather_Conditions_Clear,Weather_Conditions_Foggy,Weather_Conditions_Rain
0,1,Punjab,Dehradun,1954,0,0,Moderate,Good,12,A,2025-12-18,Ipsum,True,False,False,True,False,False
1,2,Delhi,Bangalore,2708,0,0,Moderate,Poor,66,C,2026-03-22,Dolor,False,True,False,True,False,False
2,3,Agra,Punjab,2179,0,0,Low,Good,2,A,2025-07-09,Ipsum,False,False,True,True,False,False
3,4,Delhi,Dehradun,2424,0,0,High,Good,15,D,2026-11-23,Sit,True,False,False,False,False,True
4,5,Kashmir,Kerela,4727,0,0,High,Poor,23,D,2025-02-27,Dolor,True,False,False,False,False,False


In [14]:
# The indicator columns are boolean; cast exactly those columns to integers
# and leave every other column untouched.
boolcol = encoded_data.select_dtypes(include='bool').columns
encoded_data = encoded_data.astype(dict.fromkeys(boolcol, int))
encoded_data.head()


Unnamed: 0,Shipment_ID,Origin,Destination,Distance,Delay_Status,Delay_Hours,Traffic_Condition,Past_Performance,Random_Number,Random_Category,Random_Timestamp,Random_Text,Mode_Sea,Mode_Train,Mode_Truck,Weather_Conditions_Clear,Weather_Conditions_Foggy,Weather_Conditions_Rain
0,1,Punjab,Dehradun,1954,0,0,Moderate,Good,12,A,2025-12-18,Ipsum,1,0,0,1,0,0
1,2,Delhi,Bangalore,2708,0,0,Moderate,Poor,66,C,2026-03-22,Dolor,0,1,0,1,0,0
2,3,Agra,Punjab,2179,0,0,Low,Good,2,A,2025-07-09,Ipsum,0,0,1,1,0,0
3,4,Delhi,Dehradun,2424,0,0,High,Good,15,D,2026-11-23,Sit,1,0,0,0,0,1
4,5,Kashmir,Kerela,4727,0,0,High,Poor,23,D,2025-02-27,Dolor,1,0,0,0,0,0


In [15]:
# Show the dtype of every column to confirm the indicator columns are integers.
encoded_data.dtypes

Shipment_ID                          int64
Origin                              object
Destination                         object
Distance                             int32
Delay_Status                         int64
Delay_Hours                          int32
Traffic_Condition                   object
Past_Performance                    object
Random_Number                        int32
Random_Category                     object
Random_Timestamp            datetime64[ns]
Random_Text                         object
Mode_Sea                             int64
Mode_Train                           int64
Mode_Truck                           int64
Weather_Conditions_Clear             int64
Weather_Conditions_Foggy             int64
Weather_Conditions_Rain              int64
dtype: object

In [16]:
# Look for column labels that occur more than once; an empty Index means
# every label is unique.
duplicate_mask = encoded_data.columns.duplicated()
encoded_cols = encoded_data.columns[duplicate_mask]
encoded_cols

Index([], dtype='object')

In [17]:
# Display the encoded frame as a whole.
encoded_data

Unnamed: 0,Shipment_ID,Origin,Destination,Distance,Delay_Status,Delay_Hours,Traffic_Condition,Past_Performance,Random_Number,Random_Category,Random_Timestamp,Random_Text,Mode_Sea,Mode_Train,Mode_Truck,Weather_Conditions_Clear,Weather_Conditions_Foggy,Weather_Conditions_Rain
0,1,Punjab,Dehradun,1954,0,0,Moderate,Good,12,A,2025-12-18,Ipsum,1,0,0,1,0,0
1,2,Delhi,Bangalore,2708,0,0,Moderate,Poor,66,C,2026-03-22,Dolor,0,1,0,1,0,0
2,3,Agra,Punjab,2179,0,0,Low,Good,2,A,2025-07-09,Ipsum,0,0,1,1,0,0
3,4,Delhi,Dehradun,2424,0,0,High,Good,15,D,2026-11-23,Sit,1,0,0,0,0,1
4,5,Kashmir,Kerela,4727,0,0,High,Poor,23,D,2025-02-27,Dolor,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,Agra,Gurugram,799,1,7,Low,Poor,76,D,2026-07-12,Dolor,0,0,1,1,0,0
996,997,Delhi,Punjab,1480,0,0,Low,Poor,1,B,2027-04-09,Lorem,0,0,1,0,0,1
997,998,Chennai,Kerela,3996,1,2,Moderate,Good,72,C,2025-06-30,Lorem,0,1,0,1,0,0
998,999,Kerela,Pune,1773,0,0,Low,Poor,48,D,2027-01-09,Lorem,0,1,0,1,0,0


In [18]:
# Disabled experiment kept as a bare string literal: none of it executes, and
# the notebook simply echoes the string back as the cell output.
# NOTE(review): inside the string, `traffic_mapping` is defined but never
# passed to the encoder.  Per the scikit-learn docs LabelEncoder numbers
# classes in sorted order, so it would presumably not follow
# Low < Moderate < High -- confirm before reviving this code.
'''#label encoding(traffic condition,past performance--ordered category)
from sklearn.preprocessing import LabelEncoder
traffic_encoder=LabelEncoder()
traffic_mapping={
    'Low':0,
    'Moderate':1,
    'High':2
}
encoded_data['Traffic_Condition']=traffic_encoder.fit_transform(encoded_data['Traffic_Condition'])
encoded_data
'''

"#label encoding(traffic condition,past performance--ordered category)\nfrom sklearn.preprocessing import LabelEncoder\ntraffic_encoder=LabelEncoder()\ntraffic_mapping={\n    'Low':0,\n    'Moderate':1,\n    'High':2\n}\nencoded_data['Traffic_Condition']=traffic_encoder.fit_transform(encoded_data['Traffic_Condition'])\nencoded_data\n"

In [19]:
# Ordinal-encode Traffic_Condition with an explicit Low < Moderate < High order.
traffic_mapping = {'Low': 0, 'Moderate': 1, 'High': 2}
# Map from the untouched labels in `simulated_data` (which shares its row
# index with `encoded_data`) instead of from the column being overwritten.
# Re-running this cell then reproduces the same codes rather than mapping
# already-encoded integers to NaN.
encoded_data['Traffic_Condition'] = simulated_data['Traffic_Condition'].map(traffic_mapping)
# .map() yields NaN for any label missing from the mapping; fail loudly here
# instead of carrying NaN forward.
assert encoded_data['Traffic_Condition'].notna().all(), "unmapped Traffic_Condition label"
encoded_data.head()


Unnamed: 0,Shipment_ID,Origin,Destination,Distance,Delay_Status,Delay_Hours,Traffic_Condition,Past_Performance,Random_Number,Random_Category,Random_Timestamp,Random_Text,Mode_Sea,Mode_Train,Mode_Truck,Weather_Conditions_Clear,Weather_Conditions_Foggy,Weather_Conditions_Rain
0,1,Punjab,Dehradun,1954,0,0,1,Good,12,A,2025-12-18,Ipsum,1,0,0,1,0,0
1,2,Delhi,Bangalore,2708,0,0,1,Poor,66,C,2026-03-22,Dolor,0,1,0,1,0,0
2,3,Agra,Punjab,2179,0,0,0,Good,2,A,2025-07-09,Ipsum,0,0,1,1,0,0
3,4,Delhi,Dehradun,2424,0,0,2,Good,15,D,2026-11-23,Sit,1,0,0,0,0,1
4,5,Kashmir,Kerela,4727,0,0,2,Poor,23,D,2025-02-27,Dolor,1,0,0,0,0,0


In [20]:
# Count missing values per column (a failed label mapping would show up here as NaN).
encoded_data.isnull().sum()


Shipment_ID                 0
Origin                      0
Destination                 0
Distance                    0
Delay_Status                0
Delay_Hours                 0
Traffic_Condition           0
Past_Performance            0
Random_Number               0
Random_Category             0
Random_Timestamp            0
Random_Text                 0
Mode_Sea                    0
Mode_Train                  0
Mode_Truck                  0
Weather_Conditions_Clear    0
Weather_Conditions_Foggy    0
Weather_Conditions_Rain     0
dtype: int64

In [21]:
# Binary-encode Past_Performance: Good -> 1, Poor -> 0.
mapp = {'Good': 1, 'Poor': 0}
# Map from the untouched labels in `simulated_data` (which shares its row
# index with `encoded_data`) instead of from the column being overwritten.
# Re-running this cell then reproduces the same codes rather than mapping
# already-encoded integers to NaN.
encoded_data['Past_Performance'] = simulated_data['Past_Performance'].map(mapp)
# .map() yields NaN for any label missing from the mapping; fail loudly here
# instead of carrying NaN forward.
assert encoded_data['Past_Performance'].notna().all(), "unmapped Past_Performance label"
encoded_data.head()

Unnamed: 0,Shipment_ID,Origin,Destination,Distance,Delay_Status,Delay_Hours,Traffic_Condition,Past_Performance,Random_Number,Random_Category,Random_Timestamp,Random_Text,Mode_Sea,Mode_Train,Mode_Truck,Weather_Conditions_Clear,Weather_Conditions_Foggy,Weather_Conditions_Rain
0,1,Punjab,Dehradun,1954,0,0,1,1,12,A,2025-12-18,Ipsum,1,0,0,1,0,0
1,2,Delhi,Bangalore,2708,0,0,1,0,66,C,2026-03-22,Dolor,0,1,0,1,0,0
2,3,Agra,Punjab,2179,0,0,0,1,2,A,2025-07-09,Ipsum,0,0,1,1,0,0
3,4,Delhi,Dehradun,2424,0,0,2,1,15,D,2026-11-23,Sit,1,0,0,0,0,1
4,5,Kashmir,Kerela,4727,0,0,2,0,23,D,2025-02-27,Dolor,1,0,0,0,0,0


In [22]:
# Discard the four Random_* noise columns.  The result is rebound rather than
# dropped in place, and errors='ignore' skips columns that are already gone,
# so re-running this cell no longer raises KeyError.
encoded_data = encoded_data.drop(
    columns=['Random_Number', 'Random_Category', 'Random_Timestamp', 'Random_Text'],
    errors='ignore',
)


In [23]:
# Preview the frame after the Random_* columns have been removed.
encoded_data.head()

Unnamed: 0,Shipment_ID,Origin,Destination,Distance,Delay_Status,Delay_Hours,Traffic_Condition,Past_Performance,Mode_Sea,Mode_Train,Mode_Truck,Weather_Conditions_Clear,Weather_Conditions_Foggy,Weather_Conditions_Rain
0,1,Punjab,Dehradun,1954,0,0,1,1,1,0,0,1,0,0
1,2,Delhi,Bangalore,2708,0,0,1,0,0,1,0,1,0,0
2,3,Agra,Punjab,2179,0,0,0,1,0,0,1,1,0,0
3,4,Delhi,Dehradun,2424,0,0,2,1,1,0,0,0,0,1
4,5,Kashmir,Kerela,4727,0,0,2,0,1,0,0,0,0,0


In [24]:
# Select the numeric columns to export, in this fixed order.
# NOTE(review): Past_Performance is encoded in an earlier cell but is not
# selected here (nor are Shipment_ID, Origin, Destination) -- confirm that
# leaving it out is intended.
feature_columns = [
    'Mode_Sea', 'Mode_Truck', 'Mode_Train',
    'Distance', 'Delay_Status', 'Delay_Hours', 'Traffic_Condition',
    'Weather_Conditions_Clear', 'Weather_Conditions_Foggy', 'Weather_Conditions_Rain',
]
df = encoded_data[feature_columns]
df.head()

Unnamed: 0,Mode_Sea,Mode_Truck,Mode_Train,Distance,Delay_Status,Delay_Hours,Traffic_Condition,Weather_Conditions_Clear,Weather_Conditions_Foggy,Weather_Conditions_Rain
0,1,0,0,1954,0,0,1,1,0,0
1,0,0,1,2708,0,0,1,1,0,0
2,0,1,0,2179,0,0,0,1,0,0
3,1,0,0,2424,0,0,2,0,0,1
4,1,0,0,4727,0,0,2,0,0,0


In [25]:
# Count missing values per selected column.  A single call suffices: the
# earlier df.isna().sum() computed the same counts and its result was discarded.
df.isnull().sum()

Mode_Sea                    0
Mode_Truck                  0
Mode_Train                  0
Distance                    0
Delay_Status                0
Delay_Hours                 0
Traffic_Condition           0
Weather_Conditions_Clear    0
Weather_Conditions_Foggy    0
Weather_Conditions_Rain     0
dtype: int64

In [26]:
# Write the selected columns to the file 'dataset' (name kept as-is, no
# extension).  index=False matches the dataset_sql.csv export and keeps the
# row index from being written as an unnamed extra first column.
df.to_csv('dataset', encoding='utf-8', index=False)