# Batch Machine Learning Pipeline

In [20]:
# Batch ML Pipeline includes below :
# 1.Data Loading 2.Preprocessing 3.Model Training 4.Batch Prediction 5.Saving Output

In [None]:
# Step 1: Import Libraries.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib
from datetime import datetime

In [22]:
# Step 2: Simulate loading batch data
# Generating synthetic data to mimic a credit scoring batch

np.random.seed(42)
data = pd.DataFrame({
    'age': np.random.randint(21, 60, 1000),
    'income': np.random.normal(50000, 15000, 1000),
    'credit_score': np.random.randint(300, 850, 1000),
    'payment_delay': np.random.randint(0, 5, 1000),
    'default': np.random.choice([0, 1], 1000, p=[0.85, 0.15])
})
data.head(10)

Unnamed: 0,age,income,credit_score,payment_delay,default
0,59,44963.229511,843,0,0
1,49,75035.322879,823,2,0
2,35,46106.12973,315,3,0
3,28,27452.855703,428,2,1
4,41,46313.854039,408,3,0
5,59,45909.146454,832,1,1
6,39,9546.700356,568,4,0
7,43,49185.577002,461,3,0
8,31,46535.982047,559,0,0
9,31,60443.095472,401,2,0


In [23]:
# Step 3: Split and train model
X = data.drop('default', axis=1)
y = data['default']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train,y_train)
joblib.dump(model, 'batch_model.pkl')
print('Model trained and saved.')

Model trained and saved.


In [24]:
# Step 4: Simulate a new batch for prediction
batch_data = pd.DataFrame({
    'age': np.random.randint(21, 60, 10),
    'income': np.random.normal(50000, 15000, 10),
    'credit_score': np.random.randint(300, 850, 10),
    'payment_delay': np.random.randint(0, 5, 10)
})
batch_data

Unnamed: 0,age,income,credit_score,payment_delay
0,37,42946.346115,406,3
1,50,43564.041846,564,0
2,31,46306.368171,504,1
3,41,58935.533753,378,4
4,41,69889.549557,454,2
5,38,40207.872737,546,1
6,49,55718.663997,830,0
7,44,47961.090652,837,1
8,30,59271.636538,453,2
9,37,72641.865401,453,0


In [25]:
# Step 5: Load model and predict
model = joblib.load('batch_model.pkl')
predictions = model.predict(batch_data)
batch_data['predicted_default'] = predictions
batch_data['processed_time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
batch_data

Unnamed: 0,age,income,credit_score,payment_delay,predicted_default,processed_time
0,37,42946.346115,406,3,0,2025-07-28 19:46:49
1,50,43564.041846,564,0,0,2025-07-28 19:46:49
2,31,46306.368171,504,1,0,2025-07-28 19:46:49
3,41,58935.533753,378,4,0,2025-07-28 19:46:49
4,41,69889.549557,454,2,0,2025-07-28 19:46:49
5,38,40207.872737,546,1,0,2025-07-28 19:46:49
6,49,55718.663997,830,0,0,2025-07-28 19:46:49
7,44,47961.090652,837,1,0,2025-07-28 19:46:49
8,30,59271.636538,453,2,0,2025-07-28 19:46:49
9,37,72641.865401,453,0,0,2025-07-28 19:46:49
