<a href="https://colab.research.google.com/github/Kellozr/classification-using-XGboost-on-Financial-data/blob/main/Customer_data_forecasting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import files
import pandas as pd
uploaded = files.upload()
filename = next(iter(uploaded))
df = pd.read_csv(filename)
df.head()


KeyboardInterrupt: 

In [None]:
import numpy as np

# Create sample categories
channels = ['UPI', 'NEFT', 'IMPS', 'RTGS', 'Cheque', 'Cash']
sectors = ['Retail', 'Manufacturing', 'IT', 'Finance', 'Healthcare', 'Agriculture']
purposes = ['Salary', 'Loan Repayment', 'Purchase', 'Bill Payment', 'Transfer', 'Rent']

# Add random transaction channel
df['transaction_channel'] = np.random.choice(channels, size=len(df))

# Random customer sector
df['customer_sector'] = np.random.choice(sectors, size=len(df))

# Purpose of transaction
df['purpose'] = np.random.choice(purposes, size=len(df))

# Create inflow/outflow column
def classify_transaction(row):
    diff = row['newbalanceOrig'] - row['oldbalanceOrg']
    if diff > 0:
        return 'Inflow'
    elif diff < 0:
        return 'Outflow'
    else:
        return 'Neutral'

df['transaction_type'] = df.apply(classify_transaction, axis=1)

# Estimate daily balance if step = day number
df['daily_balance_estimate'] = df['newbalanceOrig']

# Show some samples
df.sample(5)


Add `%load_ext cudf.pandas` before importing pandas to speed up operations using GPU

In [None]:
%load_ext cudf.pandas
import pandas as pd
import numpy as np

# Randomly generated dataset of parking violations-
# Define the number of rows
num_rows = 1000000

states = ["NY", "NJ", "CA", "TX"]
violations = ["Double Parking", "Expired Meter", "No Parking",
              "Fire Hydrant", "Bus Stop"]
vehicle_types = ["SUBN", "SDN"]

# Create a date range
start_date = "2022-01-01"
end_date = "2022-12-31"
dates = pd.date_range(start=start_date, end=end_date, freq='D')

# Generate random data
data = {
    "Registration State": np.random.choice(states, size=num_rows),
    "Violation Description": np.random.choice(violations, size=num_rows),
    "Vehicle Body Type": np.random.choice(vehicle_types, size=num_rows),
    "Issue Date": np.random.choice(dates, size=num_rows),
    "Ticket Number": np.random.randint(1000000000, 9999999999, size=num_rows)
}

# Create a DataFrame
df = pd.DataFrame(data)

# Which parking violation is most commonly committed by vehicles from various U.S states?

(df[["Registration State", "Violation Description"]]  # get only these two columns
 .value_counts()  # get the count of offences per state and per type of offence
 .groupby("Registration State")  # group by state
 .head(1)  # get the first row in each group (the type of offence with the largest count)
 .sort_index()  # sort by state name
 .reset_index()
)

In [None]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Seed for reproducibility
random.seed(42)
np.random.seed(42)

# Config
num_customers = 50
transactions_per_customer = 60  # about 2 transactions per day for 30 days
start_date = datetime(2024, 5, 1)

# Sample values
industries = ['Retail', 'Manufacturing', 'IT', 'Finance', 'Healthcare', 'Agriculture']
inflow_purposes = ['Loan Credit', 'Customer Payment', 'Investment Return']
outflow_purposes = ['Vendor Payment', 'Salary Payment', 'Loan Repayment', 'Utility Bill']
channels = ['Online', 'Branch', 'ATM', 'POS']

data = []

for cust_id in range(1, num_customers + 1):
    customer = f"CUST{1000 + cust_id}"
    industry = random.choice(industries)
    base_balance = random.randint(50000, 200000)

    for i in range(transactions_per_customer):
        txn_date = start_date + timedelta(days=i // 2)
        txn_type = random.choice(['Inflow', 'Outflow'])
        channel = random.choice(channels)

        if txn_type == 'Inflow':
            purpose = random.choice(inflow_purposes)
            amount = round(np.random.normal(loc=10000, scale=3000), 2)
        else:
            purpose = random.choice(outflow_purposes)
            amount = round(np.random.normal(loc=8000, scale=2500), 2)

        amount = max(500, amount)  # No zero/negative amounts
        new_balance = base_balance + amount if txn_type == 'Inflow' else base_balance - amount

        data.append({
            'date': txn_date.date(),
            'customer_id': customer,
            'industry': industry,
            'transaction_type': txn_type,
            'purpose': purpose,
            'channel': channel,
            'amount': round(amount, 2),
            'old_balance': round(base_balance, 2),
            'new_balance': round(new_balance, 2)
        })

        # Update balance for next txn
        base_balance = new_balance

# Convert to DataFrame
df = pd.DataFrame(data)

# Show first 10 records
df.head(10)


Unnamed: 0,date,customer_id,industry,transaction_type,purpose,channel,amount,old_balance,new_balance
0,2024-05-01,CUST1001,Agriculture,Inflow,Loan Credit,ATM,11490.14,79184.0,90674.14
1,2024-05-01,CUST1001,Agriculture,Inflow,Investment Return,Branch,9585.21,90674.14,100259.35
2,2024-05-02,CUST1001,Agriculture,Inflow,Investment Return,Online,11943.07,100259.35,112202.42
3,2024-05-02,CUST1001,Agriculture,Outflow,Vendor Payment,Online,11807.57,112202.42,100394.85
4,2024-05-03,CUST1001,Agriculture,Inflow,Loan Credit,Branch,9297.54,100394.85,109692.39
5,2024-05-03,CUST1001,Agriculture,Inflow,Investment Return,Branch,9297.59,109692.39,118989.98
6,2024-05-04,CUST1001,Agriculture,Outflow,Utility Bill,Branch,11948.03,118989.98,107041.95
7,2024-05-04,CUST1001,Agriculture,Outflow,Salary Payment,Online,9918.59,107041.95,97123.36
8,2024-05-05,CUST1001,Agriculture,Outflow,Loan Repayment,ATM,6826.31,97123.36,90297.05
9,2024-05-05,CUST1001,Agriculture,Inflow,Customer Payment,Branch,11627.68,90297.05,101924.73


In [None]:
import pandas as pd
import numpy as np
import random

# Set random seed for reproducibility
random.seed(42)
np.random.seed(42)

# Configuration
num_customers = 1000

industries = ['Retail', 'Manufacturing', 'IT', 'Finance', 'Healthcare', 'Agriculture']

data = []

for i in range(1, num_customers + 1):
    customer_id = f"CUST{1000 + i}"
    industry = random.choice(industries)

    num_inflows = random.randint(5, 15)
    num_outflows = random.randint(5, 15)

    inflows = np.random.normal(loc=10000, scale=3000, size=num_inflows)
    outflows = np.random.normal(loc=8000, scale=2500, size=num_outflows)

    inflows = [max(500, round(val, 2)) for val in inflows]
    outflows = [max(500, round(val, 2)) for val in outflows]

    total_inflow = round(sum(inflows), 2)
    total_outflow = round(sum(outflows), 2)
    avg_inflow = round(np.mean(inflows), 2)
    avg_outflow = round(np.mean(outflows), 2)

    old_balances = np.random.uniform(50000, 200000, num_inflows + num_outflows)
    new_balances = old_balances + np.concatenate([inflows, -np.array(outflows)])

    avg_old_balance = round(np.mean(old_balances), 2)
    avg_new_balance = round(np.mean(new_balances), 2)

    data.append({
        "customer_id": customer_id,
        "industry": industry,
        "total_inflow": total_inflow,
        "avg_inflow": avg_inflow,
        "num_inflows": num_inflows,
        "total_outflow": total_outflow,
        "avg_outflow": avg_outflow,
        "num_outflows": num_outflows,
        "avg_old_balance": avg_old_balance,
        "avg_new_balance": avg_new_balance
    })

# Create DataFrame
df_summary = pd.DataFrame(data)

# Save to CSV in Colab filesystem
csv_path = "/content/customer_summary_data.csv"
df_summary.to_csv(csv_path, index=False)

print(f"✅ CSV file saved to: {csv_path}")


✅ CSV file saved to: /content/customer_summary_data.csv


In [None]:
import pandas as pd

# Load data (if saved as CSV)
df = pd.read_csv("/content/customer_summary_data.csv")  # or use pd.read_csv("your_file.csv")

# Show first few rows
print(df.head())

# Data overview
print("\n🔹 Data Summary:")
print(df.describe())

# Check industry distribution
print("\n🔹 Industry Distribution:")
print(df['industry'].value_counts())

# Correlation matrix
print("\n🔹 Correlations:")
print(df.corr(numeric_only=True))

# Average inflow vs outflow
print("\n🔹 Average Inflow vs Outflow:")
print("Avg Inflow:", df['avg_inflow'].mean())
print("Avg Outflow:", df['avg_outflow'].mean())

# Difference between balances
df["balance_change"] = df["avg_new_balance"] - df["avg_old_balance"]
print("\n🔹 Sample Balance Change:")
print(df[["customer_id", "balance_change"]].head())


  customer_id       industry  total_inflow  avg_inflow  num_inflows  \
0    CUST1001    Agriculture      66182.64    11030.44            6   
1    CUST1002    Agriculture      96292.30    10699.14            9   
2    CUST1003  Manufacturing      68455.99     9779.43            7   
3    CUST1004    Agriculture     121561.19     9350.86           13   
4    CUST1005     Healthcare     116509.33    10591.76           11   

   total_outflow  avg_outflow  num_outflows  avg_old_balance  avg_new_balance  
0       44890.79      8978.16             5        110040.17        111975.79  
1       59660.71      7457.59             8        128097.07        130251.87  
2       49526.21      8254.37             6        134290.24        135746.37  
3       57446.70      9574.45             6        121810.27        125184.72  
4       44672.17      8934.43             5        110644.65        115134.48  

🔹 Data Summary:
        total_inflow    avg_inflow  num_inflows  total_outflow   avg_outflow

In [None]:
df = pd.read_csv('/content/customer_summary_data.csv')
df.head()

Unnamed: 0,customer_id,industry,total_inflow,avg_inflow,num_inflows,total_outflow,avg_outflow,num_outflows,avg_old_balance,avg_new_balance
0,CUST1001,Agriculture,66182.64,11030.44,6,44890.79,8978.16,5,110040.17,111975.79
1,CUST1002,Agriculture,96292.3,10699.14,9,59660.71,7457.59,8,128097.07,130251.87
2,CUST1003,Manufacturing,68455.99,9779.43,7,49526.21,8254.37,6,134290.24,135746.37
3,CUST1004,Agriculture,121561.19,9350.86,13,57446.7,9574.45,6,121810.27,125184.72
4,CUST1005,Healthcare,116509.33,10591.76,11,44672.17,8934.43,5,110644.65,115134.48


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np # Import numpy to use sqrt

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluation
# Calculate MSE
mse = mean_squared_error(y_test, y_pred)
# Calculate RMSE from MSE
rmse = np.sqrt(mse)

print("RMSE:", rmse)
print("R² Score:", r2_score(y_test, y_pred))

RMSE: 27620.19242704841
R² Score: -0.07544461645026446


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np # Import numpy to use sqrt

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluation
# Calculate MSE
mse = mean_squared_error(y_test, y_pred)
# Calculate RMSE from MSE
rmse = np.sqrt(mse)

print("RMSE:", rmse)
print("R² Score:", r2_score(y_test, y_pred))

RMSE: 27620.19242704841
R² Score: -0.07544461645026446


In [14]:
new_customer = pd.DataFrame({
    'industry_encoded': [le.transform(['Healthcare'])[0]],  # or any other industry
    'total_inflow': [97999.5],
    'avg_inflow': [	10888.83],
    'num_inflows': [9]
})

predicted_outflow = model.predict(new_customer)
print("Predicted Total Outflow:", predicted_outflow[0])


Predicted Total Outflow: 73396.712


In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import lightgbm as lgb

# Assuming your data is loaded into df
# X = features dataframe, y = target series

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create dataset for LightGBM
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Parameters (you can tune these)
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'max_depth': -1,
    'verbose': -1,
    'seed': 42
}

# Define early stopping callback
# 'test_data' is the evaluation set, '50' is the number of boosting rounds
early_stopping_callback = lgb.early_stopping(stopping_rounds=50, verbose=10)

# Train model
# Pass the callback to the 'callbacks' parameter
model = lgb.train(params, train_data, valid_sets=[test_data], callbacks=[early_stopping_callback])

# Predict
y_pred = model.predict(X_test, num_iteration=model.best_iteration)

# Evaluate
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse}")
print(f"R² Score: {r2}")

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[15]	valid_0's rmse: 26489.3
RMSE: 26489.26333286533
R² Score: 0.010822085042175966


In [2]:
import pandas as pd
import numpy as np
import random
from google.colab import files  # Only needed in Google Colab

# Reproducibility
np.random.seed(42)
random.seed(42)

# Sample categories
industries = ['Retail', 'Manufacturing', 'IT', 'Finance', 'Healthcare', 'Agriculture']

# Patterned data generation
data = []

# Change the range to generate 10 million data points
for i in range(1, 1000001):
    # Adjust customer ID formatting for larger numbers
    customer_id = f"CUST{i:08d}"
    industry = random.choice(industries)

    # Set base inflow and outflow ranges per industry
    if industry == 'Finance':
        inflow_range = (100000, 200000)
        outflow_range = (80000, 180000)
    elif industry == 'Agriculture':
        inflow_range = (60000, 120000)
        outflow_range = (30000, 80000)
    elif industry == 'IT':
        inflow_range = (80000, 160000)
        outflow_range = (60000, 140000)
    elif industry == 'Retail':
        inflow_range = (70000, 140000)
        outflow_range = (50000, 130000)
    elif industry == 'Manufacturing':
        inflow_range = (90000, 180000)
        outflow_range = (70000, 160000)
    else:  # Healthcare
        inflow_range = (85000, 170000)
        outflow_range = (65000, 150000)

    # Generate inflow and outflow data
    num_inflows = random.randint(5, 15)
    num_outflows = random.randint(5, 15)

    inflows = np.random.normal(loc=np.mean(inflow_range), scale=5000, size=num_inflows)
    outflows = np.random.normal(loc=np.mean(outflow_range), scale=5000, size=num_outflows)

    inflows = np.clip(inflows, 1000, None)
    outflows = np.clip(outflows, 1000, None)

    total_inflow = round(np.sum(inflows), 2)
    avg_inflow = round(np.mean(inflows), 2)
    total_outflow = round(np.sum(outflows), 2)
    avg_outflow = round(np.mean(outflows), 2)

    # Simulate average balances
    avg_old_balance = round(np.random.uniform(80000, 150000), 2)
    avg_new_balance = round(avg_old_balance + (total_inflow - total_outflow) / random.uniform(8, 15), 2)

    data.append({
        'customer_id': customer_id,
        'industry': industry,
        'total_inflow': total_inflow,
        'avg_inflow': avg_inflow,
        'num_inflows': num_inflows,
        'total_outflow': total_outflow,
        'avg_outflow': avg_outflow,
        'num_outflows': num_outflows,
        'avg_old_balance': avg_old_balance,
        'avg_new_balance': avg_new_balance
    })

# Convert to DataFrame
df = pd.DataFrame(data)

# Save as CSV (be aware of the file size for 10 million rows)
df.to_csv('simulated_customer_cashflow_10M.csv', index=False)

# Download file (works in Google Colab)
# files.download('simulated_customer_cashflow_10M.csv')

In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer

df = pd.read_csv("simulated_customer_cashflow.csv")

X = df.drop(columns=['customer_id', 'total_outflow'])  # input features
y = df['total_outflow']  # target

categorical_features = ['industry']
numerical_features = [col for col in X.columns if col not in categorical_features]

preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(drop='first'), categorical_features)
], remainder='passthrough')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


model = make_pipeline(
    preprocessor,
    RandomForestRegressor(n_estimators=100, random_state=42)
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("✅ Regression Model Performance:")
print("RMSE (Root Mean Squared Error):", round(rmse, 2))
print("R² Score:", round(r2, 4))


✅ Regression Model Performance:
RMSE (Root Mean Squared Error): 11185.25
R² Score: 0.9993


In [20]:

new_customer_data = pd.DataFrame({
    'industry': ['Finance'],  # Example industry
    'total_inflow': [2110599.82],
    'avg_inflow': [150757.13],
    'num_inflows': [14],
    'avg_old_balance': [128365.64],
    'avg_new_balance': [197866.88],
    'num_outflows': [9],
    'avg_outflow': [128977.21]

})

predicted_outflow = model.predict(new_customer_data)

print("Predicted Total Outflow for the new customer:", predicted_outflow[0])

Predicted Total Outflow for the new customer: 1157877.590900002


first task apply single classification using **XGboot**

In [3]:
import pandas as pd
df=pd.read_csv('/content/simulated_transaction.csv')

In [5]:
col_to_drop=['customer_id'
             ]
df=df.drop(columns=col_to_drop).rename(
    columns={'industry':'industry_ID','total_inflow':'totalin','avg_inflow':'avgin','num_inflows':'noofin','total_outflow':'totalout','avg_outflow':'avgout','num_outflows':'noout','avg_old_balance':'oldbal','avg_new_balance':'newbal'}) # removed inplace=True for potentially chained operations, though it works here

In [24]:
df['totalout'] = pd.to_numeric(df['totalout'], errors='coerce')

In [25]:
df.head()

Unnamed: 0,industry_ID,totalin,avgin,noofin,totalout,avgout,noout,oldbal,newbal
0,Agriculture,550304.39,91717.4,6,0,56956.32,5,101296.96,121426.27
1,Manufacturing,1056454.32,132056.79,8,0,115277.95,7,91936.69,110903.09
2,Agriculture,1153725.34,88748.1,13,0,52579.68,6,99665.42,168751.08
3,Retail,532654.35,106530.87,5,0,90341.82,6,84449.08,83463.13
4,Healthcare,1795723.35,128265.95,14,0,105172.56,5,108726.8,215180.89


In [26]:
average_outflow = df['totalout'].mean()
print("Average Total Outflow:", average_outflow)


Average Total Outflow: 0.0


In [27]:
df['totalout'] = df['totalout'].apply(lambda x: 1 if x > 497951.06987226 else 0)
df.head(20)

Unnamed: 0,industry_ID,totalin,avgin,noofin,totalout,avgout,noout,oldbal,newbal
0,Agriculture,550304.39,91717.4,6,0,56956.32,5,101296.96,121426.27
1,Manufacturing,1056454.32,132056.79,8,0,115277.95,7,91936.69,110903.09
2,Agriculture,1153725.34,88748.1,13,0,52579.68,6,99665.42,168751.08
3,Retail,532654.35,106530.87,5,0,90341.82,6,84449.08,83463.13
4,Healthcare,1795723.35,128265.95,14,0,105172.56,5,108726.8,215180.89
5,Agriculture,1365952.98,91063.53,15,0,53811.28,13,101061.48,161994.86
6,Finance,2110599.82,150757.13,14,0,128977.21,9,128365.64,197866.88
7,Retail,737145.36,105306.48,7,0,92689.45,11,85889.8,58684.51
8,Manufacturing,1085979.91,135747.49,8,0,115864.24,10,148110.74,139773.56
9,Finance,926073.04,154345.51,6,0,129709.37,10,106956.84,80326.9


In [29]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('/content/simulated_transaction.csv')

# Drop unneeded columns
col_to_drop = ['customer_id']
df = df.drop(columns=col_to_drop)

# Rename columns for clarity
df = df.rename(columns={
    'industry': 'industry_ID',
    'total_inflow': 'totalin',
    'avg_inflow': 'avgin',
    'num_inflows': 'noofin',
    'total_outflow': 'totalout',
    'avg_outflow': 'avgout',
    'num_outflows': 'noout',
    'avg_old_balance': 'oldbal',
    'avg_new_balance': 'newbal'
})

# Ensure 'totalout' is numeric
df['totalout'] = pd.to_numeric(df['totalout'], errors='coerce')

# Remove rows with NaN in 'totalout'
df = df.dropna(subset=['totalout'])

# Calculate the average of totalout
average_outflow = df['totalout'].mean()
print("Average Total Outflow:", average_outflow)

# Create a binary classification column: 1 if totalout > avg, else 0
df['totalout'] = df['totalout'].apply(lambda x: 1 if x > average_outflow else 0)

# Check how many 0s and 1s you have
print("\nLabel Distribution:")
print(df['totalout'].value_counts())

# Preview updated DataFrame
df.head(10)


Average Total Outflow: 995902.1397445295

Label Distribution:
totalout
0    530499
1    469501
Name: count, dtype: int64


Unnamed: 0,industry_ID,totalin,avgin,noofin,totalout,avgout,noout,oldbal,newbal
0,Agriculture,550304.39,91717.4,6,0,56956.32,5,101296.96,121426.27
1,Manufacturing,1056454.32,132056.79,8,0,115277.95,7,91936.69,110903.09
2,Agriculture,1153725.34,88748.1,13,0,52579.68,6,99665.42,168751.08
3,Retail,532654.35,106530.87,5,0,90341.82,6,84449.08,83463.13
4,Healthcare,1795723.35,128265.95,14,0,105172.56,5,108726.8,215180.89
5,Agriculture,1365952.98,91063.53,15,0,53811.28,13,101061.48,161994.86
6,Finance,2110599.82,150757.13,14,1,128977.21,9,128365.64,197866.88
7,Retail,737145.36,105306.48,7,1,92689.45,11,85889.8,58684.51
8,Manufacturing,1085979.91,135747.49,8,1,115864.24,10,148110.74,139773.56
9,Finance,926073.04,154345.51,6,1,129709.37,10,106956.84,80326.9


In [32]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

x=df.drop(columns='totalout')
y=df['totalout']
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [38]:
!pip install category_encoders

from sklearn.pipeline import Pipeline
from category_encoders.target_encoder import TargetEncoder
from xgboost import XGBClassifier
import pandas as pd # Import pandas as it's used later

# Assuming df, x, y, x_train, x_test, y_train, y_test are already defined from previous cells

estimators=[
    # Apply TargetEncoder only to the categorical column 'industry_ID'
    ('encoders', TargetEncoder(cols=['industry_ID'])),
    ('classifier',XGBClassifier(random_state=42))
]

# Correctly define the pipeline object
model = Pipeline(estimators)

# You had 'pipe' twice, remove the extra line
# pipe

# Now you can train the model
model.fit(x_train, y_train)

# Predict on the test set
y_pred = model.predict(x_test)

# Evaluate the model (you'll need to import metrics)
from sklearn.metrics import accuracy_score, classification_report

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Collecting category_encoders
  Downloading category_encoders-2.8.1-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.8.1-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.8.1
Accuracy: 0.99962
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    106022
           1       1.00      1.00      1.00     93978

    accuracy                           1.00    200000
   macro avg       1.00      1.00      1.00    200000
weighted avg       1.00      1.00      1.00    200000



copy paste hyepr parameter code


hyper paarmeter copy paste

In [41]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# Define the hyperparameter search space
search_space = {
    'classifier__max_depth': Integer(2, 8),
    'classifier__learning_rate': Real(0.01, 0.3, prior='log-uniform'),
    'classifier__n_estimators': Integer(50, 300),
    'classifier__subsample': Real(0.6, 1.0),
    'classifier__colsample_bytree': Real(0.6, 1.0)
}

# Create the BayesSearchCV object
opt = BayesSearchCV(
    estimator=model,                 # your pipeline with XGBClassifier
    search_spaces=search_space,
    cv=3,                            # 3-fold cross-validation
    n_iter=10,                       # try 10 combinations
    scoring='roc_auc',              # scoring method
    random_state=0
)

# Fit to training data
opt.fit(x_train, y_train)


KeyboardInterrupt: 

Hyper parameter

In [39]:
# Install required libraries
!pip install category_encoders scikit-optimize

# Imports
import pandas as pd
from sklearn.pipeline import Pipeline
from category_encoders.target_encoder import TargetEncoder
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# Assuming df is already loaded and preprocessed with your binary target column `totalout`
# Example: df['totalout'] = df['totalout'].apply(lambda x: 1 if x > avg else 0)

# Splitting features and target
x = df.drop(columns=['totalout'])
y = df['totalout']

# Train-test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Define pipeline
pipeline = Pipeline([
    ('encoders', TargetEncoder(cols=['industry_ID'])),
    ('classifier', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'))
])

# Define search space for XGBoost parameters
search_space = {
    'classifier__max_depth': Integer(3, 10),
    'classifier__learning_rate': Real(0.01, 0.3, prior='log-uniform'),
    'classifier__n_estimators': Integer(50, 300),
    'classifier__subsample': Real(0.6, 1.0),
    'classifier__colsample_bytree': Real(0.6, 1.0)
}

# Setup BayesSearchCV
opt = BayesSearchCV(
    estimator=pipeline,
    search_spaces=search_space,
    n_iter=20,  # Try 20 different combinations
    cv=3,
    scoring='accuracy',  # Use 'roc_auc' if class imbalance
    random_state=42,
    n_jobs=-1
)

# Fit the model with hyperparameter tuning
opt.fit(x_train, y_train)

# Predictions and evaluation
y_pred = opt.predict(x_test)

# Print results
print("Best Parameters:", opt.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Collecting scikit-optimize
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-25.5.0-py3-none-any.whl.metadata (12 kB)
Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.8/107.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyaml-25.5.0-py3-none-any.whl (26 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-25.5.0 scikit-optimize-0.10.2


Parameters: { "use_label_encoder" } are not used.



Best Parameters: OrderedDict([('classifier__colsample_bytree', 0.6857240397023424), ('classifier__learning_rate', 0.03458800822411843), ('classifier__max_depth', 4), ('classifier__n_estimators', 293), ('classifier__subsample', 0.89627933856625)])
Accuracy: 0.99971
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    106022
           1       1.00      1.00      1.00     93978

    accuracy                           1.00    200000
   macro avg       1.00      1.00      1.00    200000
weighted avg       1.00      1.00      1.00    200000



In [49]:
# Cross-validation best score
print("Best CV Score (Accuracy):", opt.best_score_) # Changed from opt.best_score
y_pred = opt.predict(x_test)  # Predicted labels (0 or 1)
y_proba = opt.predict_proba(x_test)[:, 1]  # Probability for class 1 (i.e., "good" customer)
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

AttributeError: 'BayesSearchCV' object has no attribute 'best_score_'

In [53]:
best_model = opt.best_estimator_

# Example prediction on new data (make sure it has the same columns as x_train)
# new_customer = pd.DataFrame({...})
# y_new_pred = best_model.predict(new_customer)


In [55]:
import joblib
joblib.dump(best_model, 'best_model.pkl')
# Change 'xgboost_pipeline_model.pk1' to 'best_model.pkl'
model = joblib.load('best_model.pkl')

In [59]:
df.head(10)

Unnamed: 0,industry_ID,totalin,avgin,noofin,totalout,avgout,noout,oldbal,newbal
0,Agriculture,550304.39,91717.4,6,0,56956.32,5,101296.96,121426.27
1,Manufacturing,1056454.32,132056.79,8,0,115277.95,7,91936.69,110903.09
2,Agriculture,1153725.34,88748.1,13,0,52579.68,6,99665.42,168751.08
3,Retail,532654.35,106530.87,5,0,90341.82,6,84449.08,83463.13
4,Healthcare,1795723.35,128265.95,14,0,105172.56,5,108726.8,215180.89
5,Agriculture,1365952.98,91063.53,15,0,53811.28,13,101061.48,161994.86
6,Finance,2110599.82,150757.13,14,1,128977.21,9,128365.64,197866.88
7,Retail,737145.36,105306.48,7,1,92689.45,11,85889.8,58684.51
8,Manufacturing,1085979.91,135747.49,8,1,115864.24,10,148110.74,139773.56
9,Finance,926073.04,154345.51,6,1,129709.37,10,106956.84,80326.9


In [60]:
# Example new data
new_data = pd.DataFrame([{
    'industry_ID': 'Finance',
    'totalin': 2110599.82,
    'avgin': 150757.13,
    'noofin': 14,
    'avgout': 128977.21,
    'noout': 9,
    'oldbal': 128365.64,
    'newbal': 197866.88
}])


In [61]:
# Predict class
prediction = model.predict(new_data)

# Predict probability (optional, for classification only)
proba = model.predict_proba(new_data)

print("Predicted Class:", prediction[0])
print("Probability of Classes:", proba[0])


Predicted Class: 1
Probability of Classes: [1.1241436e-04 9.9988759e-01]


In [62]:
import pandas as pd

# Sample batch data (same structure as training features)
data = {
    'industry_ID': ['Healthcare', 'Retail', 'Agriculture', 'Manufacturing', 'IT'],
    'totalin': [150000, 180000, 120000, 250000, 300000],
    'avgin': [12500, 15000, 10000, 20833, 37500],
    'noofin': [12, 12, 12, 12, 8],
    'avgout': [8000, 9500, 7000, 12000, 14000],
    'noout': [10, 9, 8, 11, 10],
    'oldbal': [110000, 98000, 105000, 130000, 125000],
    'newbal': [115000, 100000, 108000, 140000, 130000]
}

# Create a DataFrame
batch_df = pd.DataFrame(data)

# Save it as CSV
batch_df.to_csv('/content/new_customers.csv', index=False)

print("Sample CSV file created as 'new_customers.csv'")


Sample CSV file created as 'new_customers.csv'


In [65]:
import joblib

# Load model
model = joblib.load('/content/best_model.pkl')

# Load new data
new_customers = pd.read_csv('/content/new_customers.csv')

# Predict
predictions = model.predict(new_customers)
print("Predictions:", predictions)


Predictions: [0 0 0 0 0]


In [66]:
accuracy=accuracy_score(y_test,y_pred)
print(accuracy)

0.99968
