# Importing necessary libraries

In [1]:
import sklearn
print(sklearn.__version__)


1.7.1


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import joblib

# Load Dataset

In [3]:
df = pd.read_csv("digital_wallet.csv")
print("Original shape:", df.shape)
df.head()

Original shape: (7000, 20)


Unnamed: 0,Customer_ID,Age,Location,Income_Level,Total_Transactions,Avg_Transaction_Value,Max_Transaction_Value,Min_Transaction_Value,Total_Spent,Active_Days,Last_Transaction_Days_Ago,Loyalty_Points_Earned,Referral_Count,Cashback_Received,App_Usage_Frequency,Preferred_Payment_Method,Support_Tickets_Raised,Issue_Resolution_Time,Customer_Satisfaction_Score,LTV
0,cust_0000,54,Urban,Low,192,16736.384023,60216.83451,6525.814861,3213386.0,140,209,2114,25,2224.01214,Monthly,Debit Card,3,61.56859,1,327954.6
1,cust_0001,67,Suburban,High,979,14536.734683,48350.100272,2186.742245,14231460.0,229,240,2960,20,4026.823518,Monthly,UPI,17,60.392889,8,1437053.0
2,cust_0002,44,Urban,High,329,7061.3728,32521.157187,2743.406808,2323192.0,73,21,3170,0,1441.011395,Monthly,Debit Card,11,45.305579,4,241938.7
3,cust_0003,30,Rural,High,71,16426.876453,17827.89672,4360.784994,1166308.0,299,285,4756,35,4365.85558,Weekly,Wallet Balance,6,22.030191,1,128459.9
4,cust_0004,58,Urban,Middle,878,10800.09266,17497.634534,4532.87252,9482481.0,236,329,1992,18,4161.523827,Daily,UPI,18,20.634723,5,956951.4


# Data Preprocessing and Feature Engineering

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7000 entries, 0 to 6999
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Customer_ID                  7000 non-null   object 
 1   Age                          7000 non-null   int64  
 2   Location                     7000 non-null   object 
 3   Income_Level                 7000 non-null   object 
 4   Total_Transactions           7000 non-null   int64  
 5   Avg_Transaction_Value        7000 non-null   float64
 6   Max_Transaction_Value        7000 non-null   float64
 7   Min_Transaction_Value        7000 non-null   float64
 8   Total_Spent                  7000 non-null   float64
 9   Active_Days                  7000 non-null   int64  
 10  Last_Transaction_Days_Ago    7000 non-null   int64  
 11  Loyalty_Points_Earned        7000 non-null   int64  
 12  Referral_Count               7000 non-null   int64  
 13  Cashback_Received 

In [5]:
print("\nMissing values:\n", df.isnull().sum())


Missing values:
 Customer_ID                    0
Age                            0
Location                       0
Income_Level                   0
Total_Transactions             0
Avg_Transaction_Value          0
Max_Transaction_Value          0
Min_Transaction_Value          0
Total_Spent                    0
Active_Days                    0
Last_Transaction_Days_Ago      0
Loyalty_Points_Earned          0
Referral_Count                 0
Cashback_Received              0
App_Usage_Frequency            0
Preferred_Payment_Method       0
Support_Tickets_Raised         0
Issue_Resolution_Time          0
Customer_Satisfaction_Score    0
LTV                            0
dtype: int64


In [6]:
# Features Required for training
important_features = [
    'Age', 'Income_Level', 'Total_Transactions', 'Avg_Transaction_Value',
    'Total_Spent', 'Active_Days', 'Last_Transaction_Days_Ago',
    'Customer_Satisfaction_Score', 'Preferred_Payment_Method', 'LTV'
]
df = df[important_features]

In [7]:
# Drop duplicates
df = df.drop_duplicates()

In [8]:
# Encode column 'Income_Level'
income_mapping = {'Low': 0, 'Medium': 1, 'High': 2}
df['Income_Level'] = df['Income_Level'].map(income_mapping)


In [9]:
# Encode 'Preferred_Payment_Method' using one-hot encoding
df = pd.get_dummies(df, columns=['Preferred_Payment_Method'])

In [10]:
# Drop any rows with NaNs
df = df.dropna()

# Selecting Features and Target Variable

In [11]:
X = df.drop("LTV", axis=1)
y = df["LTV"]

In [12]:
# Scaling numeric columns
numeric_cols = ['Age', 'Total_Transactions', 'Avg_Transaction_Value', 'Total_Spent', 'Active_Days', 'Last_Transaction_Days_Ago']
scaler = StandardScaler()
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])


joblib.dump(scaler, "MyScaler")

['MyScaler']

# Train -Test split

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training size: {X_train.shape[0]}")
print(f"Testing size: {X_test.shape[0]}")

Training size: 3687
Testing size: 922


# Model Training 

In [14]:
model = RandomForestRegressor(n_estimators=200,max_depth=10, random_state=42)
model.fit(X_train, y_train)

0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


# Evaluation

In [15]:
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
accuracy = 100 - mape

print("\n Model Performance:")
print(f"R² Score: {r2:.4f}")
print(f"RMSE: {rmse:.2f}")
print(f"MAPE: {mape:.2f}%")
print(f"Approx. Accuracy: {accuracy:.2f}%")


 Model Performance:
R² Score: 0.9999
RMSE: 4116.51
MAPE: 2.43%
Approx. Accuracy: 97.57%


# Model dump

In [16]:
# Save model & features
joblib.dump(model, "MY RanForest Model")
joblib.dump(X.columns.tolist(), "MY FeatureCol")


['MY FeatureCol']