In [1]:
import numpy as np
import pandas as pd

# Seed NumPy's legacy global generator so every run draws the same sample
np.random.seed(123)

# Size of the synthetic customer base
n_customers = 1000

# Build the synthetic customer table and derive the label in a single chain.
# The four random columns are drawn in the order they are listed below, which
# is what keeps the generated values stable for a given seed.
data = pd.DataFrame({
    'customer_id': np.arange(n_customers),
    'average_mileage_per_rental': np.random.normal(loc=100, scale=10, size=n_customers),
    'total_maintenance_cost': np.random.normal(loc=500, scale=100, size=n_customers),
    'average_rental_duration': np.random.normal(loc=14, scale=2, size=n_customers),
    'late_return': np.random.choice([0, 1], size=n_customers, p=[0.7, 0.3]),
}).assign(
    # at_risk is 1 only when the customer is flagged as a late returner AND
    # their maintenance cost is strictly above the median over all customers
    at_risk=lambda frame: (
        frame['late_return'].eq(1)
        & frame['total_maintenance_cost'].gt(frame['total_maintenance_cost'].median())
    ).astype(int)
)

data.head()

Unnamed: 0,customer_id,average_mileage_per_rental,total_maintenance_cost,average_rental_duration,late_return,at_risk
0,0,89.143694,425.117253,10.451553,1,0
1,1,109.973454,556.759473,11.597245,0,0
2,2,102.829785,571.815054,16.192514,0,0
3,3,84.937053,400.061925,15.722074,1,0
4,4,94.213997,547.489832,10.959266,1,1


This code creates a DataFrame with 1000 rows, each representing a different customer. The `average_mileage_per_rental`, `total_maintenance_cost`, and `average_rental_duration` columns are filled with random numbers drawn from normal distributions. The `late_return` column is filled with random 0s and 1s (1 with probability 0.3), representing whether each customer tends to return cars late. The `at_risk` column is 1 for customers who are flagged as late returners and have above-median maintenance costs, and 0 otherwise.

# Create features
# Each entry maps a new feature column to the per-rental source column and the
# aggregation applied within each customer's rentals. `transform` broadcasts
# the per-customer result back onto every row of that customer.
per_customer_features = {
    'average_mileage_per_rental': ('mileage', 'mean'),
    'total_maintenance_cost': ('maintenance_cost', 'sum'),
    'average_rental_duration': ('rental_duration', 'mean'),
}
for feature_name, (source_column, aggregation) in per_customer_features.items():
    df[feature_name] = df.groupby('customer_id')[source_column].transform(aggregation)

# Row-level flag: True when the return date is later than the due date
df['late_return'] = df['return_date'] > df['due_date']

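Alternative option: with real rental records, the same features could instead be derived from a per-rental DataFrame, as sketched in the snippet above. Note that `df` is not defined in this notebook, so the snippet is illustrative only.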

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Predictor columns fed to the classifier; `at_risk` is the target
feature_cols = [
    'average_mileage_per_rental',
    'total_maintenance_cost',
    'average_rental_duration',
    'late_return',
]
X, y = data[feature_cols], data['at_risk']

# Hold out 30% of the customers for evaluation; the fixed random_state keeps
# the split identical between runs
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123
)

# Fit a default logistic regression and report per-class metrics on the hold-out set
model = LogisticRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       261
           1       0.89      1.00      0.94        39

    accuracy                           0.98       300
   macro avg       0.94      0.99      0.97       300
weighted avg       0.99      0.98      0.98       300




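These are excellent results! Your model has an accuracy of 0.98, which means it correctly classified 98% of the cases in your test set. Keep in mind, though, that `at_risk` was constructed directly from `late_return` and `total_maintenance_cost`, both of which are features of the model, so near-perfect scores are expected on this synthetic data and do not indicate how the model would perform on real customers.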

The precision for class 0 (not at-risk) is 1.00, meaning when your model predicts a customer is not at-risk, it’s correct 100% of the time. For class 1 (at-risk), the precision is 0.89, so when your model predicts a customer is at-risk, it’s correct 89% of the time.

The recall for class 0 is 0.98, meaning your model correctly identified 98% of the not at-risk customers in the test set. For class 1, the recall is 1.00, so it correctly identified all the at-risk customers in the test set.

The F1-score is a balanced measure of precision and recall. An F1-score reaches its best value at 1 (perfect precision and recall) and worst at 0. Your model has an F1-score of 0.99 for class 0 and 0.94 for class 1, which are very good scores.

The support is the number of samples of the true response that lie in that class. So, there were 261 customers not at-risk and 39 at-risk in your test set.

In [18]:
# Score a single customer with the fitted model.
# The last customer is located via `n_customers` instead of a hard-coded 1000,
# so the lookup still matches a row if the dataset size is changed; with the
# literal, a different size would select zero rows and `predict` would fail.
# Note: this row comes from the same `data` that was split into the train and
# test sets, so it is not genuinely unseen data.
X_new = data.loc[
    data['customer_id'] == n_customers - 1,
    [
        'average_mileage_per_rental',
        'total_maintenance_cost',
        'average_rental_duration',
        'late_return',
    ],
]
y_newpred = model.predict(X_new)
print(y_newpred)

[0]
