Creating the datasets: one for model training and another for data streaming

In [3]:
import pandas as pd
import numpy as np
import datetime

# Simulate sample data
# Seed NumPy's global generator once so the draws below are repeatable.
# The order of the random calls in this cell determines the generated values,
# so keep it stable when editing.
np.random.seed(42)
num_rows = 2000
start_date = datetime.datetime(2024, 1, 1)

data = {
    'Customer_ID': range(1, num_rows + 1),
    'Age': np.random.randint(18, 70, num_rows),
    'Tenure': np.random.randint(1, 60, num_rows),  # in months
    'Monthly_Usage': np.random.uniform(10, 500, num_rows),
    'Complaints': np.random.randint(0, 5, num_rows),
    'Returns': np.random.randint(0, 3, num_rows),
    'Emails_Opened': np.random.randint(0, 10, num_rows),
    'Daily_Logins': np.random.randint(0, 5, num_rows),
    'Sensor_Triggers': np.random.randint(0, 10, num_rows),
    # One event date per customer, 0-179 days after start_date.
    'Event_Timestamp': [start_date + datetime.timedelta(days=np.random.randint(0, 180)) for _ in range(num_rows)],
}

# Convert to DataFrame
df = pd.DataFrame(data)


def _simulate_churn(row):
    """Return the simulated churn label (1 or 0) for one customer row.

    A customer with more than 3 complaints always churns. Otherwise a
    customer whose Monthly_Usage is below 50 churns when a uniform random
    draw exceeds 0.5. The draw is only made when it is actually needed,
    which keeps the random stream identical to a short-circuiting
    `complaints or (usage and draw)` expression.
    """
    if row['Complaints'] > 3:
        return 1
    if row['Monthly_Usage'] < 50 and np.random.rand() > 0.5:
        return 1
    return 0


# Simulate churn with correlations
df['Churn'] = df.apply(_simulate_churn, axis=1)

# Introduce missing values (5% of the rows, sampled independently per column)
for col in ['Age', 'Monthly_Usage', 'Emails_Opened']:
    # Age and Emails_Opened start out as integer columns. Cast to float first
    # so that writing NaN does not depend on implicit upcasting during
    # assignment, which newer pandas versions warn about / reject.
    df[col] = df[col].astype(float)
    df.loc[df.sample(frac=0.05).index, col] = np.nan

# Save main customer dataset
df.to_csv('customer_data_large.csv', index=False)
print("Sample Data:\n", df.head())




Sample Data:
    Customer_ID   Age  Tenure  Monthly_Usage  Complaints  Returns  \
0            1  56.0      29      69.302250           3        0   
1            2  69.0      44     215.284417           4        2   
2            3  46.0      53     378.077243           1        2   
3            4  32.0      24      44.793756           0        0   
4            5  60.0      58      49.288207           3        1   

   Emails_Opened  Daily_Logins  Sensor_Triggers Event_Timestamp  Churn  
0            9.0             2                9      2024-01-25      0  
1            0.0             1                2      2024-05-18      1  
2            7.0             1                3      2024-01-17      0  
3            3.0             3                0      2024-02-05      0  
4            8.0             2                3      2024-04-05      0  

Sample Event Stream Data:
    Customer_ID Event_Timestamp      Event_Type  Event_Value
0          812      2024-02-27           login     

Building a logistic regression model

In [5]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Read the customer dataset back from the CSV file on disk
csv_path = 'customer_data_large.csv'
df = pd.read_csv(csv_path)

In [7]:
# Sanity check: show the top rows of the frame that was just loaded
head_rows = df.head()
print(head_rows)

   Customer_ID   Age  Tenure  Monthly_Usage  Complaints  Returns  \
0            1  56.0      29      69.302250           3        0   
1            2  69.0      44     215.284417           4        2   
2            3  46.0      53     378.077243           1        2   
3            4  32.0      24      44.793756           0        0   
4            5  60.0      58      49.288207           3        1   

   Emails_Opened  Daily_Logins  Sensor_Triggers Event_Timestamp  Churn  
0            9.0             2                9      25-01-2024      0  
1            0.0             1                2      18-05-2024      1  
2            7.0             1                3      17-01-2024      0  
3            3.0             3                0      05-02-2024      0  
4            8.0             2                3      05-04-2024      0  


In [9]:
# Preprocessing: Handling missing values with mean imputation
imputer = SimpleImputer(strategy='mean')
df[['Age', 'Monthly_Usage', 'Emails_Opened']] = imputer.fit_transform(df[['Age', 'Monthly_Usage', 'Emails_Opened']])

# Feature Selection: Drop non-numeric or irrelevant features (like Customer_ID and Event_Timestamp)
df = df.drop(columns=['Customer_ID', 'Event_Timestamp'])

In [11]:
# Feature Scaling: Standardize numeric features
scaler = StandardScaler()
df[['Age', 'Tenure', 'Monthly_Usage', 'Complaints', 'Returns', 'Emails_Opened', 'Daily_Logins', 'Sensor_Triggers']] = scaler.fit_transform(
    df[['Age', 'Tenure', 'Monthly_Usage', 'Complaints', 'Returns', 'Emails_Opened', 'Daily_Logins', 'Sensor_Triggers']]
)

In [13]:
# Split the data into train and test sets (80% training, 20% testing)
X = df.drop(columns='Churn')  # Features
y = df['Churn']  # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
# Model Training: Logistic Regression
# Only the estimator is created here (with max_iter set to 1000); fitting is
# done in a separate step.
model = LogisticRegression(max_iter=1000)

In [17]:
# Fit the classifier on the training split.
# NOTE(review): scikit-learn's fit() is documented to return the estimator
# itself, so `history` is presumably just another reference to `model`, not a
# training log - confirm nothing downstream expects a history object.
history=model.fit(X_train, y_train)


In [19]:
# Model Evaluation: Predictions and Performance Metrics
# Predict labels for the held-out test features; the metrics themselves are
# computed from y_pred in a later step.
y_pred = model.predict(X_test)

In [21]:
# Compare the test-set predictions with the true labels and print both summaries
report_text = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
print("Classification Report:\n", report_text)
print("Confusion Matrix:\n", matrix)

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.95      0.95       305
           1       0.85      0.85      0.85        95

    accuracy                           0.93       400
   macro avg       0.90      0.90      0.90       400
weighted avg       0.93      0.93      0.93       400

Confusion Matrix:
 [[291  14]
 [ 14  81]]


In [23]:
# Save the model together with the fitted preprocessing objects.
# The classifier was trained on imputed and standardized features, so anything
# that loads it for inference needs the same imputer and scaler to prepare
# new rows.
# joblib.dump does not create missing directories, so make sure the target
# folder exists first.
os.makedirs('models', exist_ok=True)
joblib.dump(imputer, 'models/churn_imputer.pkl')
joblib.dump(scaler, 'models/churn_scaler.pkl')
# Kept as the last expression so the cell still displays the model's path.
joblib.dump(model, 'models/churn_prediction_model.pkl')

['models/churn_prediction_model.pkl']