In [None]:
import numpy as np
import pandas as pd

# Seed NumPy's legacy global RNG so that every run regenerates the same dataset.
np.random.seed(42)

# How many synthetic users to generate.
n_samples = 1000

# Draw the raw feature columns. All four draws consume the same global random
# stream, so their order must stay fixed to reproduce the same values.
# Age in whole years, 18..70 (the upper bound of randint is exclusive).
age = np.random.randint(low=18, high=71, size=n_samples)
# Total spend in dollars, uniformly distributed between 10 and 2000.
total_spend = np.random.uniform(low=10, high=2000, size=n_samples)
# Interactions or purchases per month, 1..30.
frequency = np.random.randint(low=1, high=31, size=n_samples)
# Plan tier, each of the three options equally likely.
plan_type = np.random.choice(['Basic', 'Standard', 'Premium'], size=n_samples)

# Target label: a user is a 'Subscriber' only when BOTH conditions hold
# (spend above 500 and frequency above 15); everyone else is a 'Customer'.
user_class = np.where(
    np.logical_and(total_spend > 500, frequency > 15),
    'Subscriber',
    'Customer',
)

# Assemble the columns into one table (keyword order fixes the column order).
data = pd.DataFrame(
    dict(
        age=age,
        total_spend=total_spend,
        frequency=frequency,
        plan_type=plan_type,
        user_class=user_class,
    )
)

# Persist the dataset without the index column so it can be reloaded later.
data.to_csv('realistic_user_classification.csv', index=False)

# Show a quick preview of the generated rows.
print(data.head())


   age  total_spend  frequency plan_type  user_class
0   56  1077.815386         22  Standard  Subscriber
1   69  1848.843101         12     Basic    Customer
2   46   479.871901         26   Premium    Customer
3   32  1522.311264          4   Premium    Customer
4   60  1067.218850          5  Standard    Customer


In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler,LabelEncoder

# Load data
data = pd.read_csv('realistic_user_classification.csv')  # Replace with your dataset

# Encode the string target as integers. LabelEncoder sorts the class names,
# so 'Customer' -> 0 and 'Subscriber' -> 1. The binary metrics further down
# score label 1, i.e. they treat 'Subscriber' as the positive class.
le = LabelEncoder()
data['user_class'] = le.fit_transform(data['user_class'])

# Feature engineering (example)
# Assume 'age', 'total_spend', 'frequency', and 'plan_type' are features.
# Log-transform spend to compress its wide range. np.log1p computes
# log(1 + x) over the whole column at once instead of row by row through a
# Python lambda.
data['log_spend'] = np.log1p(data['total_spend'])

# Define features and target
X = data[['age', 'log_spend', 'frequency', 'plan_type']]  # Example features
y = data['user_class']  # Target: 1 = 'Subscriber', 0 = 'Customer'

# Convert categorical features to numeric (if any).
# drop_first=True drops one dummy level per categorical column so the
# remaining indicator columns are not perfectly collinear.
X = pd.get_dummies(X, drop_first=True)

# Split data: hold out 20% for testing; the fixed random_state keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features (optional for some models).
# The scaler is fitted on the training split only and then applied to the test
# split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Choose a classifier (Logistic Regression, Random Forest, or XGBoost)
# Logistic Regression example
model = LogisticRegression()
# model = RandomForestClassifier(n_estimators=100)  # Uncomment for Random Forest
# model = XGBClassifier()  # Uncomment for XGBoost (also needs `from xgboost import XGBClassifier`, which is not imported here)

# Train the model
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluation on the held-out split. average='binary' reports precision,
# recall and F1 for the positive class only (label 1).
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='binary')
recall = recall_score(y_test, y_pred, average='binary')
f1 = f1_score(y_test, y_pred, average='binary')

# Print the evaluation metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy: 0.9200
Precision: 0.9146
Recall: 0.8929
F1 Score: 0.9036
