In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [16]:
# Step 1: Load Data
# Assuming a CSV file with customer data including features and a target column 'is_high_value'
data = pd.read_csv('seed_data.csv')
print(data.head())

   age  income  purchase_history  is_high_value
0   56   54674               105              1
1   69   55854               374              0
2   46   66271               427              0
3   32   93688               304              0
4   60   58518               359              1


In [17]:
# Step 2: Preprocess Data
# Separate features and target variable
X = data.drop(columns=['is_high_value'])
y = data['is_high_value']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [18]:
# Step 3: Hyperparameter Tuning with Grid Search
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10]
}

grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best parameters from Grid Search
best_params = grid_search.best_params_
print(f'Best parameters: {best_params}')

# Train the model with the best parameters
best_model = grid_search.best_estimator_

Best parameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10}


In [19]:
# Step 4: Evaluate Model
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)

Accuracy: 0.6666666666666666
Confusion Matrix:
[[188  19]
 [ 81  12]]


In [20]:
# Step 5: Score and Rank Customers
# Assuming new_customer_data is a DataFrame with the same features as X
new_customer_data = pd.read_csv('pool_data.csv')

# Predict probabilities for the new customers
look_alike_probabilities = best_model.predict_proba(new_customer_data)[:, 1]  # Probability of being high-value

# Add scores to the new customer data
new_customer_data['score'] = look_alike_probabilities

# Rank customers based on the scores
new_customer_data = new_customer_data.sort_values(by='score', ascending=False)

# Save ranked customers to a new CSV file
new_customer_data.to_csv('ranked_customers_DT.csv', index=False)

print("Ranked customer predictions saved to 'ranked_customers.csv'")

Ranked customer predictions saved to 'ranked_customers.csv'
