In [8]:
import tensorflow as tf

# Check for GPU availability
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

# Allow memory growth for the GPU
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)

Num GPUs Available:  0


In [6]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from collections import deque
import random
import matplotlib.pyplot as plt
import json
import os

# Load the preprocessed dataset
# Assuming the CSV is in the same directory as the script
preprocessed_data = pd.read_csv(r".\preprocessed_marketing_dataset.csv")

# Define state columns and action columns
state_columns = ['Age', 'Gender', 'Income', 'Location', 'Clicks']
action_columns = ['Ad Type', 'Ad Topic', 'Ad Placement']

# Create mappings for categorical variables
label_encoders = {}
inverse_encoders = {}

for column in preprocessed_data.columns:
    if preprocessed_data[column].dtype == 'object':
        unique_values = preprocessed_data[column].unique()
        label_encoders[column] = {value: idx for idx, value in enumerate(unique_values)}
        inverse_encoders[column] = {idx: value for idx, value in enumerate(unique_values)}

# Apply encodings to the dataset
encoded_data = preprocessed_data.copy()
for column, encoder in label_encoders.items():
    encoded_data[column] = encoded_data[column].map(encoder)

# Calculate feature ranges for normalization
feature_ranges = {}
for column in encoded_data.columns:
    if encoded_data[column].dtype != 'object':
        feature_ranges[column] = (encoded_data[column].min(), encoded_data[column].max())

# Normalize numerical features
for column, (min_val, max_val) in feature_ranges.items():
    if max_val > min_val:  # Avoid division by zero
        encoded_data[column] = (encoded_data[column] - min_val) / (max_val - min_val)

# Define improved MDP environment
class ImprovedMarketingMDP:
    def __init__(self, data):
        self.data = data
        self.current_idx = None
        self.num_users = len(data)
        
        # Get action space sizes
        self.num_ad_types = len(data['Ad Type'].unique())
        self.num_ad_topics = len(data['Ad Topic'].unique())
        self.num_ad_placements = len(data['Ad Placement'].unique())
    
    def reset(self):
        # Start with a random user profile
        self.current_idx = np.random.randint(0, self.num_users)
        current_state = self.data.iloc[self.current_idx][state_columns].values
        return current_state
    
    def step(self, ad_type_idx, ad_topic_idx, ad_placement_idx):
        # Get current user profile
        user_profile = self.data.iloc[self.current_idx]
        
        # Try to find similar examples in the dataset
        similar_examples = self.data[
            (self.data['Gender'] == user_profile['Gender']) & 
            (self.data['Location'] == user_profile['Location']) & 
            (self.data['Ad Type'] == ad_type_idx) & 
            (self.data['Ad Topic'] == ad_topic_idx) & 
            (self.data['Ad Placement'] == ad_placement_idx)
        ]
        
        # If we have similar examples, use their average conversion rate as reward
        if len(similar_examples) > 0:
            reward = similar_examples['Conversion Rate'].mean()
        else:
            # Fallback: find less specific matches
            partial_matches = self.data[
                (self.data['Ad Type'] == ad_type_idx) & 
                (self.data['Ad Topic'] == ad_topic_idx) & 
                (self.data['Ad Placement'] == ad_placement_idx)
            ]
            
            if len(partial_matches) > 0:
                reward = partial_matches['Conversion Rate'].mean()
            else:
                # Very unlikely, but just in case
                reward = self.data['Conversion Rate'].mean() * 0.5  # Penalty for unknown
        
        # Move to a new random user profile
        self.current_idx = np.random.randint(0, self.num_users)
        next_state = self.data.iloc[self.current_idx][state_columns].values
        
        # Assume episodes don't end
        done = False
        
        return next_state, reward, done
    
    def get_action_space_size(self):
        return (self.num_ad_types, self.num_ad_topics, self.num_ad_placements)

# Build multi-headed DQN model
def build_multi_head_model(state_size, num_ad_types, num_ad_topics, num_ad_placements):
    input_layer = Input(shape=(state_size,))
    
    # Shared layers
    x = Dense(128, activation='relu')(input_layer)
    x = Dense(128, activation='relu')(x)
    
    # Separate heads for each action component
    ad_type_head = Dense(64, activation='relu')(x)
    ad_type_output = Dense(num_ad_types, activation='linear', name='ad_type')(ad_type_head)
    
    ad_topic_head = Dense(64, activation='relu')(x)
    ad_topic_output = Dense(num_ad_topics, activation='linear', name='ad_topic')(ad_topic_head)
    
    ad_placement_head = Dense(64, activation='relu')(x)
    ad_placement_output = Dense(num_ad_placements, activation='linear', name='ad_placement')(ad_placement_head)
    
    model = tf.keras.Model(inputs=input_layer, outputs=[ad_type_output, ad_topic_output, ad_placement_output])
    
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
    model.compile(
        optimizer=optimizer,
        loss={
            'ad_type': 'mse',
            'ad_topic': 'mse',
            'ad_placement': 'mse'
        }
    )
    
    return model

# Initialize environment
env = ImprovedMarketingMDP(encoded_data)
action_space_size = env.get_action_space_size()
state_size = len(state_columns)

# Initialize model
model = build_multi_head_model(
    state_size, 
    action_space_size[0], 
    action_space_size[1], 
    action_space_size[2]
)

# Define training parameters
batch_size = 64
gamma = 0.95
epsilon = 1.0
epsilon_min = 0.01
epsilon_decay = 0.995
memory = deque(maxlen=10000)

# Function to store experiences
def remember(state, actions, rewards, next_state, done):
    memory.append((state, actions, rewards, next_state, done))

# Training loop
episodes = 5000
rewards_history = []

for episode in range(episodes):
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    total_reward = 0
    
    for time_step in range(100):  # Max time steps per episode
        # Epsilon-greedy action selection
        if np.random.rand() <= epsilon:
            # Random actions
            ad_type_action = np.random.randint(0, action_space_size[0])
            ad_topic_action = np.random.randint(0, action_space_size[1])
            ad_placement_action = np.random.randint(0, action_space_size[2])
        else:
            # Greedy actions based on Q-values
            q_ad_type, q_ad_topic, q_ad_placement = model.predict(state, verbose=0)
            ad_type_action = np.argmax(q_ad_type[0])
            ad_topic_action = np.argmax(q_ad_topic[0])
            ad_placement_action = np.argmax(q_ad_placement[0])
        
        # Take action and observe result
        next_state, reward, done = env.step(ad_type_action, ad_topic_action, ad_placement_action)
        next_state = np.reshape(next_state, [1, state_size])
        
        # Store experience
        actions = (ad_type_action, ad_topic_action, ad_placement_action)
        rewards = (reward, reward, reward)  # Same reward for all heads
        remember(state, actions, rewards, next_state, done)
        
        # Update state and accumulate reward
        state = next_state
        total_reward += reward
        
        if done:
            break
    
    # Train from experience replay
    if len(memory) >= batch_size:
        minibatch = random.sample(memory, batch_size)
        
        states = np.zeros((batch_size, state_size))
        ad_type_targets = np.zeros((batch_size, action_space_size[0]))
        ad_topic_targets = np.zeros((batch_size, action_space_size[1]))
        ad_placement_targets = np.zeros((batch_size, action_space_size[2]))
        
        # Prepare batch data
        for i, (state_i, actions_i, rewards_i, next_state_i, done_i) in enumerate(minibatch):
            # Get current Q-values
            current_q_ad_type, current_q_ad_topic, current_q_ad_placement = model.predict(state_i, verbose=0)
            
            # Get future Q-values for next state
            future_q_ad_type, future_q_ad_topic, future_q_ad_placement = model.predict(next_state_i, verbose=0)
            
            # Unpack actions and rewards
            ad_type_action, ad_topic_action, ad_placement_action = actions_i
            ad_type_reward, ad_topic_reward, ad_placement_reward = rewards_i
            
            # Update targets using Bellman equation
            ad_type_targets[i] = current_q_ad_type[0]
            ad_topic_targets[i] = current_q_ad_topic[0]
            ad_placement_targets[i] = current_q_ad_placement[0]
            
            if not done_i:
                # Apply reward plus discounted future max Q-value
                ad_type_targets[i, ad_type_action] = ad_type_reward + gamma * np.max(future_q_ad_type)
                ad_topic_targets[i, ad_topic_action] = ad_topic_reward + gamma * np.max(future_q_ad_topic)
                ad_placement_targets[i, ad_placement_action] = ad_placement_reward + gamma * np.max(future_q_ad_placement)
            else:
                # If done, just use the reward
                ad_type_targets[i, ad_type_action] = ad_type_reward
                ad_topic_targets[i, ad_topic_action] = ad_topic_reward
                ad_placement_targets[i, ad_placement_action] = ad_placement_reward
            
            # Store state
            states[i] = state_i[0]
        
        # Train the model on the batch
        model.fit(
            states, 
            [ad_type_targets, ad_topic_targets, ad_placement_targets],
            batch_size=batch_size, 
            epochs=1, 
            verbose=0
        )
    
    # Decay epsilon
    if epsilon > epsilon_min:
        epsilon *= epsilon_decay
    
    rewards_history.append(total_reward)
    
    # Progress reporting
    if episode % 1 == 0:
        avg_reward = np.mean(rewards_history[-100:])
        print(f"Episode: {episode+1}/{episodes}, Avg Reward: {avg_reward:.4f}, Epsilon: {epsilon:.4f}")

# Save the model
model.save("improved_marketing_rl_model.h5")
print("Model saved as improved_marketing_rl_model.h5")

# Save metadata for the app
metadata = {
    "label_encoders": {k: {str(key): value for key, value in v.items()} for k, v in label_encoders.items()},
    "inverse_encoders": {k: {str(key): value for key, value in v.items()} for k, v in inverse_encoders.items()},
    "feature_ranges": {k: [float(v[0]), float(v[1])] for k, v in feature_ranges.items()},
    "state_columns": state_columns,
    "action_space_size": [int(size) for size in action_space_size]
}

with open("model_metadata.json", "w") as f:
    json.dump(metadata, f)
print("Model metadata saved as model_metadata.json")

# Plot training rewards
plt.figure(figsize=(10, 6))
plt.plot(rewards_history)
plt.title("Training Rewards Over Episodes")
plt.xlabel("Episode")
plt.ylabel("Total Reward")
plt.savefig("training_rewards.png")
plt.show()

# Evaluate model performance
def evaluate_model(model, data, num_samples=1000):
    results = []
    
    for _ in range(num_samples):
        # Random user profile
        user_idx = np.random.randint(0, len(data))
        user = data.iloc[user_idx]
        
        # Get state features
        state = user[state_columns].values
        state = np.reshape(state, [1, state_size])
        
        # Get model predictions
        q_ad_type, q_ad_topic, q_ad_placement = model.predict(state, verbose=0)
        
        # Get best actions
        best_ad_type = np.argmax(q_ad_type[0])
        best_ad_topic = np.argmax(q_ad_topic[0])
        best_ad_placement = np.argmax(q_ad_placement[0])
        
        # Find similar examples in dataset
        similar = data[
            (data['Ad Type'] == best_ad_type) & 
            (data['Ad Topic'] == best_ad_topic) & 
            (data['Ad Placement'] == best_ad_placement)
        ]
        
        # Calculate expected reward
        if len(similar) > 0:
            expected_reward = similar['Conversion Rate'].mean()
        else:
            expected_reward = data['Conversion Rate'].mean() * 0.5
        
        # Store result
        results.append(expected_reward)
    
    # Calculate metrics
    avg_reward = np.mean(results)
    max_reward = np.max(results)
    min_reward = np.min(results)
    
    return {
        "average_reward": float(avg_reward),
        "max_reward": float(max_reward),
        "min_reward": float(min_reward),
        "samples": num_samples
    }

# Evaluate the model
evaluation = evaluate_model(model, encoded_data)
print(f"Model Evaluation: Average Reward: {evaluation['average_reward']:.4f}")

# Save evaluation results
with open("evaluation_results.json", "w") as f:
    json.dump(evaluation, f)
print("Evaluation results saved as evaluation_results.json")

Episode: 1/5000, Avg Reward: 14.8724, Epsilon: 0.9950
Episode: 2/5000, Avg Reward: 15.0420, Epsilon: 0.9900
Episode: 3/5000, Avg Reward: 14.9794, Epsilon: 0.9851
Episode: 4/5000, Avg Reward: 15.1975, Epsilon: 0.9801
Episode: 5/5000, Avg Reward: 15.0317, Epsilon: 0.9752
Episode: 6/5000, Avg Reward: 15.0288, Epsilon: 0.9704
Episode: 7/5000, Avg Reward: 15.0453, Epsilon: 0.9655
Episode: 8/5000, Avg Reward: 15.0884, Epsilon: 0.9607
Episode: 9/5000, Avg Reward: 15.1168, Epsilon: 0.9559
Episode: 10/5000, Avg Reward: 15.1174, Epsilon: 0.9511


KeyboardInterrupt: 