In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds

In [None]:
# Report how many GPU devices TensorFlow can see.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

Num GPUs Available:  1


In [None]:
# Load the 'train' split of the MovieLens 1M ratings dataset through TFDS.
ds = tfds.load(
    'movielens/1m-ratings',
    split='train',
    shuffle_files=True,
)

In [None]:
import pandas as pd
# Convert the TFDS dataset into a DataFrame so the following cells can
# preprocess it with pandas-style column operations.
df = tfds.as_dataframe(ds)

In [None]:
# Remove the columns that the state vector built later never reads.
unused_columns = ['user_occupation_text', 'user_zip_code', 'movie_title']
df = df.drop(columns=unused_columns)
df.head()

Unnamed: 0,bucketized_user_age,movie_genres,movie_id,timestamp,user_gender,user_id,user_occupation_label,user_rating
0,35.0,"[1, 3, 14]",b'362',965278460,True,b'4285',11,4.0
1,45.0,[16],b'2762',974584897,True,b'2262',18,4.0
2,25.0,"[1, 18]",b'1262',974744327,True,b'1545',21,3.0
3,45.0,[7],b'982',1028758011,True,b'2247',1,5.0
4,25.0,"[4, 7, 12]",b'1288',974854069,False,b'1758',11,5.0


In [None]:
# Categorize user ratings as 1,0
def classify_user_rating(rating):
    """Binarise a rating: 1 when it is at least 3, 0 when it is below 3.

    A value that satisfies neither comparison (for example NaN) yields None.
    """
    if rating < 3:
        return 0
    if rating >= 3:
        return 1
    return None

# Derive the binary label column from the raw rating, then discard the raw
# rating column it replaces.
df['classified_rating'] = df['user_rating'].apply(classify_user_rating)
df = df.drop(columns=['user_rating'])
df.head()

Unnamed: 0,bucketized_user_age,movie_genres,movie_id,timestamp,user_gender,user_id,user_occupation_label,classified_rating
0,35.0,"[1, 3, 14]",b'362',965278460,True,b'4285',11,1
1,45.0,[16],b'2762',974584897,True,b'2262',18,1
2,25.0,"[1, 18]",b'1262',974744327,True,b'1545',21,1
3,45.0,[7],b'982',1028758011,True,b'2247',1,1
4,25.0,"[4, 7, 12]",b'1288',974854069,False,b'1758',11,1


In [None]:
from sklearn.preprocessing import LabelEncoder
# Encode movie and user ids as contiguous integers.
# Each column gets its own encoder: refitting a single encoder on user_id
# would overwrite the classes learned for movie_id, making it impossible to
# map encoded movie ids back to the original ids later.
movie_id_encoder = LabelEncoder()
user_id_encoder = LabelEncoder()
df['movie_id'] = movie_id_encoder.fit_transform(df['movie_id'])
df['user_id'] = user_id_encoder.fit_transform(df['user_id'])
# Backward compatibility: `le` previously ended up fitted on user_id.
le = user_id_encoder
df.head()

Unnamed: 0,bucketized_user_age,movie_genres,movie_id,timestamp,user_gender,user_id,user_occupation_label,classified_rating
0,35.0,"[1, 3, 14]",2729,965278460,True,3651,11,1
1,45.0,[16],1814,974584897,True,1404,18,1
2,25.0,"[1, 18]",263,974744327,True,607,21,1
3,45.0,[7],3689,1028758011,True,1387,1,1
4,25.0,"[4, 7, 12]",291,974854069,False,843,11,1


In [None]:
# Share of each label: raw label counts divided by the total number of rows.
label_counts = df['classified_rating'].value_counts()
value_counts = label_counts / sum(label_counts)
value_counts

1    0.836303
0    0.163697
Name: classified_rating, dtype: float64

In [None]:
# Count rows per user, then pick out the user with the largest count.
user_movie_counts = df.groupby('user_id').size()
most_watched_user_movie_count = user_movie_counts.max()
most_watched_user_id = user_movie_counts.idxmax()

print(
    f"The user_id with the most watched movies is: {most_watched_user_id} "
    f"with {most_watched_user_movie_count} movies watched."
)

The user_id with the most watched movies is: 3522 with 2314 movies watched.


In [None]:
import numpy as np

class MovieLensEnvironment:
    """Episodic environment that walks through one user's rated movies.

    An episode starts with ``reset()``, which picks a random user and one of
    that user's movies. Every ``step()`` scores the agent's action against the
    stored ``classified_rating`` for the current (user, movie) pair, removes
    the movie from the user's remaining list and moves on to another randomly
    chosen remaining movie. The episode is done when no movie is left.

    All randomness goes through ``np.random`` (``DataFrame.sample`` included),
    so seeding numpy's global generator makes an episode sequence repeatable.

    Args:
        data: DataFrame with the columns ``user_id``, ``movie_id``,
            ``movie_genres`` (iterable of genre indices), ``user_gender``,
            ``bucketized_user_age`` and ``classified_rating``.
        state_size: Length of the all-zero terminal state returned by ``step``.
        actions: Number of discrete actions (stored as ``num_actions``).
        genres: Length of the multi-hot genre vector (stored as ``num_genres``).
        age_list: Age bucket values; their order defines the one-hot layout.
        epsilon: Probability of scoring a random action instead of the
            highest-probability one in ``get_reward``.
    """

    def __init__(self, data, state_size, actions, genres, age_list, epsilon):
        self.data = data
        self.state_size = state_size
        self.num_actions = actions
        self.num_genres = genres
        self.age_list = age_list
        self.epsilon = epsilon

        # Lazily filled caches derived from `data`. They are not refreshed if
        # `self.data` is replaced or modified after first use.
        self._user_movie_pairs = None
        self._label_frequencies = None

        self.current_state = None
        self.current_user = None
        self.av_user_movies = None
        self.current_movie = None
        self.current_action = None

    def reset(self):
        """Start a new episode with a random user and return the first state."""
        random_pair = self.get_user_movie_pairs().sample()
        user_id = random_pair['user_id'].iloc[0]
        # Copy the list: step() removes entries from it, and the grouped frame
        # it comes from is cached and reused by later episodes.
        self.av_user_movies = list(random_pair['movie_id'].iloc[0])
        movie_id = np.random.choice(self.av_user_movies)
        return self.get_state(user_id, movie_id)

    def get_user_movie_pairs(self):
        """Return a frame with one row per user and the list of its movie ids.

        The grouping is computed once and cached, because it scans the whole
        dataset. Callers must not mutate the lists stored in the result.
        """
        if self._user_movie_pairs is None:
            self._user_movie_pairs = (
                self.data.groupby('user_id')['movie_id'].apply(list).reset_index()
            )
        return self._user_movie_pairs

    def get_user_info(self, user_id):
        """Return ``[user_gender, bucketized_user_age]`` from the user's first row."""
        user_rows = self.data.loc[
            self.data['user_id'] == user_id, ['user_gender', 'bucketized_user_age']
        ]
        return user_rows.iloc[0].values.tolist()

    def get_movie_info(self, movie_id):
        """Return the ``movie_genres`` value from the movie's first row."""
        genres = self.data.loc[self.data['movie_id'] == movie_id, 'movie_genres']
        return genres.iloc[0]

    def get_state(self, user_id, movie_id):
        """Build the state vector and remember the current user and movie.

        Layout: ``[gender] + one-hot(age bucket) + multi-hot(genres)``.
        """
        user_gender, bucket_age = self.get_user_info(user_id)
        movie_info = self.get_movie_info(movie_id)

        user_age = [1 if bucket == bucket_age else 0 for bucket in self.age_list]

        movie_features = [0] * self.num_genres
        for genre in movie_info:
            movie_features[genre] = 1

        state = [int(user_gender)] + user_age + movie_features
        self.current_state = state
        self.current_user = user_id
        self.current_movie = movie_id
        return state

    def _label_frequency(self, label):
        """Return the relative frequency of ``label`` in ``classified_rating``."""
        if self._label_frequencies is None:
            self._label_frequencies = (
                self.data['classified_rating'].value_counts(normalize=True).to_dict()
            )
        return self._label_frequencies[label]

    def get_reward(self, user_id, movie_id, action):
        """Score ``action`` against the stored label for (user_id, movie_id).

        Args:
            user_id: User whose rating is looked up.
            movie_id: Movie whose rating is looked up.
            action: Array-like of shape (1, num_actions) holding the action
                probabilities; its first row is stored as ``current_action``.

        Returns:
            ``0.0`` when the scored action equals the label, otherwise minus
            the relative frequency of the scored action in the dataset.

        The epsilon-greedy choice made here only decides which action is
        scored; the chosen index is not returned to the caller.
        """
        pair_mask = (self.data['user_id'] == user_id) & (self.data['movie_id'] == movie_id)
        user_rating = self.data.loc[pair_mask, 'classified_rating'].iloc[0]
        self.current_action = action[0]

        if np.random.random() < self.epsilon:
            # Exploration: score a uniformly random action.
            selected_action = int(np.random.choice(self.num_actions))
        else:
            # Exploitation: score the highest-probability action. Converting
            # to a plain int keeps the comparison and the frequency lookup
            # independent of the tensor library that produced `action`.
            selected_action = int(np.argmax(np.asarray(action)[0]))

        if user_rating == selected_action:
            return 0.0
        return -float(self._label_frequency(selected_action))

    def step(self, action):
        """Score ``action`` for the current movie and advance the episode.

        Returns:
            ``(next_state, reward, done)``. When the user has no movie left,
            ``done`` is True and ``next_state`` is ``state_size`` zeros.
        """
        reward = self.get_reward(self.current_user, self.current_movie, action)
        # The current movie has been used; take it out of the remaining list.
        self.av_user_movies.remove(self.current_movie)

        if not self.av_user_movies:
            return [0] * self.state_size, reward, True

        next_movie = np.random.choice(self.av_user_movies)
        next_state = self.get_state(self.current_user, next_movie)
        return next_state, reward, False

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
import random
from collections import deque

# Replay Buffer class
class ReplayBuffer:
    """Bounded store of (observation, action, reward, next_observation) tuples."""

    def __init__(self, buffer_size):
        self.buffer_size = buffer_size
        # A deque with maxlen drops its oldest entry once the limit is reached.
        self.buffer = deque(maxlen=buffer_size)

    def add(self, observation, action, reward, next_observation):
        """Append one transition to the buffer."""
        self.buffer.append((observation, action, reward, next_observation))

    def sample(self, batch_size):
        """Draw `batch_size` distinct transitions and return them column-wise.

        Returns four tuples: observations, actions, rewards, next observations.
        """
        picked = random.sample(self.buffer, batch_size)
        obs_col, action_col, reward_col, next_obs_col = zip(*picked)
        return obs_col, action_col, reward_col, next_obs_col

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)
    
#actor class to be used return action probabilities in a list size 2
class Actor(tf.keras.Model):
    """Policy network: two ReLU hidden layers and a softmax over the actions.

    `state_size` is accepted for symmetry with `Critic` but is not used when
    the layers are created.
    """

    def __init__(self, state_size, action_size):
        super().__init__()
        self.action_size = action_size
        self.fc1 = layers.Dense(400, activation='relu')
        self.fc2 = layers.Dense(300, activation='relu')
        self.output_layer = layers.Dense(action_size, activation='softmax')

    def call(self, states):
        """Map a batch of states to a batch of action-probability vectors."""
        hidden = self.fc2(self.fc1(states))
        return self.output_layer(hidden)

#class for calculate Q-values
class Critic(tf.keras.Model):
    """Q-network that maps a (states, actions) pair to one value per sample.

    The state and the action are embedded by separate branches, summed, and
    passed through a final ReLU layer and a linear single-unit output.
    """

    def __init__(self, state_size, action_size):
        super().__init__()
        # State branch; its second layer is linear (no ReLU).
        self.s1 = layers.Dense(400, activation='relu')
        self.s2 = layers.Dense(300)
        # Action branch; linear as well (no ReLU).
        self.a1 = layers.Dense(300)
        # Merge of the two branches followed by the head.
        self.c1 = layers.Add()
        self.c2 = layers.Dense(300, activation='relu')
        self.output_layer = layers.Dense(1)  # Q-value output

    def call(self, inputs):
        """Return the Q-values for `inputs`, a `(states, actions)` pair."""
        states, actions = inputs
        state_branch = self.s2(self.s1(states))
        action_branch = self.a1(actions)
        merged = self.c2(self.c1([state_branch, action_branch]))
        return self.output_layer(merged)

In [None]:
class DDPG:
    """Actor-critic agent with target networks and a replay buffer.

    Args:
        state_size: Length of a state vector.
        action_size: Length of the action vector produced by the actor.
        buffer_size: Maximum number of transitions kept in the replay buffer.
        batch_size: Number of transitions sampled per call to ``train``.
        gamma: Discount factor applied to the target critic's value.
        tau: Interpolation factor of the soft target-network update.
    """

    def __init__(self, state_size, action_size, buffer_size, batch_size, gamma, tau):
        self.state_size = state_size
        self.action_size = action_size
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau

        # Replay buffer
        self.buffer = ReplayBuffer(buffer_size)

        # Online and target networks
        self.actor = Actor(state_size, action_size)
        self.target_actor = Actor(state_size, action_size)
        self.critic = Critic(state_size, action_size)
        self.target_critic = Critic(state_size, action_size)

        # Subclassed Keras models create their variables on the first call.
        # Run one dummy forward pass through every network so that the weight
        # copy below really transfers weights; on unbuilt models get_weights()
        # is an empty list and set_weights([]) silently does nothing.
        dummy_state = tf.zeros((1, state_size), dtype=tf.float32)
        dummy_action = tf.zeros((1, action_size), dtype=tf.float32)
        for actor_net in (self.actor, self.target_actor):
            actor_net(dummy_state)
        for critic_net in (self.critic, self.target_critic):
            critic_net([dummy_state, dummy_action])

        # Start the target networks as exact copies of the online networks.
        self.target_actor.set_weights(self.actor.get_weights())
        self.target_critic.set_weights(self.critic.get_weights())

        # Optimizers
        self.actor_optimizer = tf.keras.optimizers.AdamW(learning_rate=1e-4)
        self.critic_optimizer = tf.keras.optimizers.AdamW(learning_rate=5e-4)

    def select_action(self, state):
        """Return the actor's output for one state, with shape (1, action_size)."""
        state = tf.convert_to_tensor([state], dtype=tf.float32)
        return self.actor(state)

    def train(self):
        """Run one critic update, one actor update and a soft target update.

        Samples ``batch_size`` transitions from the replay buffer; the caller
        must make sure the buffer holds at least that many.
        """
        observations, actions, rewards, next_observations = self.buffer.sample(self.batch_size)
        observations = tf.convert_to_tensor(observations, dtype=tf.float32)
        next_observations = tf.convert_to_tensor(next_observations, dtype=tf.float32)
        # Actions are stored as returned by select_action, i.e. with a leading
        # axis of size 1; flatten them to (batch, action_size).
        actions = tf.reshape(
            tf.convert_to_tensor(actions, dtype=tf.float32), (-1, self.action_size)
        )
        # Rewards must be (batch, 1) to line up with the critic output. A flat
        # (batch,) vector would broadcast against (batch, 1) into a
        # (batch, batch) matrix and corrupt the loss.
        rewards = tf.reshape(tf.convert_to_tensor(rewards, dtype=tf.float32), (-1, 1))

        # TD target from the target networks. It is computed outside the tape
        # because it is a constant with respect to the critic update.
        next_actions = self.target_actor(next_observations)
        next_q_values = self.target_critic([next_observations, next_actions])
        target_q_values = rewards + self.gamma * next_q_values

        # Critic update: mean squared TD error.
        with tf.GradientTape() as tape:
            critic_values = self.critic([observations, actions])
            critic_loss = tf.reduce_mean(tf.square(target_q_values - critic_values))

        critic_grads = tape.gradient(critic_loss, self.critic.trainable_variables)
        self.critic_optimizer.apply_gradients(zip(critic_grads, self.critic.trainable_variables))

        # Actor update: maximise the critic's value of the actor's own actions.
        with tf.GradientTape() as tape:
            new_actions = self.actor(observations)
            actor_loss = -tf.reduce_mean(self.critic([observations, new_actions]))

        actor_grads = tape.gradient(actor_loss, self.actor.trainable_variables)
        self.actor_optimizer.apply_gradients(zip(actor_grads, self.actor.trainable_variables))

        # Let the target networks slowly follow the online networks.
        self.update_target_networks()

    def update_target_networks(self):
        """Soft update: target <- tau * online + (1 - tau) * target."""
        self.target_actor.set_weights(
            self._blend(self.actor.get_weights(), self.target_actor.get_weights())
        )
        self.target_critic.set_weights(
            self._blend(self.critic.get_weights(), self.target_critic.get_weights())
        )

    def _blend(self, online_weights, target_weights):
        """Interpolate two weight lists element-wise using ``tau``."""
        return [
            self.tau * online + (1 - self.tau) * target
            for online, target in zip(online_weights, target_weights)
        ]

In [None]:
# --- Feature dimensions derived from the data ---
# Genre ids are used as list indices, so the genre vector needs max id + 1 slots.
num_movie_genres = max(df['movie_genres'].explode().unique().tolist()) + 1
age_list = df['bucketized_user_age'].unique().tolist()
# State layout: one gender flag + one-hot age bucket + multi-hot genres.
state_size = num_movie_genres + len(age_list) + 1

# --- Agent hyper-parameters ---
action_size = 2
buffer_size = 1_000_000
batch_size = 512
soft_tau = 1e-3
gamma = 0.9
epsilon = 0.2

# --- Training schedule ---
num_episodes = 24_000
# Upper bound on steps per episode: the largest number of movies any user has.
num_steps = most_watched_user_movie_count

# --- Containers filled during training ---
user_rewards = {}
reward_per100 = []
reward_per = []

In [None]:
def train_ddpg(env, ddpg, num_episodes, reward_per, reward_per100,
               max_steps=None, rewards_by_user=None, epsilon_decay=0.99987):
    """Train `ddpg` on `env` for `num_episodes` episodes.

    Each episode resets the environment, steps until the environment reports
    `done`, stores every transition in the agent's replay buffer and runs one
    `ddpg.train()` call at the end of the episode once the buffer holds more
    than `ddpg.batch_size` transitions.

    Args:
        env: Environment exposing `reset()`, `step(action)`, `epsilon`,
            `current_user` and `current_action`.
        ddpg: Agent exposing `select_action(state)`, `train()`, `buffer`
            and `batch_size`.
        num_episodes: Number of episodes to run.
        reward_per: List receiving the mean per-step reward of each episode.
            After every 100-episode report the function continues with a
            fresh local list, so the caller's list is not extended further.
        reward_per100: List receiving the average of `reward_per` every
            100 episodes.
        max_steps: Maximum steps per episode. Defaults to the notebook-level
            `num_steps + 1`.
        rewards_by_user: Dict mapping user id to the list of episode rewards.
            Defaults to the notebook-level `user_rewards`.
        epsilon_decay: Factor `env.epsilon` is multiplied by at the start of
            every episode. With the default, 0.2 decays to roughly 0.01 after
            24000 episodes.
    """
    if max_steps is None:
        max_steps = num_steps + 1
    if rewards_by_user is None:
        rewards_by_user = user_rewards

    for episode in range(num_episodes):
        state = env.reset()
        episode_reward = 0
        # Decay exploration in place; a bare `env.epsilon * decay` expression
        # would compute the product and throw it away.
        env.epsilon *= epsilon_decay

        for step in range(max_steps):
            action = ddpg.select_action(state)
            next_state, reward, done = env.step(action)
            ddpg.buffer.add(state, action, reward, next_state)

            state = next_state
            episode_reward += reward

            if not done:
                continue

            # End of the episode: train once enough transitions are stored.
            if len(ddpg.buffer) > ddpg.batch_size:
                ddpg.train()

            reward_per.append(episode_reward / (step + 1))
            if episode % 100 == 0 and episode != 0:
                # Divide by the real window length: the first window covers
                # episodes 0..100, i.e. 101 entries rather than 100.
                reward_per100.append(sum(reward_per) / len(reward_per))
                reward_per = []
                print("Average reward of latest 100 episode: {}".format(reward_per100[-1]))

            print("Last action of the Episode {}, {}:".format(episode, env.current_action))

            # Track the reward history per user to follow individual progress.
            rewards_by_user.setdefault(env.current_user, []).append(episode_reward)
            break

        # .get avoids a KeyError when an episode hit max_steps before finishing.
        print("Episode: {}, User: {}, Rewards so far: {}".format(
            episode, env.current_user, rewards_by_user.get(env.current_user, [])))

In [None]:
# Build the agent and the environment from the configured values, then train.
ddpg = DDPG(
    state_size=state_size,
    action_size=action_size,
    buffer_size=buffer_size,
    batch_size=batch_size,
    gamma=gamma,
    tau=soft_tau,
)
env = MovieLensEnvironment(
    data=df,
    state_size=state_size,
    actions=action_size,
    genres=num_movie_genres,
    age_list=age_list,
    epsilon=epsilon,
)

train_ddpg(env, ddpg, num_episodes, reward_per, reward_per100)

Last action of the Episode 0, [0.5382759  0.46172413]:
Episode: 0, User: 4860, Rewards so far: [-3.7828453853144706]
Last action of the Episode 1, [0.5106057 0.4893943]:
Episode: 1, User: 2471, Rewards so far: [-3.1458775116000757]
Last action of the Episode 2, [0.5033896  0.49661043]:
Episode: 2, User: 2220, Rewards so far: [-4.583510046400302]




[1;30;43mGörüntülenen çıkış son 5000 satıra kısaltıldı.[0m
Last action of the Episode 21512, [0.7455634  0.25443667]:
Episode: 21512, User: 5179, Rewards so far: [-3.7650261095431063, -3.94654217268591, -2.9643614484572725, -4.782845385314469, -1.9643614484572727, -2.309574298971515, -3.1102389600573486, -3.78284538531447]
Last action of the Episode 21513, [0.08265658 0.9173434 ]:
Episode: 21513, User: 3168, Rewards so far: [-49.30837155034623, -48.3261908261176, -25.789491996172764, -30.971008059315565, -12.747206833771733]
Last action of the Episode 21514, [0.4946976  0.50530237]:
Episode: 21514, User: 2610, Rewards so far: [-7.857445793829082, -5.565690770628936, -5.238297195886058, -4.91090362114318, -4.256116471657424, -1.1458775116000754]
Last action of the Episode 21515, [0.7484748  0.25152525]:
Episode: 21515, User: 5045, Rewards so far: [-4.946542172685907, -4.8006646610858335, -3.9643614484572733]
Last action of the Episode 21516, [0.49117333 0.5088267 ]:
Episode: 21516, Us

In [None]:
import matplotlib.pyplot as plt

# Plot the recorded 100-episode reward averages in the order they were stored.
x_values = range(len(reward_per100))

plt.figure(figsize=(10, 6))
plt.title('Average Reward for each 100 Episodes')
plt.xlabel('Index')
plt.ylabel('Reward')
plt.plot(x_values, reward_per100)
plt.grid(True)
plt.show()

In [None]:
# Episode rewards of user 1376, copied from the training log line
# "Episode: 23899, User: 1376, Rewards so far:".
user_list = [
    -62.076720965318756,
    -58.16581734417548,
    -61.93084345371868,
    -24.00332330542915,
    -23.05678113274325,
    -24.330716880172027,
    -28.312897604400657,
    -28.60465262760081,
]

x_values = range(len(user_list))

plt.figure(figsize=(10, 6))
plt.title('User 1376 Reward During Training')
plt.xlabel('Index')
plt.ylabel('Reward')
plt.plot(x_values, user_list)
plt.grid(True)
plt.show()

In [None]:
# Episode rewards of user 1608, copied from the training log line
# "Episode: 23766, User: 1608, Rewards so far:".
user_list = [
    -52.34401010188897,
    -51.88855829131737,
    -53.54334544080312,
    -33.30957429897151,
    -33.419813259028864,
]

x_values = range(len(user_list))

plt.figure(figsize=(10, 6))
plt.title('User 1608 Reward During Training')
plt.xlabel('Index')
plt.ylabel('Reward')
plt.plot(x_values, user_list)
plt.grid(True)
plt.show()

In [None]:
# Episode rewards of user 5676, copied from the training log line
# "Episode: 23676, User: 5676, Rewards so far:".
user_list = [
    -17.351859461372545,
    -15.714891587658155,
    -11.949865478115058,
    -17.679253036115423,
    -10.640291179143546,
    -7.693749006457643,
    -11.295078328629302,
    -3.601329322171667,
]

x_values = range(len(user_list))

plt.figure(figsize=(10, 6))
plt.title('User 5676 Reward During Training')
plt.xlabel('Index')
plt.ylabel('Reward')
plt.plot(x_values, user_list)
plt.grid(True)
plt.show()