In [None]:
!pip install ale-py
!pip install numpy

In [None]:
import itertools
import numpy as np

from collections import defaultdict

# To Load ROMS

Get a ROM from the link below, extract it, and then run the `ale-import-roms .` command in the folder containing the ROMs (*default directory: the project directory*).

In [None]:
# Supported Games: https://github.com/mgbellemare/Arcade-Learning-Environment/blob/master/docs/games.md
from ale_py.roms import Alien
from ale_py import ALEInterface, SDL_SUPPORT

# Create the emulator interface object used by the cells that follow.
ale = ALEInterface()
# Print ale_py's SDL_SUPPORT flag to check whether the UI can be used.
print(SDL_SUPPORT)

In [None]:
# Emulator settings.
# Fix the emulator's random seed.
ale.setInt("random_seed", 123)
# Explicitly set the repeat-action probability to 0.25
# (per the original note this is already the default; shown as an example).
ale.setFloat("repeat_action_probability", 0.25)

# When SDL support is reported, turn on sound and on-screen rendering.
# NOTE(review): the original remark recommends running the first rounds of
# training without UI/sound, yet both are enabled here - confirm the intent.
if SDL_SUPPORT:
    ale.setBool("sound", True)
    ale.setBool("display_screen", True)

In [None]:
# Load the Alien ROM (imported from ale_py.roms) into the emulator.
ale.loadROM(Alien)

In [None]:
def create_epsilon_greedy_policy(Q, epsilon, num_actions):
    """
    Build an epsilon-greedy policy on top of an action-value table.

    Q maps a state to an array of action values. The returned callable
    takes a state and produces a numpy array of length num_actions with
    the selection probability of every action: each action receives
    epsilon / num_actions, and the greedy action (argmax of Q[state])
    additionally receives 1 - epsilon.
    """
    def policy_function(state):
        # Exploration mass, spread uniformly over all actions.
        probs = np.full(num_actions, epsilon, dtype=float) / num_actions

        # Exploitation mass goes to the currently best-valued action
        # (np.argmax returns the first index on ties).
        greedy = np.argmax(Q[state])
        probs[greedy] += (1.0 - epsilon)
        return probs

    return policy_function


In [None]:
def from_array_to_hash(x):
    """
    Reduce a numpy array to a single integer usable as a dictionary key.

    The array's raw bytes are passed to Python's built-in hash(), so two
    arrays with the same byte content yield the same key within one
    interpreter process. Only the bytes are hashed; shape and dtype are
    not part of the key.

    Parameters
    ----------
    x : numpy.ndarray
        Array to hash.

    Returns
    -------
    int
        Hash of the array's byte representation.
    """
    # ndarray.tostring() was deprecated in NumPy 1.19 and removed in 2.0;
    # tobytes() is its drop-in replacement and yields the same bytes.
    return hash(x.tobytes())

In [None]:
def q_learning(env, num_episodes, discount_factor = 1.0,
               alpha = 0.6, epsilon = 0.1):
    """
    Q-Learning algorithm: off-policy TD control.

    Learns action values for the greedy policy while behaving according to
    an epsilon-greedy policy derived from the current estimates.

    Parameters
    ----------
    env
        Emulator interface. Must provide getLegalActionSet(), reset_game(),
        getScreen(), act(action) (returning the reward) and game_over().
    num_episodes : int
        Number of episodes to play.
    discount_factor : float
        Weight applied to the estimated value of the next state.
    alpha : float
        Step size of the TD update.
    epsilon : float
        Exploration rate handed to the epsilon-greedy policy.

    Returns
    -------
    collections.defaultdict
        Maps a hashed screen (see from_array_to_hash) to a numpy array of
        action values; position i in the array corresponds to the i-th
        entry of env.getLegalActionSet().
    """
    legal_actions = env.getLegalActionSet()
    num_actions = len(legal_actions)

    # Action-value table: state -> array of per-action values.
    # Unseen states start with all-zero values.
    Q = defaultdict(lambda: np.zeros(num_actions))

    # The policy holds a reference to Q, so it always reflects the
    # latest estimates.
    policy = create_epsilon_greedy_policy(Q, epsilon, num_actions)

    for _ in range(num_episodes):
        # Start a fresh episode and observe the initial screen.
        env.reset_game()
        state = from_array_to_hash(env.getScreen())

        while True:
            # Sample an index into legal_actions from the policy.
            action_probabilities = policy(state)
            action = np.random.choice(num_actions, p=action_probabilities)

            # The sampled value is only an index; hand the emulator the
            # corresponding entry of its legal action set.
            reward = env.act(legal_actions[action])
            done = env.game_over()
            next_state = from_array_to_hash(env.getScreen())

            # TD update. A terminal transition has no future return, so
            # do not bootstrap from the next state's estimates.
            future_value = 0.0 if done else np.max(Q[next_state])
            td_target = reward + discount_factor * future_value
            td_delta = td_target - Q[state][action]
            Q[state][action] += alpha * td_delta

            if done:
                break

            state = next_state

    return Q


In [None]:
# Query what the loaded game offers.
avail_modes = ale.getAvailableModes()
avail_diff = ale.getAvailableDifficulties()

print(f"Number of available modes: {len(avail_modes)}")
print(f"Number of available difficulties: {len(avail_diff)}")

# Get the list of legal actions
leg_act = ale.getLegalActionSet()

# One training run per game variant:
# number of variants = no. of modes x no. of difficulty levels.
# product() iterates difficulties fastest, i.e. in nested-loop order.
for mode, diff in itertools.product(avail_modes, avail_diff):
    ale.setDifficulty(diff)
    ale.setMode(mode)
    ale.reset_game()
    print(f"Mode {mode} difficulty {diff}:")
    q_learning(ale, 1000)
		

`18*210*160*5/(100*60*60)` - this is a complexity estimate that I worked out; I will ask Alex about it next time.