In [78]:
def format_add(format_list, input_data, target_data):
    """Build one record from positional values and append it to target_data.

    Every name in format_list becomes a key of the record. Its value is the
    entry of input_data at the same position, or None when input_data has
    no entry at that position. Entries of input_data beyond the length of
    format_list are ignored.

    Args:
        format_list: Ordered field names used as the record's keys.
        input_data: Sequence of values, matched to format_list by position.
        target_data: List that receives the new record (modified in place).

    Returns:
        None.
    """
    available = len(input_data)

    record = {
        field: (input_data[position] if position < available else None)
        for position, field in enumerate(format_list)
    }

    target_data.append(record)

def build_file(format_list, file_path, delimiter, encoding=None):
    """Read a delimiter-separated text file into a list of dictionaries.

    Each non-blank line is stripped of surrounding whitespace, split on
    `delimiter`, and turned into one record by `format_add`, using
    `format_list` as the record's keys.

    Args:
        format_list: Ordered field names for every record.
        file_path: Path of the file to read.
        delimiter: String that separates the fields on a line.
        encoding: Text encoding passed to `open`. The default (None) keeps
            the platform-default behaviour of the original implementation.

    Returns:
        A list with one dictionary per non-blank line of the file.

    Raises:
        OSError: If the file cannot be opened.
    """
    records = []

    with open(file_path, encoding=encoding) as file:
        print(f"Unpacking: '{file_path}'")

        for line in file:
            stripped_line = line.strip()

            # A blank line (e.g. a trailing newline at the end of the file)
            # would otherwise become a bogus record whose first field is ""
            # and whose remaining fields are None.
            if not stripped_line:
                continue

            format_add(format_list, stripped_line.split(delimiter), records)

    print("Unpacked succesfully")

    return records

In [79]:
# Paths of the three "::"-delimited data files under ./ml-1m.
file_path_users, file_path_movies, file_path_ratings = (
    "./ml-1m/users.dat",
    "./ml-1m/movies.dat",
    "./ml-1m/ratings.dat",
)

# Parse each file into a list of dictionaries keyed by the given field names.
user_data = build_file(
    format_list=["uid", "gender", "age", "occupation", "zip-code"],
    file_path=file_path_users,
    delimiter="::",
)

movie_data = build_file(
    format_list=["mid", "title", "genre"],
    file_path=file_path_movies,
    delimiter="::",
)

rating_data = build_file(
    format_list=["uid", "mid", "rating", "timestamp"],
    file_path=file_path_ratings,
    delimiter="::",
)

Unpacking: './ml-1m/users.dat'
Unpacked succesfully
Unpacking: './ml-1m/movies.dat'
Unpacked succesfully
Unpacking: './ml-1m/ratings.dat'
Unpacked succesfully


In [80]:
from numpy import mean

# Global mean rating: cast the "rating" field of every record in
# rating_data to int, then average the resulting values.
rating_values = [int(record["rating"]) for record in rating_data]
all_rating_mean = mean(rating_values)

In [81]:
from numpy import mean, isnan

def bind_mean(bind_data_source, bind_data_target, target_key, target_mean, default=None):
    """Map every key found in bind_data_target to the mean of its source values.

    For each record in bind_data_target, the value stored under target_key
    becomes a key of the result. Its value is the mean of
    int(record[target_mean]) over all records in bind_data_source that carry
    the same target_key value, or `default` when no such record exists.

    The source is scanned once and its values are grouped per key, instead
    of re-filtering the whole source for every target record. `mean` is only
    called on non-empty groups, so no "mean of empty slice" warning is raised
    for keys without source records.

    Args:
        bind_data_source: Iterable of dicts holding target_key and target_mean.
        bind_data_target: Iterable of dicts holding target_key.
        target_key: Field name used to match source records to target records.
        target_mean: Field name in the source whose int-cast values are averaged.
        default: Value used for keys that have no matching source record.

    Returns:
        A dict from target_key values (in order of first appearance in
        bind_data_target) to their mean, or to `default`.
    """
    # One empty bucket per key of interest; insertion order follows the target.
    grouped = {l_obj[target_key]: [] for l_obj in bind_data_target}

    # Nothing to look up, so the source does not need to be scanned at all.
    if not grouped:
        return {}

    for item in bind_data_source:
        bucket = grouped.get(item[target_key])
        # Source records whose key is not in the target are skipped without
        # converting their value.
        if bucket is not None:
            bucket.append(int(item[target_mean]))

    return {key: (mean(values) if values else default) for key, values in grouped.items()}

In [82]:
# Per-user mean of the ratings each user has given, keyed by "uid".
# A user with no entry in rating_data gets the global mean rating instead.
user_rating_mean = bind_mean(
    bind_data_source=rating_data,
    bind_data_target=user_data,
    target_key="uid",
    target_mean="rating",
    default=all_rating_mean,
)

# Per-movie mean of the ratings each movie has received, keyed by "mid".
# A movie with no entry in rating_data gets the global mean rating instead.
movie_rating_mean = bind_mean(
    bind_data_source=rating_data,
    bind_data_target=movie_data,
    target_key="mid",
    target_mean="rating",
    default=all_rating_mean,
)