# A. clean ml-25m dataset

## 1. get changed and deleted ids

In [1]:
import csv

In [2]:
# Load the imdbId remapping: column 0 is the id to look for, column 1 its
# replacement. The first row of this file is skipped.
changed_ids = {}
file_path = 'changed-imdb-ids.csv'
with open(file_path, 'r', newline='') as changed_file:
    changed_rows = csv.reader(changed_file)
    next(changed_rows)
    changed_ids.update((entry[0], entry[1]) for entry in changed_rows)

# Load the deleted imdbIds. Every row is read here (no row is skipped).
# Each value starts as '' and will be filled by movieId.
deleted_ids = {}
file_path = 'deleted-imdb-ids.csv'
with open(file_path, 'r', newline='') as deleted_file:
    for entry in csv.reader(deleted_file):
        deleted_ids[entry[0]] = ''

## 2. clean links.csv

In [3]:
# Rewrite links.csv: rows whose imdbId was deleted are dropped (their movieId
# is recorded in deleted_ids), rows whose imdbId changed get the new id, and
# all other rows are copied through untouched.
file_path = '../ml-25m/links.csv'
with open(file_path, 'r', newline='') as source_file:
    link_rows = csv.reader(source_file)
    headers = next(link_rows)
    clean_rows = [headers]
    for record in link_rows:
        imdb_id = record[1]
        if imdb_id in deleted_ids:
            # Remember which movieId goes away so its ratings can be dropped.
            deleted_ids[imdb_id] = record[0]
            continue
        if imdb_id in changed_ids:
            record = [record[0], changed_ids[imdb_id], record[2]]
        clean_rows.append(record)

file_path = 'links.csv'
with open(file_path, 'w', newline='') as target_file:
    csv.writer(target_file).writerows(clean_rows)

## 3. clean ratings.csv

In [4]:
# Copy ratings.csv, leaving out every rating whose movieId (column 1) belongs
# to a deleted title.
file_path = '../ml-25m/ratings.csv'
with open(file_path, 'r', newline='') as csv_file:
    csv_reader = csv.reader(csv_file)
    # Materialise the movieIds as a set: membership on a dict values view is a
    # linear scan, which would be repeated for every single ratings row.
    # Deleted ids that never matched a links row still carry '' here.
    deleted_movie_ids = set(deleted_ids.values())
    headers = next(csv_reader)
    clean_rows = [headers]
    clean_rows.extend(
        row for row in csv_reader if row[1] not in deleted_movie_ids
    )

file_path = 'ratings.csv'
with open(file_path, 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerows(clean_rows)

# B. find user-movie interactions

In [5]:
import csv

In [6]:
# Build the movieId -> imdbId lookup from the cleaned links file.
links = {}
file_path = 'links.csv'
with open(file_path, 'r', newline='') as links_file:
    link_rows = csv.reader(links_file)
    next(link_rows)  # skip the header row
    links.update((record[0], record[1]) for record in link_rows)

In [7]:
# Group the ratings by user: userId -> list of "imdbId#weight" strings, in the
# order the rows appear in ratings.csv.
ratings = {}
file_path = 'ratings.csv'
with open(file_path, 'r', newline='') as ratings_file:
    rating_rows = csv.reader(ratings_file)
    next(rating_rows)  # skip the header row
    for record in rating_rows:
        user_id = record[0]
        imdb_id = links.get(record[1])  # None if the movieId is not in links
        weight = record[2]
        # A missing link makes this concatenation raise TypeError.
        rate = imdb_id + "#" + weight
        ratings.setdefault(user_id, []).append(rate)

In [8]:
# One output row per user: the userId plus all of that user's
# "imdbId#weight" entries joined by commas into a single field.
interactions = [['userId', 'interactions(imdbId#weight)']]
interactions.extend(
    [user_id, ",".join(movies)] for user_id, movies in ratings.items()
)

file_path = 'interactions.csv'
with open(file_path, 'w', newline='') as out_file:
    csv.writer(out_file).writerows(interactions)

# C. count user-movie interactions

In [9]:
import csv

In [10]:
# Reload the movieId -> imdbId mapping from the cleaned links file.
links = {}
file_path = 'links.csv'
with open(file_path, 'r', newline='') as links_file:
    link_rows = csv.reader(links_file)
    next(link_rows)  # header row is not data
    for record in link_rows:
        movie_id, imdb_id = record[0], record[1]
        links[movie_id] = imdb_id

In [11]:
# Distinct imdbIds, ordered by their numeric value (the ids stay strings).
imdb_ids = sorted(set(links.values()), key=int)

In [12]:
# Count how many rating rows each imdbId received and build the output table
# [imdbId, interactionCount], one row per entry of imdb_ids (0 when unrated).
#
# A Counter replaces the former sort-and-scan, which never wrote the count of
# the last run of equal ids, searched the whole table once per distinct id,
# and failed on a ratings file that contains only the header row.
from collections import Counter

file_path = 'ratings.csv'
with open(file_path, 'r', newline='') as csv_file:
    csv_reader = csv.reader(csv_file)
    next(csv_reader)  # skip the header row
    # Column 1 is the movieId; a movieId missing from links is tallied under
    # None, which is never looked up below.
    rating_counts = Counter(links.get(row[1]) for row in csv_reader)

interaction_counts = [['imdbId', 'interactionCount']]
# Indexing a Counter with an absent key yields 0 without inserting it.
interaction_counts.extend(
    [imdb_id, rating_counts[imdb_id]] for imdb_id in imdb_ids
)

In [13]:
# Persist the header row and the per-imdbId counts.
file_path = 'interaction-counts.csv'
with open(file_path, 'w', newline='') as counts_file:
    csv.writer(counts_file).writerows(interaction_counts)