In [1]:
# Run this script inside the ml-25m folder

In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
import scipy.sparse

In [3]:
# Notebook settings.
# top_movies: how many of the most-interacted movies are retained by the movie filter.
top_movies = 3500
# path: ratings file loaded with pd.read_csv, relative to the working directory.
path = 'ratings.csv'

In [4]:
# Build the sparse user x movie interaction matrix from the ratings file.
df = pd.read_csv(path)

# Re-encode the raw ids as contiguous 0-based codes so they can serve
# directly as row / column positions of the matrix.
user_le, movie_le = LabelEncoder(), LabelEncoder()
df['userId'] = user_le.fit_transform(df['userId'])
df['movieId'] = movie_le.fit_transform(df['movieId'])

# Every rating row contributes a 1 at (user, movie); the rating value itself is not used.
row = df['userId']
column = df['movieId']
data = np.ones(len(df))
matrix = scipy.sparse.csr_matrix((data, (row, column)))

In [5]:
# Keep only the top_movies most-interacted movies.
# Column sums give the number of interactions recorded for each encoded movie id.
num_movie_inter = matrix.sum(axis=0)
movie_counts = np.asarray(num_movie_inter).squeeze()
# argsort is ascending, so the last top_movies entries are the most-interacted columns.
top_m = movie_counts.argsort()[-top_movies:]
# Column j of the filtered matrix corresponds to encoded movie id top_m[j].
matrix_m_filtered = matrix[:, top_m]

In [6]:
# Keep only users with at least 20 interactions among the retained movies.
# Row sums count each user's interactions over the filtered columns only.
num_user_inter = matrix_m_filtered.sum(axis=-1)
enough_interactions = num_user_inter >= 20
# Element 0 of the np.where result holds the row (user) positions that pass the threshold.
where = np.where(enough_interactions)[0]
# Row i of the result corresponds to encoded user id where[i].
matrix_u_m_filtered = matrix_m_filtered[where, :]

In [7]:
# Write the surviving (user, movie) interactions to ratings.dat.
# The ids written here are row / column positions in the filtered matrix,
# not the original MovieLens ids.
user_id, movie_id = matrix_u_m_filtered.nonzero()
df2 = pd.DataFrame({'userId': user_id, 'movieId': movie_id})
df2.to_csv('ratings.dat', index=False)

In [8]:
# Load the auxiliary tables that also need their ids filtered and rewritten.
df_tags, df_links, df_movies = (
    pd.read_csv(csv_name)
    for csv_name in ('tags.csv', 'links.csv', 'movies.csv')
)

# Filter
def _reindex_ids(df, column, kept_ids):
    """Drop rows whose ``column`` id is not in ``kept_ids`` and renumber the rest.

    The new id of a row is the position of its original id within ``kept_ids``.
    ``kept_ids`` must not contain duplicates. Returns a new DataFrame; ``df``
    is not modified.
    """
    # get_indexer yields the position of each value in kept_ids, or -1 when absent.
    positions = pd.Index(kept_ids).get_indexer(df[column].to_numpy())
    keep = positions >= 0
    out = df.loc[keep].copy()
    out[column] = positions[keep]
    return out


def filter_id(df, kept_user_ids=None, kept_movie_ids=None):
    """Restrict ``df`` to the retained users / movies and rewrite their ids.

    ratings.dat is written from ``matrix_u_m_filtered.nonzero()``, so user ``i``
    there is the encoded user ``where[i]`` and movie ``j`` is the encoded movie
    ``top_m[j]``. This function maps original ids onto those same positions so
    that every output file shares one id space.

    Parameters
    ----------
    df : pandas.DataFrame
        Table that may contain a 'userId' and/or a 'movieId' column holding
        original (un-encoded) ids. Columns that are absent are skipped.
    kept_user_ids : array-like, optional
        Original user ids in new-index order. Defaults to
        ``user_le.inverse_transform(where)`` from the notebook state.
    kept_movie_ids : array-like, optional
        Original movie ids in new-index order. Defaults to
        ``movie_le.inverse_transform(top_m)`` from the notebook state.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` whose ids were retained, with ids replaced by their new
        0-based index. ``df`` itself is returned unchanged when it has neither
        id column.
    """
    if 'userId' in df.columns:
        if kept_user_ids is None:
            # Resolved lazily so frames without a userId column never touch the user state.
            kept_user_ids = user_le.inverse_transform(where)
        df = _reindex_ids(df, 'userId', kept_user_ids)

    if 'movieId' in df.columns:
        if kept_movie_ids is None:
            kept_movie_ids = movie_le.inverse_transform(top_m)
        df = _reindex_ids(df, 'movieId', kept_movie_ids)

    return df

# Filter every auxiliary table and rewrite its ids, in the same order as they were loaded.
df_tags, df_links, df_movies = map(filter_id, (df_tags, df_links, df_movies))

# Write each filtered table next to ratings.dat, without the DataFrame index.
for frame, out_name in (
    (df_tags, 'tags.dat'),
    (df_links, 'links.dat'),
    (df_movies, 'movies.dat'),
):
    frame.to_csv(out_name, index=False)