# Import libraries and data

In [8]:
import pandas as pd

# Get movie and rating info
# Both files live in the same dataset folder; keeping the folder in one place
# avoids path typos (the movies path previously contained a stray '//').
DATA_DIR = 'archive/ml-100k'

# Column names for u.item, in file order: five descriptive fields followed by
# one 0/1 indicator column per genre.
MOVIE_COLUMNS = [
    'movieId', 'title', 'release_date', 'video_release_date', 'IMDb_URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

# u.item is pipe-separated and read as latin-1. header=None is spelled out to
# match the ratings load below; supplying `names` already implies it.
movies = pd.read_csv(
    f'{DATA_DIR}/u.item',
    sep='|',
    header=None,
    names=MOVIE_COLUMNS,
    encoding='latin-1',
)

# u.data is tab-separated with one rating event per row.
ratings = pd.read_csv(
    f'{DATA_DIR}/u.data',
    sep='\t',
    header=None,
    names=['userId', 'movieId', 'rating', 'timestamp'],
)

# Data cleaning

Genres for movies are currently in binary form (i.e., 1 indicates that the movie belongs to that genre, and 0 indicates that it does not). We want to combine these columns into a single string that captures all the genres for each movie. This simplifies feature extraction, because the TF-IDF vectorizer can then treat the combined genres as a single text feature.

In [9]:
# Collapse the 0/1 genre indicator columns into one space-separated string per movie
genre_columns = [
    'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
    'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]


def _genres_as_text(flags):
    """Return the names of the genres whose flag equals 1 in this row, joined by spaces.

    The names keep the order of `genre_columns`; a row with no flag set
    yields an empty string.
    """
    return ' '.join(name for name in genre_columns if flags[name] == 1)


movies['genres'] = movies[genre_columns].apply(_genres_as_text, axis=1)

# Text used for content-based filtering: the title, a single space, then the genre string
title_text = movies['title']
genre_text = movies['genres']
movies['combined_info'] = title_text + ' ' + genre_text

# Save cleaned data

In [10]:
# Persist both frames as pickle files in the current working directory
pd.to_pickle(movies, 'movies_df.pkl')
pd.to_pickle(ratings, 'ratings_df.pkl')