In [1]:
import pandas as pd
import numpy as np
import requests
import zipfile
import warnings
import os
warnings.filterwarnings('ignore')

In [13]:
data_url = 'http://files.grouplens.org/datasets/movielens/ml-100k.zip'


def save_zip_file_from_url(data_url, download_path=None, chunk_size=128):
    """Stream ``data_url`` to ``<download_path>/data/movies.zip``.

    Parameters
    ----------
    data_url : str
        URL of the archive to download.
    download_path : str, optional
        Directory containing (or that will contain) the ``data`` folder.
        When omitted, the working directory at *call* time is used.
    chunk_size : int, optional
        Number of bytes requested per streamed chunk.

    Raises
    ------
    requests.HTTPError
        If the server replies with a 4xx/5xx status; nothing is written then.
    """
    # Resolve the default lazily: a default of ``os.getcwd()`` in the signature
    # would be frozen at definition time and ignore later directory changes.
    if download_path is None:
        download_path = os.getcwd()

    target_dir = download_path.replace(os.sep, '/') + '/data'
    os.makedirs(target_dir, exist_ok=True)

    final_path = target_dir + '/movies.zip'
    partial_path = final_path + '.part'

    # The context manager releases the connection even if writing fails.
    with requests.get(data_url, stream=True) as response:
        # Fail loudly instead of saving an HTML error page as the archive.
        response.raise_for_status()
        with open(partial_path, 'wb') as dest_file:
            for chunk in response.iter_content(chunk_size=chunk_size):
                dest_file.write(chunk)

    # Publish the file only once it is complete, so an interrupted download
    # never leaves a truncated movies.zip behind.
    os.replace(partial_path, final_path)

In [20]:
# Fetch the MovieLens archive only when no local copy exists yet.
movies_zip_path = os.getcwd() + '/data/movies.zip'
archive_missing = not os.path.exists(movies_zip_path)
if archive_missing:
    save_zip_file_from_url(data_url)

In [19]:
# Open the archive in a context manager so its file handle is closed even if
# extraction or the read below raises.
with zipfile.ZipFile('./data/movies.zip', "r") as zip_files:
    # Unpack everything into the current working directory.
    zip_files.extractall()
    # u.info holds the dataset summary; it is printed as raw bytes.
    print(zip_files.read('ml-100k/u.info'))

b'943 users\n1682 items\n100000 ratings\n'


In [21]:
# u.user is pipe-delimited; the column names are supplied here, so the first
# line of the file is read as data rather than as a header.
users_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users_data = pd.read_csv(
    'ml-100k/u.user',
    sep='|',
    header=None,
    names=users_cols,
    encoding='latin-1',
)
users_data.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [22]:
# u.data is tab-delimited; the column names are supplied here, so the first
# line of the file is read as data rather than as a header.
ratings_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_data = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    header=None,
    names=ratings_cols,
    encoding='latin-1',
)
ratings_data.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [23]:
# Column layout used for u.item: five descriptive fields followed by one
# indicator column per genre.
movie_info_cols = [
    'movie_id',
    'title',
    'release_date',
    'video_release_date',
    'imdb_url',
]
genre_cols = [
    'genre_unknown', 'Action', 'Adventure', 'Animation',
    'Children', 'Comedy', 'Crime', 'Documentary',
    'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi',
    'Thriller', 'War', 'Western',
]
movies_cols = [*movie_info_cols, *genre_cols]

# u.item is pipe-delimited; names are supplied, so no header row is consumed.
movies_data = pd.read_csv(
    'ml-100k/u.item',
    sep='|',
    header=None,
    names=movies_cols,
    encoding='latin-1',
)
movies_data.head()

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url,genre_unknown,Action,Adventure,Animation,Children,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [39]:
# Shift every id down by one and store it as a string, so the join keys have
# the same type and values in all three frames.
users_data["user_id"] = (users_data["user_id"] - 1).astype(str)
movies_data["movie_id"] = (movies_data["movie_id"] - 1).astype(str)
ratings_data["movie_id"] = (ratings_data["movie_id"] - 1).astype(str)
ratings_data["user_id"] = (ratings_data["user_id"] - 1).astype(str)

# The year is whatever follows the last '-' of the release date. Values are
# passed through str() first, so a missing date produces the text of that
# missing value instead of raising.
movies_data["year"] = [
    str(release).split('-')[-1] for release in movies_data['release_date']
]

# Ratings are stored as floats.
ratings_data["rating"] = ratings_data["rating"].astype(float)

In [37]:
# Rebind rather than mutate in place, and tolerate the column already being
# gone so this cell can be re-run without raising KeyError.
movies_data = movies_data.drop(columns=['video_release_date'], errors='ignore')

In [38]:
# Display the first rows of movies_data as the cell output.
movies_data.head()

Unnamed: 0,movie_id,title,release_date,imdb_url,genre_unknown,Action,Adventure,Animation,Children,Comedy,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,year
0,0,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,1995
1,1,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,1,0,0,1995
2,2,Four Rooms (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1995
3,3,Get Shorty (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1995
4,4,Copycat (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1995


In [42]:
# Join in two steps: first add the movie columns to each rating via movie_id,
# then add the user columns via user_id.
ratings_with_movies = pd.merge(ratings_data, movies_data, on='movie_id')
full_data = pd.merge(ratings_with_movies, users_data, on='user_id')
full_data.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,title,release_date,imdb_url,genre_unknown,Action,Adventure,...,Romance,Sci-Fi,Thriller,War,Western,year,age,sex,occupation,zip_code
0,195,241,3.0,881250949,Kolya (1996),24-Jan-1997,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,0,...,0,0,0,0,0,1997,49,M,writer,55105
1,195,256,2.0,881251577,Men in Black (1997),04-Jul-1997,http://us.imdb.com/M/title-exact?Men+in+Black+...,0,1,1,...,0,1,0,0,0,1997,49,M,writer,55105
2,195,110,4.0,881251793,"Truth About Cats & Dogs, The (1996)",26-Apr-1996,http://us.imdb.com/M/title-exact?Truth%20About...,0,0,0,...,1,0,0,0,0,1996,49,M,writer,55105
3,195,24,4.0,881251955,"Birdcage, The (1996)",08-Mar-1996,"http://us.imdb.com/M/title-exact?Birdcage,%20T...",0,0,0,...,0,0,0,0,0,1996,49,M,writer,55105
4,195,381,4.0,881251843,"Adventures of Priscilla, Queen of the Desert, ...",01-Jan-1994,http://us.imdb.com/M/title-exact?Adventures%20...,0,0,0,...,0,0,0,0,0,1994,49,M,writer,55105


In [43]:
# Write the merged ratings/movies/users frame to CSV; index=False keeps the
# row index out of the file.
full_data.to_csv('./data/full_movielens_data.csv', index=False)

In [44]:
# Write the users frame to CSV; index=False keeps the row index out of the file.
users_data.to_csv('./data/users.csv', index=False)

In [45]:
# Write the movies frame to CSV; index=False keeps the row index out of the file.
movies_data.to_csv('./data/movies.csv', index=False)

In [46]:
# Write the ratings frame to CSV; index=False keeps the row index out of the file.
ratings_data.to_csv('./data/ratings.csv', index=False)