In [1]:
import os
import pandas as pd


# Change the working directory from the notebook's own folder to the shared
# download folder (../data/downloads), printing the directory before and after.
# The starting directory is remembered the first time this cell runs so that a
# re-run resolves the relative path against the same base; a bare relative
# chdir would be applied to the already-changed directory and fail.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
print(NOTEBOOK_DIR)
os.chdir(os.path.join(NOTEBOOK_DIR, "../data/downloads"))
print(os.getcwd())

/home/cory/PycharmProjects/bachelor_2022/artifact/data_fetch
/home/cory/PycharmProjects/bachelor_2022/artifact/data/downloads


In [2]:
ds_name = 'ml-100k'
!wget https://files.grouplens.org/datasets/movielens/{ds_name}.zip
!unzip {ds_name}.zip
!rm {ds_name}.zip

--2022-07-15 00:51:34--  https://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip’


2022-07-15 00:51:36 (4.86 MB/s) - ‘ml-100k.zip’ saved [4924029/4924029]

Archive:  ml-100k.zip
   creating: ml-100k/
  inflating: ml-100k/allbut.pl       
  inflating: ml-100k/mku.sh          
  inflating: ml-100k/README          
  inflating: ml-100k/u.data          
  inflating: ml-100k/u.genre         
  inflating: ml-100k/u.info          
  inflating: ml-100k/u.item          
  inflating: ml-100k/u.occupation    
  inflating: ml-100k/u.user          
  inflating: ml-100k/u1.base         
  inflating: ml-100k/u1.test         
  inflating: ml-100k/u2.base         
  inflating: ml-100k/u2.test         
  inf

In [3]:
# Instantiate all needed dataframes from download files

# Instantiate DF of user-movie rating (tab-separated, no header row)
data_cols = ['user_id', 'movie_id', 'user_rating', 'timestamp']
data_df = pd.read_csv(f'{ds_name}/u.data', sep='\t', names=data_cols)
print(data_df.sample(1))

# Instantiate DF of user demography ('|'-separated, no header row)
user_cols = ['user_id', 'raw_user_age', 'user_gender', 'user_occupation_text', 'user_zip_code']
user_df = pd.read_csv(f'{ds_name}/u.user', sep='|', names=user_cols)
print(user_df.sample(1))

# Instantiate DF of movie metadata ('|'-separated, no header row).
# Per the ml-100k README, u.item has 24 fields: 5 descriptive fields followed
# by 19 genre flags, so the schema below lists exactly 24 names.
movie_cols = ['movie_id', 'movie_name', 'release_date', 'video_release_date', 'link'] + [f'genre{i}' for i in range(1, 20)]
# keep columns movie_id, movie_name, release_date: usecols makes the parser
# load only these three instead of reading every column and slicing afterwards
movie_df = pd.read_csv(f'{ds_name}/u.item', sep='|', names=movie_cols, usecols=movie_cols[:3], encoding="ISO-8859-1")
movie_df.sample(1)

       user_id  movie_id  user_rating  timestamp
40664      655       162            3  888474165
     user_id  raw_user_age user_gender user_occupation_text user_zip_code
387      388            31           M                other         36106


Unnamed: 0,movie_id,movie_name,release_date
1326,1327,Captives (1994),16-Sep-1994


In [4]:
# Combine the three source frames into a single ratings table.
# Ratings are first joined with the user attributes on 'user_id'; that result
# is then joined with the movie attributes on 'movie_id'.
df = (
    data_df
    .merge(user_df, on='user_id')
    .merge(movie_df, on='movie_id')
)
df

Unnamed: 0,user_id,movie_id,user_rating,timestamp,raw_user_age,user_gender,user_occupation_text,user_zip_code,movie_name,release_date
0,196,242,3,881250949,49,M,writer,55105,Kolya (1996),24-Jan-1997
1,305,242,5,886307828,23,M,programmer,94086,Kolya (1996),24-Jan-1997
2,6,242,4,883268170,42,M,executive,98101,Kolya (1996),24-Jan-1997
3,234,242,4,891033261,60,M,retired,94702,Kolya (1996),24-Jan-1997
4,63,242,3,875747190,31,M,marketing,75240,Kolya (1996),24-Jan-1997
...,...,...,...,...,...,...,...,...,...,...
99995,863,1679,3,889289491,17,M,student,60089,B. Monkey (1998),06-Feb-1998
99996,863,1678,1,889289570,17,M,student,60089,Mat' i syn (1997),06-Feb-1998
99997,863,1680,2,889289570,17,M,student,60089,Sliding Doors (1998),01-Jan-1998
99998,896,1681,3,887160722,28,M,writer,91505,You So Crazy (1994),01-Jan-1994


In [5]:
# Simple data augmentation

# Bucketize user age (as seen in tfds)
def bucketizer(age):
    """Map a raw user age to its age-bucket label.

    The labels follow the bucketing scheme of the TFDS MovieLens dataset:
    1 = under 18, 18 = 18-24, 25 = 25-34, 35 = 35-44, 45 = 45-49,
    50 = 50-55, 56 = 56 and older.

    Args:
        age: Age in years (any value comparable with integers).

    Returns:
        One of the integer labels 1, 18, 25, 35, 45, 50 or 56.
    """
    # Each branch is only reached when all previous upper bounds failed, so the
    # lower bound of every bucket is implied and needs no explicit check.
    if age < 18:
        return 1
    elif age < 25:
        return 18
    elif age < 35:
        return 25
    elif age < 45:
        return 35
    elif age < 50:
        return 45
    elif age < 56:
        # 55 is the last age of the "50-55" bucket, hence the bound is 56, not 55.
        return 50
    else:
        return 56

# Derive the age-bucket column by passing every raw age through bucketizer
age_buckets = df['raw_user_age'].apply(bucketizer)
df['bucketized_user_age'] = age_buckets

# Encode gender as a boolean flag: 'M' becomes True, 'F' becomes False
gender_to_flag = {'M': True, 'F': False}
df['bool_user_gender'] = df['user_gender'].map(gender_to_flag)
df

# Extract year as int from release date
# def year_extractor(date):
#     return date.split('-')[2]
#
# df['release_year'] = df['release_date'].apply(year_extractor)
# df

Unnamed: 0,user_id,movie_id,user_rating,timestamp,raw_user_age,user_gender,user_occupation_text,user_zip_code,movie_name,release_date,bucketized_user_age,bool_user_gender
0,196,242,3,881250949,49,M,writer,55105,Kolya (1996),24-Jan-1997,45,True
1,305,242,5,886307828,23,M,programmer,94086,Kolya (1996),24-Jan-1997,18,True
2,6,242,4,883268170,42,M,executive,98101,Kolya (1996),24-Jan-1997,35,True
3,234,242,4,891033261,60,M,retired,94702,Kolya (1996),24-Jan-1997,56,True
4,63,242,3,875747190,31,M,marketing,75240,Kolya (1996),24-Jan-1997,25,True
...,...,...,...,...,...,...,...,...,...,...,...,...
99995,863,1679,3,889289491,17,M,student,60089,B. Monkey (1998),06-Feb-1998,1,True
99996,863,1678,1,889289570,17,M,student,60089,Mat' i syn (1997),06-Feb-1998,1,True
99997,863,1680,2,889289570,17,M,student,60089,Sliding Doors (1998),01-Jan-1998,1,True
99998,896,1681,3,887160722,28,M,writer,91505,You So Crazy (1994),01-Jan-1994,25,True


In [6]:
# Switch to the sibling output folder and write the augmented frame there as
# CSV, leaving out the row index.
output_dir = "../recommender-dataset/"
os.chdir(output_dir)
output_file = f"{ds_name}_augmented.csv"
df.to_csv(output_file, index=False)