In [None]:
import re
import os
import boto3
import pandas as pd
from tqdm import tqdm
from zipfile import ZipFile
from urllib.request import urlretrieve

In [None]:
# Download the MovieLens 1M archive and unpack it into the working directory
# (this creates the ml-1m/ folder read by the following cells).
urlretrieve(
    "http://files.grouplens.org/datasets/movielens/ml-1m.zip", "movielens.zip")
# Open the archive in a with-block so its file handle is closed after
# extraction instead of being left to the garbage collector.
with ZipFile("movielens.zip", "r") as archive:
    archive.extractall()

In [None]:
# movies.dat has no header row and separates fields with "::"; a
# multi-character separator requires pandas' Python parsing engine.
movies = pd.read_csv(
    "ml-1m/movies.dat",
    names=["movie_id", "title", "genres"],
    sep="::",
    encoding="ISO-8859-1",
    engine="python",
)

In [None]:
def extract_year(movie_title):
    """Return the first parenthesised four-digit number found in a title.

    Args:
        movie_title: Title text such as "Toy Story (1995)".

    Returns:
        The four digits as a string (e.g. "1995"), or None when the title
        contains no "(dddd)" group.
    """
    year_match = re.search(r'\((\d{4})\)', movie_title)
    return year_match.group(1) if year_match else None

In [None]:
def remove_year(movie_title):
    """Strip every parenthesised four-digit year from a movie title.

    Args:
        movie_title: Title text such as "Toy Story (1995)".

    Returns:
        The title without any "(dddd)" group and without leading or
        trailing whitespace, e.g. "Toy Story".
    """
    # Also consume the whitespace in front of the year; otherwise
    # "Toy Story (1995)" would become "Toy Story " with a dangling space.
    year_pattern = r'\s*\(\d{4}\)'
    return re.sub(year_pattern, "", movie_title).strip()

In [None]:
# Parse the year out of each title and store it as an integer column.
# astype("int") raises if any title has no "(dddd)" group (extract_year -> None).
movies['release_year'] = movies['title'].apply(extract_year).astype("int")

In [None]:

# Replace each title with its year-free form (release_year was extracted above).
movies['title'] = movies['title'].apply(remove_year)

In [None]:
# Turn the "|"-delimited genre string of each movie into a list of genre names.
movies['genres'] = movies['genres'].map(lambda genre_field: genre_field.split("|"))

In [None]:
# Preview the first rows to check the release_year, title and genres columns.
movies.head()

In [None]:
def list_s3_files(bucket_name, prefix=''):
    """List the keys of all objects in an S3 bucket under a key prefix.

    Pages through ``list_objects_v2`` until the response no longer carries
    a continuation token.

    Args:
        bucket_name: Name of the S3 bucket to list.
        prefix: Only keys starting with this prefix are returned; the
            default '' lists the whole bucket.

    Returns:
        A list of object keys (strings) in the order S3 returned them;
        an empty list when nothing matches the prefix.
    """
    s3 = boto3.client('s3')
    files = []
    request = {'Bucket': bucket_name, 'Prefix': prefix}

    while True:
        response = s3.list_objects_v2(**request)

        # S3 omits 'Contents' entirely when no object matches, so index
        # defensively instead of raising KeyError on an empty listing.
        files.extend(obj['Key'] for obj in response.get('Contents', []))

        next_token = response.get('NextContinuationToken')
        if not next_token:
            break
        # Ask for the next page on the following iteration.
        request['ContinuationToken'] = next_token

    return files

In [None]:
# Collect every object key stored under the 'ml-1m-cover-images' prefix.
file_paths = list_s3_files(bucket_name="cloudfront-aws-bucket", prefix='ml-1m-cover-images')

In [None]:
# Start from an empty frame; the image columns are added in the following cells.
df_images = pd.DataFrame()

In [None]:
# One row per listed S3 key.
df_images = df_images.assign(image_path=file_paths)

In [None]:
# Drop the first listed key and keep the rest as an independent copy.
# NOTE(review): presumably the first key is the prefix "folder" placeholder
# rather than an image -- confirm against the bucket listing.
# NOTE(review): this is positional, so re-running the cell drops one more row.
df_images = df_images.iloc[1:].copy()

In [None]:
# The movie id is the name of the directory that directly contains each image
# key; astype('int') raises if that directory name is not numeric.
df_images['movie_id'] = (
    df_images['image_path']
    .map(lambda key: os.path.split(os.path.dirname(key))[1])
    .astype('int')
)

In [None]:
# Build the public URL of each image by appending its S3 key to the CloudFront
# host. URLs always use "/" as the separator, so concatenate explicitly rather
# than relying on os.path.join, which would insert "\" on Windows.
df_images['image_url'] = "http://d2gewc5xha837s.cloudfront.net/" + df_images['image_path']

In [None]:
# The raw S3 key is no longer needed once movie_id and image_url are derived.
df_images = df_images.drop(columns="image_path")

In [None]:
# Attach image_url to each movie. The default inner join keeps only movies
# that have a matching row in df_images.
movies = pd.merge(movies, df_images, on='movie_id')

In [None]:
# Preview the merged frame, which now carries image_url.
movies.head()

In [None]:

# Expand the genres lists so each (movie, genre) pair gets its own row;
# rows coming from the same movie keep the same index label.
movies_genres = movies.explode('genres')

In [None]:
# After exploding, the column holds a single genre per row, so name it in the singular.
movies_genres = movies_genres.rename({'genres': "genre"}, axis='columns')

In [None]:
# Preview the exploded frame after renaming genres to genre.
movies_genres.head()

In [None]:
# Rank the genre values within each movie; ties receive the highest rank of
# their group (method='max').
# NOTE(review): the ranking follows the ordering of the genre values
# (presumably alphabetical for strings), not the order in which they were listed.
movies_genres['rank'] = movies_genres.groupby('movie_id')['genre'].rank(method='max')

In [None]:
# Flag the row whose genre has rank 1 within its movie.
movies_genres['is_first'] = movies_genres['rank'].eq(1)

In [None]:
# rank was only a helper for computing is_first.
movies_genres = movies_genres.drop(columns="rank")

In [None]:
# Preview the final per-genre rows, including the is_first flag.
movies_genres.head()

In [None]:
# DynamoDB service resource; no region or credentials are passed here, so
# boto3's default configuration is used.
dynamodb = boto3.resource("dynamodb")

In [None]:
# Name of the DynamoDB table that receives the movie/genre items.
table_name = "movielens_movie"

In [None]:
# Table resource used for the batch write in the last cell.
table = dynamodb.Table(table_name)

In [None]:
# One dict per (movie, genre) row, keyed by column name. to_dict(orient="records")
# builds the whole list in a single call instead of materialising a Series
# per row with iloc inside a Python loop.
items = movies_genres.to_dict(orient="records")

In [None]:
# Upload every item through a batch writer, showing progress with tqdm.
# NOTE(review): rows exploded from the same movie share movie_id -- confirm the
# table's key schema distinguishes them (e.g. includes genre).
with table.batch_writer() as batch:
    for record in tqdm(items):
        batch.put_item(Item=record)