In [38]:
import math
import os
import re
from decimal import Decimal
from urllib.request import urlretrieve
from zipfile import ZipFile

import boto3
import pandas as pd
from tqdm import tqdm

In [39]:
# Download the MovieLens 1M archive and unpack it into the working directory;
# later cells read from the extracted "ml-1m/" folder.
urlretrieve(
    "http://files.grouplens.org/datasets/movielens/ml-1m.zip", "movielens.zip")
# Open the archive in a `with` block so its file handle is closed as soon as
# extraction finishes instead of lingering until garbage collection.
with ZipFile("movielens.zip", "r") as archive:
    archive.extractall()

In [40]:
# Load the movie catalogue: a "::"-delimited file parsed with the python
# engine, with the three column names supplied explicitly.
movies = pd.read_csv(
    "ml-1m/movies.dat",
    names=["movie_id", "title", "genres"],
    sep="::",
    encoding="ISO-8859-1",
    engine="python",
)

In [41]:
# Load the individual user ratings: same "::"-delimited layout as the movie
# file, with the four column names supplied explicitly.
ratings = pd.read_csv(
    "ml-1m/ratings.dat",
    names=["user_id", "movie_id", "rating", "unix_timestamp"],
    sep="::",
    engine="python",
)

In [42]:
# Collapse the per-user ratings into one row per movie holding its mean rating.
# Note: this rebinds `ratings`, so the raw per-user rows are no longer available.
mean_by_movie = ratings.groupby('movie_id')['rating'].mean()
ratings = mean_by_movie.reset_index(name="rating")

In [43]:
def round_to_half(x):
    """Round ``x`` to the nearest multiple of 0.5.

    Values exactly a quarter away from a whole number snap to that whole
    number (3.25 -> 3, 3.75 -> 4).  Whole-number results are returned as
    ``int`` and half-step results as ``float``.

    Args:
        x: A finite real number.

    Returns:
        The multiple of 0.5 closest to ``x``.

    Raises:
        ValueError: If ``x`` is NaN.
        OverflowError: If ``x`` is infinite.
    """
    # math.floor (unlike int(), which truncates toward zero) guarantees
    # lower <= x < upper for negative inputs as well.
    lower = math.floor(x)
    middle = lower + 0.5
    upper = lower + 1

    if x >= middle:
        # x is in [middle, upper): middle wins only when strictly closer.
        return middle if abs(x - upper) > abs(x - middle) else upper

    # x is in [lower, middle): middle wins only when strictly closer.
    return middle if abs(x - lower) > abs(x - middle) else lower

In [44]:
# Snap every mean rating to the nearest half step.
ratings['rating'] = ratings['rating'].apply(round_to_half)

In [45]:
# Attach each movie's rounded mean rating; `movie_id` is the column the two
# frames share, so it is named explicitly as the join key.
movies = movies.merge(ratings, on='movie_id')

In [46]:
def extract_year(movie_title):
    """Return the first parenthesised four-digit number in ``movie_title``.

    The digits are returned as a string (e.g. ``"1995"``); ``None`` is
    returned when the title contains no ``(dddd)`` group.
    """
    found = re.search(r'\((\d{4})\)', movie_title)
    return found.group(1) if found else None

In [47]:
def remove_year(movie_title):
    """Return ``movie_title`` with every parenthesised four-digit group removed.

    Surrounding whitespace is left untouched, so callers that want a clean
    title still need to strip the result.
    """
    year_in_parens = re.compile(r'\((\d{4})\)')
    return year_in_parens.sub("", movie_title)

In [48]:
# Pull the release year out of each title and store it as an integer column.
# The cast fails if any title yields None, i.e. has no "(YYYY)" group.
release_years = movies['title'].apply(extract_year)
movies['release_year'] = release_years.astype("int")

In [49]:

# Drop the "(YYYY)" group from each title, then trim the leftover whitespace.
movies['title'] = movies['title'].apply(remove_year).str.strip()

In [50]:
# Turn each "|"-delimited genre string into a list of genre names.
# Not re-runnable: once the column holds lists, `.split` no longer exists on its values.
movies['genres'] = movies['genres'].map(lambda pipe_joined: pipe_joined.split("|"))

In [51]:
# Preview the first rows of `movies` to check its columns at this point.
movies.head()

Unnamed: 0,movie_id,title,genres,rating,release_year
0,1,Toy Story,"[Animation, Children's, Comedy]",4.0,1995
1,2,Jumanji,"[Adventure, Children's, Fantasy]",3.0,1995
2,3,Grumpier Old Men,"[Comedy, Romance]",3.0,1995
3,4,Waiting to Exhale,"[Comedy, Drama]",2.5,1995
4,5,Father of the Bride Part II,[Comedy],3.0,1995


In [52]:
def list_s3_files(bucket_name, prefix=''):
    """List the keys of all objects in an S3 bucket under a prefix.

    Args:
        bucket_name: Name of the S3 bucket to list.
        prefix: Only keys starting with this string are returned; the
            default empty string lists the whole bucket.

    Returns:
        A list of object keys (strings) in the order S3 returns them.
        The list is empty when nothing matches the prefix.
    """
    s3 = boto3.client('s3')
    # The paginator follows continuation tokens for us, so listings that
    # span several response pages are returned in full.
    paginator = s3.get_paginator('list_objects_v2')

    files = []
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        # A page carries no 'Contents' entry when there are no matching
        # keys; treat that as an empty page instead of raising KeyError.
        files.extend(obj['Key'] for obj in page.get('Contents', []))

    return files

In [53]:
# Collect the S3 keys of everything stored under the cover-image prefix.
file_paths = list_s3_files(
    bucket_name="cloudfront-aws-bucket",
    prefix='ml-1m-cover-images',
)

In [54]:
# Start an empty frame that the following cells fill with image metadata.
df_images = pd.DataFrame()

In [55]:
# One row per S3 key returned by list_s3_files.
df_images['image_path'] = file_paths

In [56]:
# Drop the first listed key and keep an independent copy of the rest.
# NOTE(review): presumably the first key is the prefix "folder" placeholder
# rather than an image; this relies on listing order, so confirm against the bucket.
df_images = df_images.iloc[1:].copy()

In [57]:
# The movie id is taken from the name of the folder that directly contains
# each image, i.e. the second-to-last component of the S3 key.
parent_folders = df_images['image_path'].apply(os.path.dirname).apply(os.path.basename)
df_images['movie_id'] = parent_folders.astype('int')

In [58]:
# Build the public URL of each image by appending its S3 key to the CloudFront host.
# URLs always use "/", so the parts are joined as plain strings: os.path.join
# would insert the OS path separator (a backslash on Windows) and would discard
# the host entirely for a key that starts with "/".
CLOUDFRONT_BASE_URL = "http://d2gewc5xha837s.cloudfront.net"
df_images['image_url'] = df_images['image_path'].apply(
    lambda key: f"{CLOUDFRONT_BASE_URL}/{key}")

In [59]:
# The raw S3 key is no longer needed once movie_id and image_url exist.
df_images = df_images.drop(columns="image_path")

In [60]:
# Attach the cover-image URL to each movie, matching rows on movie_id.
movies = movies.merge(df_images, on='movie_id')

In [61]:
# Preview `movies` again to check the newly merged image_url column.
movies.head()

Unnamed: 0,movie_id,title,genres,rating,release_year,image_url
0,1,Toy Story,"[Animation, Children's, Comedy]",4.0,1995,http://d2gewc5xha837s.cloudfront.net/ml-1m-cov...
1,2,Jumanji,"[Adventure, Children's, Fantasy]",3.0,1995,http://d2gewc5xha837s.cloudfront.net/ml-1m-cov...
2,3,Grumpier Old Men,"[Comedy, Romance]",3.0,1995,http://d2gewc5xha837s.cloudfront.net/ml-1m-cov...
3,4,Waiting to Exhale,"[Comedy, Drama]",2.5,1995,http://d2gewc5xha837s.cloudfront.net/ml-1m-cov...
4,5,Father of the Bride Part II,[Comedy],3.0,1995,http://d2gewc5xha837s.cloudfront.net/ml-1m-cov...


In [62]:
# Expand the genre lists so that every (movie, genre) pair gets its own row.
movies_genres = movies.explode('genres')

In [63]:
# Each row now holds a single genre, so give the column a singular name.
movies_genres = movies_genres.rename(columns=dict(genres="genre"))

In [64]:
# Preview the exploded frame: one row per movie/genre pair.
movies_genres.head()

Unnamed: 0,movie_id,title,genre,rating,release_year,image_url
0,1,Toy Story,Animation,4.0,1995,http://d2gewc5xha837s.cloudfront.net/ml-1m-cov...
0,1,Toy Story,Children's,4.0,1995,http://d2gewc5xha837s.cloudfront.net/ml-1m-cov...
0,1,Toy Story,Comedy,4.0,1995,http://d2gewc5xha837s.cloudfront.net/ml-1m-cov...
1,2,Jumanji,Adventure,3.0,1995,http://d2gewc5xha837s.cloudfront.net/ml-1m-cov...
1,2,Jumanji,Children's,3.0,1995,http://d2gewc5xha837s.cloudfront.net/ml-1m-cov...


In [65]:
# Rank the genre values within each movie. In the preview output the ranks run
# 1, 2, 3 in the genres' alphabetical order.
# NOTE(review): a repeated genre within one movie would presumably produce a tied, fractional rank - confirm genres are unique per movie.
movies_genres['rank'] = movies_genres.groupby('movie_id')['genre'].rank()

In [66]:
# Store the rank as an integer column.
movies_genres['rank'] = movies_genres['rank'].astype("int")

In [67]:
# Preview the frame with the new integer `rank` column.
movies_genres.head()

Unnamed: 0,movie_id,title,genre,rating,release_year,image_url,rank
0,1,Toy Story,Animation,4.0,1995,http://d2gewc5xha837s.cloudfront.net/ml-1m-cov...,1
0,1,Toy Story,Children's,4.0,1995,http://d2gewc5xha837s.cloudfront.net/ml-1m-cov...,2
0,1,Toy Story,Comedy,4.0,1995,http://d2gewc5xha837s.cloudfront.net/ml-1m-cov...,3
1,2,Jumanji,Adventure,3.0,1995,http://d2gewc5xha837s.cloudfront.net/ml-1m-cov...,1
1,2,Jumanji,Children's,3.0,1995,http://d2gewc5xha837s.cloudfront.net/ml-1m-cov...,2


In [68]:
# Bring the full genre list back onto every per-genre row, so each row carries
# both its single `genre` and the movie's complete `genres` list.
genre_lists = movies[['movie_id', 'genres']]
movies_genres = movies_genres.merge(genre_lists, on='movie_id')

In [69]:
# Preview the frame after the full `genres` list was merged back in.
movies_genres.head()

Unnamed: 0,movie_id,title,genre,rating,release_year,image_url,rank,genres
0,1,Toy Story,Animation,4.0,1995,http://d2gewc5xha837s.cloudfront.net/ml-1m-cov...,1,"[Animation, Children's, Comedy]"
1,1,Toy Story,Children's,4.0,1995,http://d2gewc5xha837s.cloudfront.net/ml-1m-cov...,2,"[Animation, Children's, Comedy]"
2,1,Toy Story,Comedy,4.0,1995,http://d2gewc5xha837s.cloudfront.net/ml-1m-cov...,3,"[Animation, Children's, Comedy]"
3,2,Jumanji,Adventure,3.0,1995,http://d2gewc5xha837s.cloudfront.net/ml-1m-cov...,1,"[Adventure, Children's, Fantasy]"
4,2,Jumanji,Children's,3.0,1995,http://d2gewc5xha837s.cloudfront.net/ml-1m-cov...,2,"[Adventure, Children's, Fantasy]"


In [72]:
# Distinct genre names across all movies. Sorted so the list has the same
# order on every run; a bare list(set(...)) has no guaranteed order.
genres = sorted(set(movies_genres['genre']))

In [None]:
# FIXME: this cell was left as an unfinished `genres = ` assignment, which is a
# SyntaxError and stops "Run All". Commented out until the intended value is known.
# genres =

In [33]:
# DynamoDB resource handle from boto3; the target table is looked up from it below.
dynamodb = boto3.resource("dynamodb")

In [34]:
# Name of the DynamoDB table that receives the movie/genre rows.
table_name = "movielens_movie"

In [35]:
# Handle for the target table, used for the batch write below.
table = dynamodb.Table(table_name)

In [36]:
# Convert every row of movies_genres into a plain dict ready for upload.
# The rating is re-created as a Decimal from its string form
# (presumably because the DynamoDB client does not accept Python floats - confirm).
row_dicts = (
    movies_genres.iloc[position].to_dict()
    for position in range(len(movies_genres))
)
items = [
    {**row, "rating": Decimal(str(row["rating"]))}
    for row in row_dicts
]

In [37]:
# Send every prepared item to the table through a batch writer, with a tqdm
# progress bar over the items.
with table.batch_writer() as batch:
    for record in tqdm(items):
        batch.put_item(Item=record)

  0%|          | 0/6171 [00:00<?, ?it/s]

100%|██████████| 6171/6171 [05:13<00:00, 19.70it/s]
