In [None]:
import os
import math
import boto3  
import sagemaker
import pandas as pd
import numpy as np
from datetime import datetime
import tensorflow as tf
from sagemaker.tensorflow import TensorFlow
from sagemaker.tensorflow import TensorFlowModel

# SageMaker session plus the values derived from it that later cells use:
# the role returned by get_execution_role(), the session's default S3 bucket
# and the region of its underlying boto session.
sess = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sess.default_bucket()
region = sess.boto_session.region_name

In [None]:
%%bash
# Start from a clean slate. -f keeps rm from failing when data/ does not
# exist yet (e.g. the first run on a fresh checkout).
rm -rf data

# Recreate the directory tree; -p creates data/ and data/s3 in one step
mkdir -p data/s3

# Get the training data
python3 getData.py

In [None]:
def train_test_split(df, holdout_frac=0.2):
    """Split a user x community matrix into long-format train and test sets.

    For every user (row of ``df``) a random subset of the communities
    (columns of ``df``) is held out for testing; every other
    (user, community) cell goes to the training set. The number of held-out
    communities per user is ``floor(len(df.columns) * holdout_frac)``.

    Args:
        df: DataFrame indexed by user id with one column per community; each
            cell holds the interaction value for that (user, community) pair.
        holdout_frac: Fraction of the columns held out per user. Defaults to
            0.2, the value that used to be hard-coded.

    Returns:
        Tuple ``(df_train, df_test)``. Both frames have the columns
        ``['userId', 'community', 'interaction']`` and a fresh 0..n-1 index.

    Note:
        Sampling uses NumPy's global RNG; call ``np.random.seed`` beforehand
        to get a reproducible split.
    """
    number_of_communities = len(df.columns)
    holdout_num = math.floor(number_of_communities * holdout_frac)
    columns = ['userId', 'community', 'interaction']

    # Collect the rows in plain lists and build each frame once at the end:
    # appending to a DataFrame one row at a time is quadratic in the row count.
    train_rows = []
    test_rows = []
    for userId in df.index:
        held_out = set(np.random.choice(df.columns, size=holdout_num, replace=False))
        user_row = df.loc[userId]  # look the user's row up once, not once per column
        for community in df.columns:
            record = [userId, community, user_row[community]]
            if community in held_out:
                test_rows.append(record)
            else:
                train_rows.append(record)

    df_train = pd.DataFrame(train_rows, columns=columns)
    df_test = pd.DataFrame(test_rows, columns=columns)
    return df_train, df_test

def negative_sampling(interactions_train, items, n_neg):
    """Generate negative (interaction = 0) samples for the training set.

    For every row of ``interactions_train``, ``n_neg`` communities are drawn
    (with replacement, NumPy global RNG) from the entries of ``items`` that do
    not appear in any training row of that row's user.

    Args:
        interactions_train: DataFrame with at least the columns ``userId``
            and ``community``.
        items: Array-like of all community identifiers to sample from.
        n_neg: Number of negative samples to draw per training row.

    Returns:
        DataFrame with columns ``['userId', 'community', 'interaction']``
        where ``interaction`` is always 0. Rows of users that have no
        candidate community left contribute no samples.
    """
    neg = []
    # The candidate set depends only on the user, so compute it once per user
    # instead of re-filtering the whole frame for every training row.
    candidates_by_user = {}

    # for every training row
    for user_id in interactions_train['userId'].values:
        if user_id not in candidates_by_user:
            communities_rated = interactions_train.loc[
                interactions_train['userId'] == user_id, 'community'
            ]
            candidates_by_user[user_id] = np.setdiff1d(items, communities_rated)
        communities_not_rated = candidates_by_user[user_id]

        # np.random.choice raises ValueError on an empty population; a user
        # with nothing left to sample from simply gets no negatives.
        if communities_not_rated.size == 0:
            continue

        # generate n_neg negative labels
        for community in np.random.choice(communities_not_rated, size=n_neg):
            neg.append([user_id, community, 0])

    # convert to pandas dataframe for concatenation later
    df_neg = pd.DataFrame(neg, columns=['userId', 'community', 'interaction'])

    return df_neg

In [None]:
# Seed NumPy's global RNG so the random hold-out split and the negative
# sampling below produce the same data sets on every run of this cell.
SEED = 42
np.random.seed(SEED)

# Load the data from the data folder
interactions_df = pd.read_csv('data/interactions.csv').set_index('userId')
users_vectors_df = pd.read_csv('data/users_vectors.csv').set_index('id')

# perform train test split
interactions_train, interactions_test = train_test_split(interactions_df)

# create 5 negative samples per positive label for training set
neg_train = negative_sampling(
    interactions_train=interactions_train,
    items=interactions_df.columns,
    n_neg=5
)

# create final training and testing sets
# NOTE(review): assign(interaction=1) replaces whatever value the matrix held
# for each (user, community) pair, so every split row is labelled positive —
# confirm this is intended for cells whose original value was 0.
interactions_train = interactions_train[['userId', 'community']].assign(interaction=1)
interactions_train = pd.concat([interactions_train, neg_train], ignore_index=True)

interactions_test = interactions_test[['userId', 'community']].assign(interaction=1)

# save data locally first
dest = 'data/s3'
train_path = os.path.join(dest, 'interactions_train.npy')
test_path = os.path.join(dest, 'interactions_test.npy')
interactions_path = os.path.join(dest, 'interactions.csv')
users_vectors_path = os.path.join(dest, 'users_vectors.csv')
np.save(train_path, interactions_train.values)
np.save(test_path, interactions_test.values)
interactions_df.to_csv(interactions_path)
users_vectors_df.to_csv(users_vectors_path)

# store data in the default S3 bucket
print("the default bucket name is", bucket)

# upload every prepared file under the 'data' key prefix
for local_path in (train_path, test_path, interactions_path, users_vectors_path):
    sess.upload_data(local_path, key_prefix='data')

In [None]:
# Training run settings
instance_type = 'ml.p2.xlarge'
device = 'gpu'
epochs = '3'

# Timestamp used as the prefix of the training job name
date = datetime.now().strftime("%y%m%d-%H%M%S")

# Instance type rewritten for the job name: dots become dashes and the
# 'ml-' prefix is dropped, e.g. 'ml.p2.xlarge' -> 'p2-xlarge'
instance_slug = instance_type.replace('.', '-').replace('ml-', '')
job_name = f'{date}-recomendation-{instance_slug}-{device}-{epochs}e'

In [None]:
# TensorFlow estimator that runs ncf.py as the training script on a single
# instance of the configured type.
# The distribution settings are passed as `distribution`: that is the
# argument name in SageMaker Python SDK v2 (the SDK generation implied by
# `instance_count`); `distributions` is the pre-v2 spelling.
ncf_estimator = TensorFlow(
    entry_point='ncf.py',
    role=role,
    instance_count=1,
    instance_type=instance_type,
    framework_version='2.1.0',
    py_version='py3',
    distribution={'parameter_server': {'enabled': True}},
    hyperparameters={'epochs': epochs, 'batch_size': 256}
)

In [None]:
# S3 location of the files uploaded under the 'data' key prefix.
# S3 URIs always use '/', so build the URI with string formatting instead of
# os.path.join, whose separator depends on the operating system.
training_data_uri = f's3://{bucket}/data'

# Launch the training job and block until it finishes (wait=True)
ncf_estimator.fit(training_data_uri, wait=True, job_name=job_name)

In [None]:
# Create a SageMaker model from the trained model artifacts.
# To serve from a custom inference image instead, also pass e.g.
#   image_uri='<account-id>.dkr.ecr.<region>.amazonaws.com/<repository>:<tag>'
tensorflow_model = TensorFlowModel(
    framework_version='2.1.0',
    role=role,
    model_data=ncf_estimator.model_data,
)

In [None]:
# Deployment settings.
# NOTE(review): `date` and `instance_type` reuse the names set in the
# training-configuration cell, so re-running the estimator cell after this
# one would pick up 'ml.c5.xlarge' — confirm that is intended.
instance_type = 'ml.c5.xlarge'
model_name = "neural-collab-filtering-model"

# Timestamped endpoint name
date = datetime.now().strftime("%y%m%d-%H%M%S")
endpoint_name = f'{date}-recomendation-model'

# Deploy the model to an endpoint backed by one instance
predictor = tensorflow_model.deploy(
    model_name=model_name,
    endpoint_name=endpoint_name,
    instance_type=instance_type,
    initial_instance_count=1,
)

In [None]:
# Display the endpoint name of the predictor returned by deploy()
predictor.endpoint_name