In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Flatten, Concatenate, Dropout, Normalization, TextVectorization
from tensorflow.keras.models import Model
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences

2025-02-04 20:07:53.793035: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-04 20:07:53.990569: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-04 20:07:54.267635: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1738699674.501058   21563 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1738699674.567357   21563 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-04 20:07:55.187694: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

###  1. Load and Prepare Data

In [2]:
# Load data from CSV FILES
titles = pd.read_csv("data/Titles.csv")
opportunities = pd.read_csv("data/Opportunities.csv")
quotes = pd.read_csv("data/Quotes.csv")
quote_line_items = pd.read_csv("data/Quote_Line_Items.csv")

# Merge datasets
quote_qlis = quote_line_items.merge(quotes, on='quote_id')
opp_quote_qlis = quote_qlis.merge(opportunities, on='opportunity_id')
data = opp_quote_qlis.merge(titles, on='title_id')

merged_df = data
print(merged_df.columns)


Index(['title_id', 'quote_id', 'qli_fee', 'qli_start_date', 'qli_end_date',
       'qli_media_type', 'qli_rights', 'relevance', 'opportunity_id',
       'opportunity_name', 'opportunity_deal_amount', 'opp_start_date',
       'opp_expected_close_date', 'opportunity_geography_type',
       'opportunity_reporting_territory', 'opportunity_stage', 'title_name',
       'title_r', 'title_runtime', 'title_primary_genre', 'title_level',
       'title_production_year', 'title_synopsis'],
      dtype='object')


### 2. Define Features & Target

In [3]:
# Features
#opportunity_name
categorical_features = ["opportunity_geography_type", "opportunity_reporting_territory", "qli_media_type", "qli_rights", "title_primary_genre"]
numerical_features = ["opportunity_deal_amount", "title_runtime", "title_production_year"] 
date_features = ["opp_start_date", "opp_expected_close_date", "qli_start_date", "qli_end_date"]
text_features = ["title_synopsis"]

# Target: title_id (which title best fits the opportunity)
target = "title_name"

# TO DO: CHECK if there's a better fit for title_synopsis

### 3. Preprocess Data

In [4]:
# Encode categorical features
#label_encoders = {}
#for col in categorical_features:
#    le = LabelEncoder()
#    merged_df[col] = le.fit_transform(merged_df[col])
#    label_encoders[col] = le

# Encode categorical features
label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    # Add "unknown" category to the encoder
    merged_df[col] = merged_df[col].astype(str)  # Ensure all values are strings
    le.fit(list(merged_df[col].unique()) + ["unknown"])  # Add "unknown" to the encoder
    merged_df[col] = le.transform(merged_df[col])
    label_encoders[col] = le

# Scale numerical features
scaler = StandardScaler()
merged_df[numerical_features] = scaler.fit_transform(merged_df[numerical_features])

# Handle date features
for col in date_features:
    merged_df[col] = pd.to_datetime(merged_df[col])
    merged_df[col] = (merged_df[col] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

# Handle text features
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(merged_df['title_synopsis'])
sequences = tokenizer.texts_to_sequences(merged_df['title_synopsis'])
max_len = 100
text_data = pad_sequences(sequences, maxlen=max_len)

# Separate text data from other features
X_other = merged_df[categorical_features + numerical_features + date_features]
y = merged_df['relevance']

# Split data
X_train_other, X_test_other, X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_other, text_data, y, test_size=0.2, random_state=42
)


In [5]:
import joblib

# Save preprocessing objects
joblib.dump(label_encoders, 'label_encoders.pkl')
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(tokenizer, 'tokenizer.pkl')


['tokenizer.pkl']

In [6]:

# Save preprocessing objects
joblib.dump(label_encoders, 'code/label_encoders.pkl')
joblib.dump(scaler, 'code/scaler.pkl')
joblib.dump(tokenizer, 'code/tokenizer.pkl')

print('done')

done


### 4. Build Deep Learning Model

In [7]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Concatenate

# Define model
input_cat = Input(shape=(len(categorical_features),), name="input_cat")
input_num = Input(shape=(len(numerical_features) + len(date_features),), name="input_num")
input_text = Input(shape=(max_len,), name="input_text")

embedding = Embedding(input_dim=10000, output_dim=128)(input_text)
lstm = LSTM(64)(embedding)

concat = Concatenate()([input_cat, input_num, lstm])
dense1 = Dense(128, activation='relu')(concat)
output = Dense(1, activation='sigmoid')(dense1)

model = Model(inputs=[input_cat, input_num, input_text], outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train model
model.fit(
    [X_train_other[categorical_features], X_train_other[numerical_features + date_features], X_train_text],
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2
)


2025-02-04 20:08:03.011831: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


Epoch 1/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 221ms/step - accuracy: 0.0000e+00 - loss: 56875880.0000 - val_accuracy: 0.0000e+00 - val_loss: 38509520.0000
Epoch 2/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 109ms/step - accuracy: 0.0000e+00 - loss: 35838108.0000 - val_accuracy: 0.0000e+00 - val_loss: 4183559.2500
Epoch 3/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 98ms/step - accuracy: 0.0000e+00 - loss: 12387024.0000 - val_accuracy: 0.0000e+00 - val_loss: 26701482.0000
Epoch 4/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 127ms/step - accuracy: 0.0000e+00 - loss: 23077386.0000 - val_accuracy: 0.0000e+00 - val_loss: 8478322.0000
Epoch 5/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 120ms/step - accuracy: 0.0000e+00 - loss: 7944506.0000 - val_accuracy: 0.0000e+00 - val_loss: 8107619.0000
Epoch 6/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 121ms/step - accuracy:

<keras.src.callbacks.history.History at 0x7f76c27d7f50>

## 5. Deploy to AWS SageMaker

In [8]:
import sagemaker
from sagemaker.tensorflow import TensorFlowModel
import tarfile
import os

# Save the model
#model = tf.keras.models.load_model('model.h5')

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [9]:
# Save model
#model.save('model.h5')

# Save the model in SavedModel format
#tf.saved_model.save(model, 'model') #saved model
# Define input signatures
input_signatures = {
    "input_cat": tf.TensorSpec(shape=(None, len(categorical_features)), dtype=tf.float32, name="input_cat"),
    "input_num": tf.TensorSpec(shape=(None, len(numerical_features) + len(date_features)), dtype=tf.float32, name="input_num"),
    "input_text": tf.TensorSpec(shape=(None, max_len), dtype=tf.int32, name="input_text"),
}

#model.export("1/") 
# Export the model
#tf.saved_model.save(model, '1/')
model.export(
    "1/"
    #input_tensors=input_signatures
)
print("Model saved in SavedModel format.")

# Verify saved structure
import os
print(os.listdir("1/")) # Should show ["saved_model.pb", "variables", "assets"]

INFO:tensorflow:Assets written to: 1/assets


INFO:tensorflow:Assets written to: 1/assets


Saved artifact at '1/'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): List[TensorSpec(shape=(None, 5), dtype=tf.float32, name='input_cat'), TensorSpec(shape=(None, 7), dtype=tf.float32, name='input_num'), TensorSpec(shape=(None, 100), dtype=tf.float32, name='input_text')]
Output Type:
  TensorSpec(shape=(None, 1), dtype=tf.float32, name=None)
Captures:
  140148050553424: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140148045545744: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140148045546896: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140148050552656: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140148050553808: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140148050553040: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140148045547856: TensorSpec(shape=(), dtype=tf.resource, name=None)
  140148045548240: TensorSpec(shape=(), dtype=tf.resource, name=None)
Model saved in SavedModel format.
['assets',

In [10]:
import tarfile
import os

# Create a tar.gz file
#with tarfile.open('model.tar.gz', 'w:gz') as tar:
#    tar.add('model', arcname=os.path.basename('1'))

#print("Model packaged into model.tar.gz.")

In [11]:
import tarfile

# List of files to include in the model.tar.gz
files_to_include = [
    '1/',  # TensorFlow SavedModel directory saved_model
    #'model.h5',
    'code/',
    #'inference.py',  # Custom inference script
    #'requirements.txt',
    #'label_encoders.pkl',  # Preprocessing object
    #'scaler.pkl',  # Preprocessing object
    #'tokenizer.pkl',  # Preprocessing object
    #'titles.csv'
]

# Create the model.tar.gz file
#with tarfile.open('model.tar.gz', 'w:gz') as tar:
#    for file in files_to_include:
#        tar.add(file)


# Create the model.tar.gz file
with tarfile.open('model.tar.gz', 'w:gz') as tar:
    for file in files_to_include:
        # If it's the model directory, rename it to just '1' inside the tarball
        #if file == '1/':
        #    tar.add(file, arcname='1')
        #if file == 'code/':
        #    tar.add(file, arcname='code')
        #else:
            tar.add(file)
            
print("model.tar.gz created successfully.")

model.tar.gz created successfully.


# 6. Upload to S3

In [12]:
# Upload to S3

import boto3

# Initialize S3 client
s3 = boto3.client('s3')
bucket_name = 'titlesbucket'
key = 'opportunity-title-prediction/model.tar.gz'

# Upload the file
#opportunity-title-prediction/model.tar.gz
s3.upload_file('model.tar.gz', bucket_name, key)

print(f"Model uploaded to s3://{bucket_name}/{key}")

Model uploaded to s3://titlesbucket/opportunity-title-prediction/model.tar.gz


In [13]:
from sagemaker.tensorflow import TensorFlowModel
from sagemaker import get_execution_role

# Get the SageMaker execution role
role = get_execution_role()

# Path to your new model.tar.gz in S3
# opportunity-title-prediction
model_data = "s3://titlesbucket/opportunity-title-prediction/model.tar.gz"

# Instantiate a TensorFlow model
tensorflow_model = TensorFlowModel(
    model_data=model_data,
    role=role,
    entry_point='inference.py',
    source_dir='code',
    framework_version='2.13.0',  # TensorFlow version
    name='Titles-Prediction-Model-21'
)


In [14]:
role = get_execution_role()
print(role)

arn:aws:iam::779846812857:role/service-role/AmazonSageMaker-ExecutionRole-20250126T233747


# END

In [15]:
# Create a SageMaker client
#sagemaker_client = boto3.client('sagemaker')

# Delete the existing endpoint configuration
#sagemaker_client.delete_endpoint_config(EndpointConfigName='titles-prediction-my-endpoint')


In [16]:
# Deploy the model
# m5
from datetime import datetime

ENDPOINT_NAME = 'Titles-Prediction-Model-21-' + str(datetime.utcnow().strftime('%Y-%m-%d-%H-%M-%S-%f'))

predictor = tensorflow_model.deploy(
    initial_instance_count=1,
    instance_type='ml.c5.large',  
    endpoint_name=ENDPOINT_NAME 
    #update_endpoint=True
)

print('Model deployment finished')
print(predictor)

-------------------------------------------*

UnexpectedStatusException: Error hosting endpoint Titles-Prediction-Model-21-2025-02-04-20-09-16-504232: Failed. Reason: The primary container for production variant AllTraffic did not pass the ping health check. Please check CloudWatch logs for this endpoint.. Try changing the instance type or reference the troubleshooting page https://docs.aws.amazon.com/sagemaker/latest/dg/async-inference-troubleshooting.html

In [188]:
import os
print(os.listdir("/opt/ml/model"))  # Should show ['1', 'code']
print(os.listdir("/opt/ml/model/1"))  # Should show model files
print(os.listdir("/opt/ml/model/code"))  # Should show inference.py, encoders, scaler, tokenizer, etc.


FileNotFoundError: [Errno 2] No such file or directory: '/opt/ml/model'

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model

# Load data from CSV files
titles = pd.read_csv("data/Titles.csv")
opportunities = pd.read_csv("data/Opportunities.csv")
quotes = pd.read_csv("data/Quotes.csv")
quote_line_items = pd.read_csv("data/Quote_Line_Items.csv")

# Merge datasets
quote_qlis = quote_line_items.merge(quotes, on='quote_id')
opp_quote_qlis = quote_qlis.merge(opportunities, on='opportunity_id')
data = opp_quote_qlis.merge(titles, on='title_id')

merged_df = data

# Features
categorical_features = ["opportunity_geography_type", "opportunity_reporting_territory", "qli_media_type", "qli_rights", "title_primary_genre"]
numerical_features = ["opportunity_deal_amount", "title_runtime", "title_production_year"]
date_features = ["opp_start_date", "opp_expected_close_date", "qli_start_date", "qli_end_date"]
text_features = ["title_synopsis"]

# Load pre-trained model
model = load_model('model.h5')

# Load preprocessing objects (label encoders, scaler, tokenizer)
label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    # Add "unknown" category to the encoder
    merged_df[col] = merged_df[col].astype(str)  # Ensure all values are strings
    le.fit(list(merged_df[col].unique()) + ["unknown"])  # Add "unknown" to the encoder
    merged_df[col] = le.transform(merged_df[col])
    label_encoders[col] = le

scaler = StandardScaler()
scaler.fit(merged_df[numerical_features])

tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(merged_df['title_synopsis'])

# Function to preprocess opportunity data
def preprocess_opportunity_data(opportunity_data, titles_df, categorical_features, numerical_features, date_features, text_features, label_encoders, scaler, tokenizer, max_len=100):
    """
    Preprocess opportunity data and combine it with each title's features.
    Handles unseen categorical labels by mapping them to an "unknown" category.
    """
    # Preprocess opportunity data
    X_cat = []
    for col in categorical_features:
        le = label_encoders[col]
        # Handle unseen labels by mapping them to "unknown"
        if opportunity_data.get(col, "unknown") in le.classes_:
            X_cat.append(le.transform([opportunity_data.get(col, "unknown")])[0])
        else:
            # Map unseen labels to "unknown"
            X_cat.append(le.transform(["unknown"])[0])
    X_cat = np.array(X_cat).reshape(1, -1)  # Shape: (1, num_categorical_features)

    X_num = np.array([[opportunity_data.get(col, 0) for col in numerical_features]])
    X_num = scaler.transform(X_num)  # Scale numerical features

    for col in date_features:
        date_value = datetime.strptime(opportunity_data.get(col, "1970-01-01"), "%Y-%m-%d")
        days_since_reference = (date_value - datetime(1970, 1, 1)).days
        X_num = np.append(X_num, [[days_since_reference]], axis=1)  # Shape: (1, num_numerical_features + num_date_features)

    X_text = tokenizer.texts_to_sequences([opportunity_data.get(text_features[0], "")])
    X_text = pad_sequences(X_text, maxlen=max_len)  # Shape: (1, max_len)

    # Combine opportunity data with each title's features
    X_cat_all = np.tile(X_cat, (len(titles_df), 1))  # Repeat opportunity data for each title
    X_num_all = np.tile(X_num, (len(titles_df), 1))  # Repeat opportunity data for each title

    X_text_all = []
    for title_synopsis in titles_df['title_synopsis']:
        seq = tokenizer.texts_to_sequences([title_synopsis])
        padded_seq = pad_sequences(seq, maxlen=max_len)
        X_text_all.append(padded_seq[0])
    X_text_all = np.array(X_text_all)  # Shape: (num_titles, max_len)

    return X_cat_all, X_num_all, X_text_all

# Opportunity data
opportunity_data = {
    "opportunity_name": "Example Opportunity",
    "opportunity_geography_type": "Region",
    "opportunity_reporting_territory": "Territory",
    "opp_start_date": "2023-01-01",
    "opp_expected_close_date": "2023-12-31",
    "title_synopsis": "This is a synopsis of the title."
}

# Preprocess opportunity data
X_cat_all, X_num_all, X_text_all = preprocess_opportunity_data(
    opportunity_data, titles, categorical_features, numerical_features, date_features, text_features, label_encoders, scaler, tokenizer
)

# Make predictions
predictions = model.predict([X_cat_all, X_num_all, X_text_all])

# Add predictions to titles dataframe
titles['relevance_score'] = predictions

# Sort titles by relevance score
sorted_titles = titles.sort_values(by='relevance_score', ascending=False)

# Display top 10 titles
print("Top 10 Titles for the Opportunity:")
print(sorted_titles[['title_name', 'relevance_score']].head(10))

In [21]:
import os
import shutil
import boto3

# File paths
model_file = 'model.h5'
label_encoders_file = 'label_encoders.pkl'
package_dir = 'model_package'
tar_gz_file = 'model.tar.gz'
bucket_name = 'titlesbucket'
s3_model_path = f's3://{bucket_name}/opportunity-title-prediction/{tar_gz_file}'

# 1. Create a folder to package the model
if not os.path.exists(package_dir):
    os.makedirs(package_dir)

# 2. Copy the model and other necessary files to the package folder
shutil.copy(model_file, package_dir)
shutil.copy(label_encoders_file, package_dir)

# 3. Create a .tar.gz archive from the model_package folder
shutil.make_archive(tar_gz_file.replace('.tar.gz', ''), 'gztar', package_dir)

# 4. Upload the .tar.gz archive to S3
s3_client = boto3.client('s3')
s3_client.upload_file(tar_gz_file, bucket_name, f'opportunity-title-prediction/{tar_gz_file}')

# 5. Confirm the file is uploaded
print(f"Model archive uploaded to: {s3_model_path}")

# Clean up: Remove the temporary directory and archive
#shutil.rmtree(package_dir)
#os.remove(tar_gz_file)


Model archive uploaded to: s3://titlesbucket/opportunity-title-prediction/model.tar.gz


In [None]:
#NOOOOO
#role = sagemaker.get_execution_role()

# Upload model to S3
#agemaker_session = sagemaker.Session()
#bucket = 'titlesbucket'
#prefix = 'opportunity-title-prediction'
#model_data = sagemaker_session.upload_data(path='model.h5', bucket=bucket, key_prefix=prefix)


In [None]:
# NOOOOOOO
# Create SageMaker model
#role = sagemaker.get_execution_role()
#tensorflow_model = TensorFlowModel(model_data=model_data, role=role, framework_version='2.4.1')

# Deploy model
#predictor = tensorflow_model.deploy(initial_instance_count=1, instance_type='ml.m5.large')

#print('finished')
#print(predictor)

# TEST