VECTORIZE TEXTUAL CONTENT: FUNCTION DEFINITIONS

In [None]:
def tokenize_and_remove_stopwords(text):
    """Split ``text`` into lowercase alphabetic tokens, dropping stop words.

    Relies on two names defined outside this function: ``word_tokenize``
    (the tokenizer) and ``stop_words`` (the collection of words to discard).

    Args:
        text: The string to tokenize.

    Returns:
        A list of lowercased tokens that are purely alphabetic and are not
        members of ``stop_words``.
    """
    kept = []
    for raw_token in word_tokenize(text):
        # Skip numbers, punctuation and mixed tokens such as "covid19".
        if not raw_token.isalpha():
            continue
        lowered = raw_token.lower()
        if lowered in stop_words:
            continue
        kept.append(lowered)
    return kept

def average_word_vectors(words, model, vocabulary, num_features):
    """Average the embedding vectors of the in-vocabulary words of one document.

    Args:
        words: Iterable of tokens making up the document.
        model: Object whose ``model.wv[word]`` lookup yields the vector for a word.
        vocabulary: Container supporting ``in``; only words found in it are used.
        num_features: Length of the returned vector.

    Returns:
        A float64 array of length ``num_features`` holding the mean of the
        matched word vectors, or all zeros when no word is in ``vocabulary``.
    """
    running_sum = np.zeros((num_features,), dtype="float64")
    matched = 0.

    for token in words:
        if token not in vocabulary:
            continue
        matched += 1.
        running_sum = np.add(running_sum, model.wv[token])

    # No known words: return the zero vector rather than dividing by zero.
    if not matched:
        return running_sum

    return np.divide(running_sum, matched)

def generate_doc_vectors(docs, model, num_features):
    """Build one averaged word-vector row per tokenized document.

    Args:
        docs: Iterable of token lists, one per document.
        model: Trained embedding model exposing ``model.wv.index_to_key``
            (the known words) and ``model.wv[word]`` lookups.
        num_features: Length of each word vector, and of each output row.

    Returns:
        A 2-D array with one row per document and ``num_features`` columns.
        An empty ``docs`` yields an array of shape ``(0, num_features)``.
    """
    # index_to_key is a list, so testing membership against it directly scans
    # the whole vocabulary for every word. Build a set once for O(1) lookups.
    vocabulary = set(model.wv.index_to_key)
    doc_vectors = [average_word_vectors(doc, model, vocabulary, num_features) for doc in docs]

    # np.array([]) would collapse to shape (0,); keep the column dimension so
    # downstream code always receives a 2-D feature matrix.
    if not doc_vectors:
        return np.zeros((0, num_features), dtype="float64")

    return np.array(doc_vectors)

In [None]:
# Fetch the NLTK resources used for tokenization and stop-word filtering.
nltk.download('punkt')
# Newer NLTK releases load the tokenizer tables from 'punkt_tab' instead of
# the pickled 'punkt' package, so fetch both to work on either version.
nltk.download('punkt_tab')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

# Dimensionality of the word embeddings. The Word2Vec model and the
# document-vector builder must agree on it, so it is defined once here.
VECTOR_SIZE = 100

# Determine the split index: the first 80% of rows (in their existing order,
# no shuffling) become the training set and the remainder the test set.
split_index = int(0.8 * len(ads_dataframe))

train_data = ads_dataframe.iloc[:split_index]
test_data = ads_dataframe.iloc[split_index:]

# Replace missing ad bodies with an empty string before casting: a bare
# astype(str) turns NaN into the literal text "nan", which would survive
# tokenization and be learned as a real word.
X_train = train_data['ad_creative_bodies'].fillna('').astype(str)
y_train = train_data['cpi']

X_test = test_data['ad_creative_bodies'].fillna('').astype(str)
y_test = test_data['cpi']

# Tokenization and stopword removal
tokenized_text_train = X_train.apply(tokenize_and_remove_stopwords)
tokenized_text_test = X_test.apply(tokenize_and_remove_stopwords)

# Word2Vec model training (fitted on the training split only, so test-set
# words outside the training vocabulary are ignored when averaging)
word2vec_model = Word2Vec(sentences=tokenized_text_train, vector_size=VECTOR_SIZE, window=5, min_count=1, workers=4)

# Generate document vectors
X_train_word2vec = generate_doc_vectors(tokenized_text_train, word2vec_model, VECTOR_SIZE)
X_test_word2vec = generate_doc_vectors(tokenized_text_test, word2vec_model, VECTOR_SIZE)

# Keep a copy of the trained embeddings in the current working directory.
word2vec_model.save("word2vec_model.model")

In [None]:
# Persist the trained Word2Vec model under the project folder
# (presumably a mounted Google Drive; confirm the mount exists before running).
word2vec_model.save(
    '/content/drive/MyDrive/CS491MLMODEL/us/us/word2vec_model.model'
)

TRAIN ML MODEL

In [None]:
# Model training: fit a gradient boosting regressor that predicts `cpi`
# from the averaged Word2Vec document vectors.
gb_regressor = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
).fit(X_train_word2vec, y_train)

SAVE MODEL

In [None]:
# Serialize the fitted regressor so it can be reloaded later without retraining.
joblib.dump(
    gb_regressor,
    '/content/drive/MyDrive/CS491MLMODEL/us/us/gradient_boosting_model.pkl',
)

LOAD MODEL (RUN IF YOU DO NOT WANT TO TRAIN)

In [None]:
# Restore the previously saved regressor instead of retraining it.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load model files that come from a trusted source.
gb_regressor = joblib.load(
    '/content/drive/MyDrive/CS491MLMODEL/us/us/gradient_boosting_model.pkl'
)