IMPORTS

In [3]:
!pip install ijson
import pandas as pd
import os
from tqdm import tqdm
import ijson
import json
import gzip
import ijson
import json
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from google.colab import drive
# Make Google Drive visible to this runtime under /content/drive.
drive.mount('/content/drive')

# Drive locations used by the downsizing step: the truncated copy it writes,
# and the ads JSON file it reads from.
ads_downsized_path = '/content/drive/MyDrive/CS491MLMODEL/us/us/ads_downsized.json'
ads_dataset_path = '/content/drive/MyDrive/CS491MLMODEL/us/us/ads.json'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


DATA DOWNSIZING

In [11]:
# Maximum number of ads copied into the downsized file; it doubles as the
# progress-bar total.
json_item_heuristic = 767263
counter = 0

# The source is opened in binary mode, matching how the duplicate-removal step
# feeds files to ijson.
with open(ads_dataset_path, 'rb') as input_file:
    parser = ijson.items(input_file, 'item')

    # 'w' (not 'a'): re-running the cell must rewrite the file. Appending would
    # put a second JSON array after the first and make the file unparseable.
    with open(ads_downsized_path, 'w') as output_file:
        output_file.write('[\n')
        # Use tqdm to create a progress bar
        for item in tqdm(parser, total=int(json_item_heuristic), unit=" items"):
            # The separator is written *before* every item except the first, so
            # the array never ends with a trailing comma, even when the source
            # holds fewer items than the limit.
            if counter:
                output_file.write(',\n')
            json.dump(item, output_file)
            counter += 1
            # Stop as soon as the limit is reached, so exactly
            # json_item_heuristic items are written and no extra item is parsed.
            if counter >= json_item_heuristic:
                break

        output_file.write('\n]\n')

767264 items [05:44, 2227.86 items/s]                        


DATA DUPLICATE REMOVAL

In [None]:
import json
ads_non_duplicate_path = '/content/drive/MyDrive/CS491MLMODEL/us/us/ads_non_duplicate.json'


def _hashable_key(value):
    """Turn a decoded JSON value into something that can be stored in a set.

    Lists become tuples and dicts become sorted tuples of (key, value) pairs,
    recursively; every other value is returned unchanged.
    """
    if isinstance(value, list):
        return tuple(_hashable_key(element) for element in value)
    if isinstance(value, dict):
        return tuple(sorted((key, _hashable_key(inner)) for key, inner in value.items()))
    return value


def remove_duplicates(filename, column, output_filename, expected_items=767263):
    """Copy a JSON array of items, keeping one item per distinct `column` value.

    The input is streamed with ijson, so it is never held in memory as a whole.
    The first item seen for each distinct value of `column` is written to the
    output; later items with an equal value, and items that do not contain
    `column` at all, are dropped. The number of dropped items is printed.

    Args:
        filename: Path of the input file, a JSON array of objects.
        column: Key whose value decides whether two items are duplicates.
        output_filename: Path of the JSON array to write (overwritten).
        expected_items: Item count used only as the progress-bar total.
    """
    removed_content = 0
    written = 0
    unique_values = set()

    with open(filename, 'rb') as file:
        items = ijson.items(file, 'item')

        with open(output_filename, 'w') as outfile:
            outfile.write('[\n')
            for item in tqdm(items, total=int(expected_items), unit=" items"):
                if column not in item:
                    removed_content += 1
                    continue

                column_value = _hashable_key(item[column])
                if column_value in unique_values:
                    removed_content += 1
                    continue

                unique_values.add(column_value)
                # The separator precedes every item but the first, so the array
                # stays valid JSON however many items are kept.
                if written:
                    outfile.write(',\n')
                json.dump(item, outfile)
                written += 1
            outfile.write(']\n')
    print(f"Amount of ads that have been filtered: {removed_content}")


# De-duplicate on the ad text: the function now honours its `column` argument,
# so the real key has to be passed here.
remove_duplicates(ads_downsized_path, 'ad_creative_bodies', ads_non_duplicate_path)

JSON TO CSV CONVERSION

In [None]:
import pandas as pd

# Source JSON (output of the duplicate-removal step) and destination CSV.
ads_non_duplicate_path = '/content/drive/MyDrive/CS491MLMODEL/us/us/ads_non_duplicate.json'
ads_dataframe = '/content/drive/MyDrive/CS491MLMODEL/us/us/ads_dataframe.csv'

# Attributes copied from each ad, in CSV column order; modify as needed.
kept_attributes = (
    'ad_creation_time',
    'ad_creative_bodies',
    'currency',
    'impressions',
    'spend',
)

# Read the whole JSON array into memory.
with open(ads_non_duplicate_path, 'r') as file:
    data = json.load(file)

# One record per ad holding only the kept attributes; an ad missing any of
# them raises KeyError.
processed_data = [
    {attribute: item[attribute] for attribute in kept_attributes}
    for item in data
]

# Tabulate the records and persist them without the index column.
df = pd.DataFrame(processed_data)
df.to_csv(ads_dataframe, index=False)

DATAFRAME INFORMATION

In [None]:
# Read the CSV written by the conversion step back into a DataFrame.
df = pd.read_csv(ads_dataframe)

# Print the dtype pandas inferred for each column, then preview the first rows.
column_dtypes = df.dtypes
print(column_dtypes)
df.head(5)

CONVERT INTO DATAFRAME

In [None]:
import ast
def _parse_literal(cell):
    """Parse a CSV cell holding a Python literal; missing cells give None."""
    return ast.literal_eval(cell) if pd.notnull(cell) else None


def _first_body(cell):
    """Return the first entry of a parsed body list, or NaN if missing/empty."""
    bodies = _parse_literal(cell)
    return bodies[0] if bodies else float('nan')


def _bound(parsed_ranges, key):
    """Extract `key` from a Series of parsed range dicts as floats (NaN if absent)."""
    return parsed_ranges.apply(
        lambda bounds: bounds.get(key) if bounds else None
    ).astype("float")


def _midpoint(lower, upper):
    """Average two bound Series; a missing bound is replaced by the other one."""
    return lower.fillna(upper) / 2 + upper.fillna(lower) / 2


# Extract relevant values into new DataFrame
new_df = pd.DataFrame()

new_df['ad_creation_time'] = pd.to_datetime(df['ad_creation_time'])
# Only the first creative body is kept; missing or empty lists become 'nan'
# once the column is cast to str.
new_df['ad_creative_bodies'] = df['ad_creative_bodies'].apply(_first_body).astype(str)

# Each range column is parsed once and reused for both of its bounds.
impressions_parsed = df['impressions'].apply(_parse_literal)
spend_parsed = df['spend'].apply(_parse_literal)

# Extract 'lower_bound' and 'upper_bound' values if they exist, otherwise use NaN
new_df['impressions_lower'] = _bound(impressions_parsed, 'lower_bound')
new_df['impressions_upper'] = _bound(impressions_parsed, 'upper_bound')

new_df['spend_lower'] = _bound(spend_parsed, 'lower_bound')
new_df['spend_upper'] = _bound(spend_parsed, 'upper_bound')

# Display the new DataFrame's column types
print(new_df.dtypes)

# NOTE: this rebinds `ads_dataframe`, which until now held the CSV path. The
# name is kept because the training step reads the frame under it; re-running
# the CSV cells after this one requires re-assigning the path first.
ads_dataframe = pd.DataFrame()
ads_dataframe['ad_creation_time'] = new_df['ad_creation_time']
ads_dataframe['ad_creative_bodies'] = new_df['ad_creative_bodies']
# cpi = spend midpoint / impressions midpoint. Rows with no bounds at all give
# NaN, and a zero impressions midpoint gives inf (or NaN for 0/0).
ads_dataframe['cpi'] = (
    _midpoint(new_df['spend_lower'], new_df['spend_upper']) /
    _midpoint(new_df['impressions_lower'], new_df['impressions_upper'])
)
ads_dataframe

TRAIN ML MODEL

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import PowerTransformer
import nltk
nltk.download('punkt')

# Box-Cox is only defined for strictly positive inputs, and the regressor
# cannot be fitted on NaN/inf targets, so rows without a finite, positive cpi
# are dropped before anything is fitted.
cpi = ads_dataframe['cpi']
usable_rows = np.isfinite(cpi) & (cpi > 0)

X = ads_dataframe.loc[usable_rows, 'ad_creative_bodies'].astype(str)
y = cpi[usable_rows]

# Tokenize your text data
tokenized_text = X.apply(word_tokenize)

# Split BEFORE fitting anything, so that neither the embedding nor the target
# transform learns from the held-out rows.
X_train, X_test, y_train_raw, y_test_raw = train_test_split(
    tokenized_text, y, test_size=0.2, random_state=42
)

# Train Word2Vec model on the training documents only; test-set words it has
# never seen are skipped when document vectors are averaged.
word2vec_model = Word2Vec(sentences=X_train.tolist(), vector_size=100, window=5, min_count=1, workers=4)

# Fit the Box-Cox transform on the training targets, then apply that fitted
# transform to the test targets.
pt_y = PowerTransformer(method='box-cox')
y_train = pt_y.fit_transform(y_train_raw.values.reshape(-1, 1)).flatten()
y_test = pt_y.transform(y_test_raw.values.reshape(-1, 1)).flatten()

# All usable targets on the transformed scale (kept under its previous name).
y_transformed = pt_y.transform(y.values.reshape(-1, 1)).flatten()

# Mean embedding of the in-vocabulary words of one document
def average_word_vectors(words, model, vocabulary, num_features):
    """Average the vectors of the words of a document that are in `vocabulary`.

    Args:
        words: Iterable of tokens making up the document.
        model: Object exposing `model.wv[word]` vector lookup.
        vocabulary: Container tested with `in` to decide which words count.
        num_features: Length of the returned vector.

    Returns:
        A float64 array of length `num_features`: the mean of the matched word
        vectors, or all zeros when no word of the document is in `vocabulary`.
    """
    running_sum = np.zeros((num_features,), dtype="float64")
    matched = 0.

    for token in words:
        if token not in vocabulary:
            continue
        matched += 1.
        running_sum = np.add(running_sum, model.wv[token])

    # With no matched word the zero vector is returned as is (no division).
    return np.divide(running_sum, matched) if matched else running_sum

# Function to generate document vectors using average word vectors
def generate_doc_vectors(docs, model, num_features):
    """Build one averaged word vector per document.

    Args:
        docs: Iterable of tokenized documents (each an iterable of words).
        model: Trained model exposing `model.wv` (gensim 4 KeyedVectors API).
        num_features: Dimensionality of the word vectors.

    Returns:
        A 2-D array with one row per document and `num_features` columns;
        shape `(0, num_features)` when `docs` is empty.
    """
    # key_to_index is a dict, so each membership test is a hash lookup. The
    # index_to_key list would be scanned linearly for every word.
    vocabulary = model.wv.key_to_index
    doc_vectors = [average_word_vectors(doc, model, vocabulary, num_features) for doc in docs]
    if not doc_vectors:
        # Keep the result 2-D so callers can rely on the column count.
        return np.empty((0, num_features), dtype="float64")
    return np.array(doc_vectors)

# Generate document vectors for training and testing sets, sized to the
# embedding dimension the Word2Vec model was trained with.
num_features = word2vec_model.vector_size
X_train_word2vec = generate_doc_vectors(X_train, word2vec_model, num_features)
X_test_word2vec = generate_doc_vectors(X_test, word2vec_model, num_features)

# Fit the regressor on the Box-Cox-transformed targets.
gb_regressor = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gb_regressor.fit(X_train_word2vec, y_train)

predictions_transformed = gb_regressor.predict(X_test_word2vec)

# y_test is on the transformed scale, so it is mapped back to the original
# scale together with the predictions. Scoring transformed targets against
# inverse-transformed predictions would compare two different units.
predictions = pt_y.inverse_transform(predictions_transformed.reshape(-1, 1)).flatten()
y_test_original = pt_y.inverse_transform(np.asarray(y_test).reshape(-1, 1)).flatten()

mse_original = mean_squared_error(y_test_original, predictions)
r2_original = r2_score(y_test_original, predictions)

print(f"Mean Squared Error (Original): {mse_original}")
print(f"R-squared (Original): {r2_original}")