In [None]:
import ijson
from tqdm import tqdm
import pandas as pd
import os
import json
import numpy as np
import ast


# Every dataset artefact lives in the same Google Drive folder; keeping the
# folder in one place means a relocation only needs a single edit.
ADS_DATA_DIR = '/content/drive/MyDrive/CS491MLMODEL/us/us'

# Source ad dataset (streamed as a top-level JSON array by the cells below).
ads_dataset_path = f'{ADS_DATA_DIR}/ads.json'
# Truncated copy of the source dataset.
ads_downsized_path = f'{ADS_DATA_DIR}/ads_downsized.json'
# Downsized data after duplicate removal.
ads_non_duplicate_path = f'{ADS_DATA_DIR}/ads_non_duplicate.json'
# CSV holding the attributes selected from the de-duplicated JSON.
# NOTE: a later cell rebinds `ads_dataframe` to a DataFrame, so re-run this
# cell before re-running any cell that needs the CSV path.
ads_dataframe = f'{ADS_DATA_DIR}/ads_dataframe.csv'
# CSV path for the age-filtered data (not used by the cells visible here).
ads_dataframe_age_filtered = f'{ADS_DATA_DIR}/ads_dataframe_age_filtered.csv'

DATA DOWNSIZING

In [None]:
# Number of ads requested for the downsized file; downsize_data also uses it
# as the progress-bar total.
json_item_heuristic = 3_000_000

def downsize_data(num_of_items, input_dataset_path, output_dataset_path):
    """Copy the first ``num_of_items`` items of a JSON array into a new file.

    The input is streamed with ``ijson`` so the whole dataset never has to be
    loaded at once. The output is always a complete, valid JSON array, also
    when the input holds fewer than ``num_of_items`` items.

    Args:
        num_of_items: Maximum number of items to copy (also the progress-bar
            total). Values <= 0 produce an empty array.
        input_dataset_path: Path of a JSON file whose top-level value is an
            array.
        output_dataset_path: Path of the JSON file to create. An existing file
            is overwritten, so re-running the cell cannot append a second
            array to a previous result.
    """
    limit = int(num_of_items)

    # 'rb' matches how remove_duplicates feeds ijson; 'w' (not 'a') makes the
    # function idempotent across re-runs.
    with open(input_dataset_path, 'rb') as input_file, \
            open(output_dataset_path, 'w') as outfile:
        outfile.write('[\n')
        written = 0
        if limit > 0:
            parser = ijson.items(input_file, 'item')
            for item in tqdm(parser, total=limit, unit=" items"):
                # The separator goes *before* every item except the first, so
                # the array never ends with a dangling comma.
                if written:
                    outfile.write(',\n')
                json.dump(item, outfile)
                written += 1
                # Stop as soon as the quota is reached instead of pulling (and
                # writing) one extra item from the parser.
                if written >= limit:
                    break
        outfile.write('\n]\n')

# Write the truncated copy of the source dataset that the later cells read.
downsize_data(
    num_of_items=json_item_heuristic,
    input_dataset_path=ads_dataset_path,
    output_dataset_path=ads_downsized_path,
)

DATA DUPLICATE REMOVAL

In [None]:
def remove_duplicates(filename, column, output_filename, total=None):
    """Stream a JSON array and keep only the first item per ``column`` value.

    An item is written to ``output_filename`` only when all of these hold:

    * it has a non-null entry under ``column``,
    * that entry contains exactly one element,
    * no earlier item carried the same entry.

    Every other item is counted as filtered; the count is printed at the end.

    Args:
        filename: Path of the input JSON file (top-level array).
        column: Key whose value identifies duplicates.
        output_filename: Path of the JSON array file to write (overwritten if
            it already exists).
        total: Optional expected number of input items. It only sizes the
            progress bar; when omitted the bar shows a running count.
    """
    removed_content = 0
    unique_values = set()

    with open(filename, 'rb') as file, open(output_filename, 'w') as outfile:
        outfile.write('[\n')
        for item in tqdm(ijson.items(file, 'item'), total=total, unit=" items"):
            raw_value = item[column] if column in item else None
            # A missing or null entry is treated like an empty one and dropped.
            column_value = tuple(raw_value) if raw_value is not None else ()

            # Length is checked first so entries that are dropped anyway are
            # never hashed.
            if len(column_value) != 1 or column_value in unique_values:
                removed_content += 1
                continue

            # `unique_values` is non-empty exactly when something was already
            # written, so it doubles as the "need a separator" flag.
            if unique_values:
                outfile.write(',\n')
            unique_values.add(column_value)
            json.dump(item, outfile)
        outfile.write(']\n')
    print(f"Amount of ads that have been filtered: {removed_content}")


# De-duplicate on the ad text; this is the key the function used implicitly
# before `column` was honoured.
remove_duplicates(ads_downsized_path, 'ad_creative_bodies', ads_non_duplicate_path)

JSON TO CSV CONVERSION

In [None]:
# Attributes carried over from each ad into the CSV, in column order.
csv_columns = ('ad_creation_time', 'ad_creative_bodies', 'currency',
               'impressions', 'spend')

# The de-duplicated file is loaded in full here (no streaming).
with open(ads_non_duplicate_path, 'r') as file:
    data = json.load(file)

# One flat record per ad; an ad lacking any of the attributes raises KeyError.
processed_data = [{name: item[name] for name in csv_columns} for item in data]

df = pd.DataFrame(processed_data)
df.to_csv(ads_dataframe, index=False)

LOAD DATAFRAME & GET INFORMATION

In [None]:
# Reload the CSV written by the conversion cell, so the following cells work
# on exactly what was persisted to disk.
df = pd.read_csv(ads_dataframe)
# Column dtypes as inferred by pandas on read.
print(df.dtypes)
# Preview of the first rows (cell output).
df.head()

PARSE COLUMNS INTO A TYPED DATAFRAME

In [None]:
def _parse_cell(cell):
    """Parse a stringified Python literal from the CSV; a missing cell gives None."""
    if pd.isnull(cell):
        return None
    return ast.literal_eval(cell)


def _extract_bound(parsed_column, key):
    """Return ``key`` from each parsed dict as a float Series (NaN when absent)."""
    return parsed_column.apply(
        lambda parsed: None if parsed is None else parsed.get(key)
    ).astype("float")


new_df = pd.DataFrame()

new_df['ad_creation_time'] = pd.to_datetime(df['ad_creation_time'])
# Each body cell is a stringified list; keep its first element as text.
new_df['ad_creative_bodies'] = df['ad_creative_bodies'].apply(lambda x: ast.literal_eval(x)[0] if pd.notnull(x) else x).astype(str)

# Parse every range cell once, then read both bounds from the parsed value.
# A missing cell or a missing bound becomes NaN instead of raising.
impressions_parsed = df['impressions'].apply(_parse_cell)
spend_parsed = df['spend'].apply(_parse_cell)

new_df['impressions_lower'] = _extract_bound(impressions_parsed, 'lower_bound')
new_df['impressions_upper'] = _extract_bound(impressions_parsed, 'upper_bound')

new_df['spend_lower'] = _extract_bound(spend_parsed, 'lower_bound')
new_df['spend_upper'] = _extract_bound(spend_parsed, 'upper_bound')

print(new_df.dtypes)

new_df.head()

MERGE SPEND AND IMPRESSION COLUMNS INTO CPI AND SORT BY DATE

In [None]:
def _range_midpoint(lower, upper):
    """Average of two bound Series; a missing bound is replaced by the other one."""
    return lower.fillna(upper) / 2 + upper.fillna(lower) / 2


spend_mid = _range_midpoint(new_df['spend_lower'], new_df['spend_upper'])
impressions_mid = _range_midpoint(new_df['impressions_lower'],
                                  new_df['impressions_upper'])

# NOTE(review): this rebinds `ads_dataframe`, which held the CSV path until
# now; cells that pass it to read_csv/to_csv need the path cell re-run first.
ads_dataframe = (
    new_df[['ad_creation_time', 'ad_creative_bodies']]
    # cpi = spend midpoint divided by impressions midpoint
    .assign(cpi=spend_mid / impressions_mid)
    .sort_values(by='ad_creation_time')
)
ads_dataframe