# Imports 

In [44]:
import pandas as pd

import urlexpander

from ast import literal_eval

import json

from tqdm import tqdm
tqdm.pandas()

# Settings and Configuration 

In [69]:
# Input CSV that the notebook loads into `data`.
SRC_DATA = '../../data/webpage_data/full_flipside_data.csv'
# Destination of the cleaned CSV written in the last cell.
DATA_OUT = '../../data/webpage_data/full_flipside_data_clean.csv'
# JSON file receiving the {original URL: resolved URL} mapping.
URL_MAP_FP = '../../data/webpage_data/url_map.json'

# URL_BUFF_LEN is the slice length used to build the per-row candidate list
# ('top_arts'); it is larger than the final count so that URLs dropped because
# of resolution errors can be replaced by later candidates.
URL_BUFF_LEN = 10 # Number of urls to include in expansion buffer (in case of resolution errors)
# NOTE(review): presumably the number of URLs to keep per row in the cleaned
# output — confirm it matches the final truncation length.
MAX_URLS     = 5

# Data Loading and Cleaning

In [5]:
# Load the raw dataset; the following cells read its 'linked_arts' column.
data = pd.read_csv(SRC_DATA)

In [10]:
# 'linked_arts' arrives as text: swap newline separators for commas so each
# cell can be parsed as a Python literal (presumably a list of URLs), then parse.
linked_arts_text = data['linked_arts'].str.replace('\n', ', ', regex=False)
data['linked_arts'] = linked_arts_text.apply(literal_eval)

In [12]:
# Keep only the first URL_BUFF_LEN links of every row as resolution candidates.
data['top_arts'] = data['linked_arts'].apply(
    lambda links: links[:URL_BUFF_LEN]
)

# URL Resolution 

Determine which URLs need to be expanded and build a mapping from each original URL to its resolved form.

In [25]:
def should_expand(url):
    """Tell whether ``url`` has to be resolved before it can be used.

    A URL qualifies when it contains the
    ``theflipside.us15.list-manage.com`` host, or, failing that, when
    ``urlexpander.is_short`` reports it as a shortened link.

    Args:
        url: The URL string to inspect.

    Returns:
        ``True`` for list-manage links; otherwise whatever
        ``urlexpander.is_short(url)`` returns.
    """
    # The substring test comes first so urlexpander is only consulted when
    # the URL is not a list-manage link.
    if 'theflipside.us15.list-manage.com' in url:
        return True
    return urlexpander.is_short(url)

In [27]:
# Accumulator for the URLs that need resolving; filled by the filtering loop
# a couple of cells further down.
to_res_urls = []

In [28]:
# Flatten the per-row candidate lists into one list of URLs (row order kept).
all_urls = []
for row_urls in data['top_arts'].values:
    all_urls.extend(row_urls)

In [29]:
# Collect every candidate URL that should_expand() flags for resolution.
to_res_urls.extend(
    candidate for candidate in all_urls if should_expand(candidate)
)

In [31]:
# Resolve the collected URLs with urlexpander.
# NOTE(review): presumably returns one result per input URL in the same order
# (the mapping cell zips the two lists together), and failed lookups appear to
# come back as strings containing '_ERROR_' (map_urls filters on that marker)
# — confirm against the urlexpander documentation.
res_urls = urlexpander.expand(to_res_urls)

In [34]:
# Pair each original URL with its resolved counterpart (later duplicates win).
url_map = dict(zip(to_res_urls, res_urls))

In [35]:
# Persist the original -> resolved URL mapping to disk as JSON.
with open(URL_MAP_FP, 'w') as map_file:
    json.dump(url_map, fp=map_file)

# URL Mapping 

In [62]:
def map_urls(urls, url_map):
    """Translate a list of URLs through ``url_map``.

    URLs that are not keys of ``url_map`` are passed through unchanged.
    URLs that are keys are replaced by their mapped value, unless that value
    contains the ``'_ERROR_'`` marker, in which case the URL is dropped from
    the result entirely.

    Args:
        urls: Iterable of URL strings, in the order they should be kept.
        url_map: Mapping from original URL to resolved URL (or to an error
            string containing ``'_ERROR_'``).

    Returns:
        A new list with the translated URLs, preserving input order.
    """
    kept = []
    for original in urls:
        # Unmapped URLs never went through resolution, so keep them as-is
        # without applying the error-marker check.
        if original not in url_map:
            kept.append(original)
            continue

        resolved = url_map[original]
        if '_ERROR_' not in resolved:
            kept.append(resolved)

    return kept

In [63]:
# Swap each row's candidate URLs for their resolved forms; progress_apply is
# the tqdm-instrumented variant of apply and shows a progress bar.
data['mapped_urls'] = data['top_arts'].progress_apply(
    lambda urls: map_urls(urls, url_map)
)

100%|████████████████████████████████████████████████████████████████████████████| 977/977 [00:00<00:00, 108577.81it/s]


In [65]:
# Trim every row to at most MAX_URLS resolved links. The configured constant
# replaces the previous literal 5 so the cut-off is controlled from the
# settings cell rather than duplicated here.
data['linked_arts_clean'] = data['mapped_urls'].apply(
    lambda urls: urls[:MAX_URLS]
)

In [67]:
# Remove the intermediate helper columns before saving.
data = data.drop(columns=['top_arts', 'mapped_urls'])

In [70]:
# Write the cleaned dataset without the row index. `index` is documented as a
# bool, so pass False explicitly instead of relying on None being falsy.
data.to_csv(DATA_OUT, index=False)