In [43]:
import numpy as np
import pandas as pd
import seaborn as sns
import os, re

In [44]:
docs = []
dir = './Documents/Encoded'
for file in os.listdir(dir):
    print('Reading', file)
    df = pd.read_csv(os.path.join(dir,file), index_col=0)
    docs.append(df)
print('Done - %d documents'%len(docs))

Reading Antiques_and_Collectibles_with_text.csv
Reading Auctions_with_text.csv
Reading Children_with_text.csv
Reading Classifieds_with_text.csv
Reading Clothing_with_text.csv
Reading Consumer_Electronics_with_text.csv
Reading Crafts_with_text.csv
Reading Death_Care_with_text.csv
Reading Entertainment_with_text.csv
Reading Ethnic_and_Regional_with_text.csv
Reading Flowers_with_text.csv
Reading food_with_text.csv
Reading General_Merchandise_with_text.csv
Reading Gifts_with_text.csv
Reading Health_with_text.csv
Reading Holidays_with_text.csv
Reading Home_and_Garden_with_text.csv
Reading Jewelry_with_text.csv
Reading Music_with_text.csv
Reading Niche_with_text.csv
Reading Office_Products_with_text.csv
Reading Pets_with_text.csv
Reading Photography_with_text.csv
Reading Recreation_with_text.csv
Reading Sports_with_text.csv
Reading Tobacco_with_text.csv
Reading Tools_with_text.csv
Reading Travel_with_text.csv
Reading Vehicles_with_text.csv
Reading Visual_Arts_with_text.csv
Reading Weddings_w

In [45]:
data_all = pd.concat(docs)
data_all.head()

Unnamed: 0,url,label,http,text
0,retroplanet.com,Antiques and Collectibles/1950s Memorabilia,http://retroplanet.com,"Retro Planet ~ Tin Signs, Retro Decor, Diner F..."
1,hepcat.se,Antiques and Collectibles/1950s Memorabilia,http://hepcat.se,"HepCat Store - Clothing, shoes, hats and acces..."
2,barkclothhawaii.com,Antiques and Collectibles/1950s Memorabilia,http://barkclothhawaii.com,"Barkcloth Hawaii ""Vintage"" Tropical Hawaiian F..."
3,munktiki.com,Antiques and Collectibles/1950s Memorabilia,http://munktiki.com,MUNKTIKI Munktiki Munktiki now has two shoppin...
4,retrowonders.com,Antiques and Collectibles/1950s Memorabilia,http://retrowonders.com,"RetroWonders-Retro Jukeboxes, Radios, Record P..."


In [46]:
data_all.describe(include='all')[:4]

Unnamed: 0,url,label,http,text
count,44427,44427,44427,43555
unique,44327,3603,44327,40911
top,llbean.com,Jewelry/Body/Piercing,http://llbean.com,error
freq,3,50,3,1181


# Some Cleaning

In [47]:
threshold = 100
data_all = data_all[~data_all.text.isnull()]
data_all = data_all[data_all.text.str.len() > threshold]  # Page errors
data_all.describe(include='all')[:4]

Unnamed: 0,url,label,http,text
count,39542,39542,39542,39542
unique,39451,3548,39451,38906
top,llbean.com,Flowers/Florists/North America/United States/C...,http://llbean.com,Create an Ecommerce Website and Sell Online! E...
freq,3,50,3,48


In [48]:
data_all.drop_duplicates(subset=['text'], keep=False, inplace=True)
data_all.drop_duplicates(subset=['url'], keep='first', inplace=True)
data_all.describe(include='all')[:4]

Unnamed: 0,url,label,http,text
count,38662,38662,38662,38662
unique,38662,3538,38662,38662
top,addwarehouse.com,Gifts/Personalized/Trophies,http://dadant.com,Dave Wright Caricatures e:info@davewrightcarto...
freq,1,50,1,1


In [49]:
patterns = ['^30[0-9]', '^4[0-9][0-9]', '^5[0-9][0-9]']
for pattern in patterns:
    data_all = data_all[~data_all.text.str.match(pattern)]
data_all.describe(include='all')[:4]

Unnamed: 0,url,label,http,text
count,38554,38554,38554,38554
unique,38554,3538,38554,38554
top,addwarehouse.com,Flowers/Florists/North America/United States/C...,http://dadant.com,Dave Wright Caricatures e:info@davewrightcarto...
freq,1,50,1,1


In [50]:
error_keys = ['vB_Database_MySQLi', '404 error', '404 Error', '404 ERROR', '404 Not Found', '404 File not',
              'Error 404', 'error 404', 'we are closed', '.php could not be found', 'website is for sale', 'We are closed',
             'is expired', 'you are the owner of this domain', 'Closed for maintenance', 'closed for maintenance', 'Error 503',
             'website is under renovation', 'redesigning our site', 'this version of Internet Explorer', 'temporarily offline',
             'temporarily unavailable', 'Web Page is Unavailable', 'Website Maintenance', 'the previous website', 'We have moved',
             'browser does not have JavaScript', 'JavaScript disabled', 'Access denied', 'browser is not supported', 'Just another WordPress',
             'Welcome to WordPress', 'Hello World', 'File Not Found', 'File not found', 'DNS resolution error', 'Site Unavailable',
             'error occurred', 'featured domains', 'This domain is for sale', 'Oops, something is wrong', 'Website is Unavailable',
             'Web server is down', 'SITE ACTIVATION', 'Free Web Hosting', 'Website Is Unavailable', 'Domain Name Is Expired',
             'website hosted by WebFaction', 'turn JavaScript on', 'vB_Database_MySQLi', 'Error\s[0-9]+','[0-9]+\sError',
             '30[0-9]:', '4[0-9][0-9]:', '5[0-9][0-9]\:', 'you must be [0-9]{2} or older', 'Untitled', 'ERROR\s[0-9]+', 'Page Not Found'
             'Page not found', 'Not Found', '\.[a-z]{2,3} is available for sale']
for key in error_keys:
    data_all = data_all[~data_all.text.str.contains(key)]
data_all.describe(include='all')[:4]

Unnamed: 0,url,label,http,text
count,38003,38003,38003,38003
unique,38003,3529,38003,38003
top,addwarehouse.com,Gifts/Personalized/Trophies,http://dadant.com,Dave Wright Caricatures e:info@davewrightcarto...
freq,1,50,1,1


In [51]:
blacklist = ['JavaScript seems to be disabled in your browser',
             'You must have JavaScript enabled in your browser',
            'For the best experience on our site, be sure to turn on Javascript in your browser',
            'You must have JavaScript enabled in your browser to utilize the functionality of this website']
for s in blacklist:
    data_all.text = data_all.text.str.replace(s, '')

In [52]:
data_all.to_csv('./data_clean.csv', index=False)

In [53]:
# url = ''
# pd.set_option('display.max_colwidth', -1)
# print(data_all[data_all.url == url].text)
# data_all[data_all.url == url].text.str.contains('Error [0-9]+')

In [54]:
removed = []
urls = data_all.url.unique()
for idx, row in pd.concat(docs).iterrows():
    if row.url not in urls:
        removed.append(row.url)
df = pd.DataFrame(removed)
df.to_csv('removed.csv')

print(len(removed))
with open('removed.json', 'w') as file:
    file.write('{"removed_sites" : [')
    file.write(','.join(['"'+v.values[0]+'"' for idx, v in df.iterrows()]))
    file.write(']}')

6391
