In [1]:
import pandas as pd 
import glob
import re

from webhoseQueries import *

In [4]:
path ='../similarsites_urls_data'

allFiles = glob.glob(path + "/*.csv")
frame = pd.DataFrame()
list_ = []
for file_ in allFiles:
    df = pd.read_csv(file_,index_col=None, header=0)
    list_.append(df)
frame = pd.concat(list_).reset_index(drop=True)

# url list

In [7]:
# rename columns correctly
frame.columns = ['rank', 'website', 'category', 'change', 'avg_visit_duration', 'pages/visit', 'bounce_rate']

# clean domain 
frame['website'] = frame['website'].map(lambda x: re.search('([a-z0-9]*\.)+[a-z0-9]*',x).group(0))

# split categories and subcategories
frame[['category', 'subcategory']] = pd.DataFrame(frame.category.str.split(' > ').tolist(), columns = ['category','subcatergory'])

# percentage string to float/100
frame['bounce_rate'] = frame['bounce_rate'].map(lambda x: float(str(x).strip('%'))/100)

# datetime for duration ???


# drop duplicate, for now they have the same categories so it's ok to do so
frame = frame.drop_duplicates(['website'])

In [4]:
# to check if we have duplicate sites with different categories
frame[frame.duplicated(['website'], keep=False) & ~ frame.duplicated(['category'],keep=False)].sort_values('website')

Unnamed: 0,rank,website,category,change,avg_visit_duration,pages/visit,bounce_rate,subcategory


In [5]:
# load csv articles obtained with this list of websites
df_articles = pd.DataFrame.from_csv('../Datasets/articles.csv', sep='\t', encoding='utf-8')

  


In [8]:
# join the 2 dataframes together
result = pd.merge(df_articles, frame, left_on = 'site', right_on = 'website')

In [7]:
result.groupby(result.category).count()

Unnamed: 0_level_0,site,site_type,site_section,site_categories,domain_rank,country,author,published,title,text,...,persons_neu,persons_neg,rank,website,category,change,avg_visit_duration,pages/visit,bounce_rate,subcategory
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Food and Drink,992,992,777,992,840,992,736,992,938,992,...,992,992,992,992,992,992,992,992,992,927
Health,1847,1847,1384,1847,1721,1845,1181,1847,1577,1847,...,1847,1847,1847,1847,1847,1847,1847,1847,1847,1073
News and Media,7813,7813,6743,7741,7559,7700,5330,7741,6070,7740,...,7741,7741,7813,7813,7813,7813,7813,7813,7813,5612
Science,1569,1569,1355,1541,1540,1492,952,1541,1538,1541,...,1541,1541,1569,1569,1569,1569,1569,1569,1569,735


In [9]:
result.groupby(result.subcategory).count()

Unnamed: 0_level_0,site,site_type,site_section,site_categories,domain_rank,country,author,published,title,text,...,persons_neu,persons_neg,rank,website,category,change,avg_visit_duration,pages/visit,bounce_rate,subcategory
subcategory,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Biology,266,266,238,266,265,265,233,266,266,266,...,266,266,266,266,266,266,266,266,266,266
Chemistry,216,216,215,216,216,216,216,216,216,216,...,216,216,216,216,216,216,216,216,216,216
Child Health,1756,1756,1756,1756,1756,1756,1740,1756,266,1756,...,1756,1756,1756,1756,1756,1756,1756,1756,1756,1756
Conditions and Diseases,100,100,76,100,100,98,73,100,98,100,...,100,100,100,100,100,100,100,100,100,100
Cooking and Recipes,1022,1022,937,1022,922,1022,938,1022,601,1022,...,1022,1022,1022,1022,1022,1022,1022,1022,1022,1022
Education and Resources,100,100,100,100,100,100,0,100,100,100,...,100,100,100,100,100,100,100,100,100,100
Environment,119,119,116,119,119,119,89,119,119,119,...,119,119,119,119,119,119,119,119,119,119
Food and Grocery Retailers,11,11,0,11,11,11,0,11,11,11,...,11,11,11,11,11,11,11,11,11,11
Healthcare Industry,1,1,0,1,1,1,0,1,1,1,...,1,1,1,1,1,1,1,1,1,1
Magazines and E Zines,172,172,160,172,172,172,123,172,81,172,...,172,172,172,172,172,172,172,172,172,172


In [9]:
# filter by country that have most of their articles in english (derived manually)

english_country = ['FR', 'US', 'CH', 'EU', 'GB',
                   'CA', 'PW', 'ID', 'ES', 'LI',
                   'IL', 'IN', 'VN', 'NL', 'TW', 
                   'HR', 'KS', 'HK', 'KR', 'SG',
                   'AU', 'DK', 'NO']

result_english = result[result.country.isin(english_country)]

In [10]:
# save the frame 
frame.to_csv('../Datasets/urls_list_with_labels.csv', sep='\t', encoding='utf-8')
result_english.to_csv('../Datasets/articles_filter_en.csv', sep='\t', encoding='utf-8')

# articles query
This part creates all jsons with articles in SciNewsClassification\webhose_data
DO NOT LAUNCH IT, BECAUSE IT WILL USE ALL QUERIES FROM WEBHOSE !!!!

In [11]:
def fetch_all_articles():
    for domain in frame.website:
        print(domain)
        get_pages_into_json(domain)

In [None]:
get_pages_into_json('', 1000)

# TEST

In [12]:
import newspaper
def articles_from_domain(domain):
    papers = newspaper.build(domain, language='en')
    return papers.articles
    

In [13]:
for domain in frame.website:
    print(domain)
    papers = newspaper.build('http://www.veggieboards.com/', language='en')
    print(papers.size())


happycow.net
0
irrawaddy.com
0
veganricha.com
0
froh.de
0
veggieboards.com
0
rawdd.com
0
stadionowioprawcy.net
0
vegansociety.com
0
vebu.de
0
chooseveg.com
0
farmfreshtoyou.com
0
se.com.br
0
holycowvegan.net
0
vegnews.com
0
msrawytop.com
0
vegetarisch.de
0
veganuary.com
0
vegan.com
0
ilovevegan.com
0
vegetarian.ru
0
vegancoach.com
0
gesund.de
0
theflamingvegan.com
0
suiis.com
0
biserawalpindi.edu.pk
0
vegetarianismo.net
0
brandnewvegan.com
0
vegweb.com
0
veganessentials.com
0
cntvan.com
0
veggiecommunity.org
0
vrg.org
0
followyourheart.com
0
mania.com
0
veganbodybuilding.com
0
raws.org
0
sunwarrior.com
0
cantinhovegetariano.com.br
0
veganfoodlover.com
0
veganstart.de
0
rawan.net
0
job.lzu.edu.cn
0
deutschlandistvegan.de
0
vegancuts.com
0
vegsoc.org
0
veganguerilla.de
0
vegpool.de
0
vegfest.co.uk
0
veg.by
0
cookpad.com
0
tabelog.com
0
hotpepper.jp
0
tesco.com
0
dianping.com
0
asda.com
0
allrecipes.com
0
zomato.com
0
chefkoch.de
0
pizzahut.com
0
dominos.com
0
tudogostoso.com.br
0
gnavi.c