In [1]:
from mobile import get_brands
import os
import glob
import pandas as pd

In [2]:
all_brands = get_brands()

In [3]:
all_brands.head()

Unnamed: 0,url,brand,n_models
0,https://www.gsmarena.com/acer-phones-59.php,Acer,100
1,https://www.gsmarena.com/alcatel-phones-5.php,alcatel,378
2,https://www.gsmarena.com/allview-phones-88.php,Allview,134
3,https://www.gsmarena.com/amazon-phones-76.php,Amazon,17
4,https://www.gsmarena.com/amoi-phones-28.php,Amoi,47


In [4]:
expected_n_models = all_brands.n_models.sum()
n_brands = all_brands.shape[0]
print (expected_n_models, n_brands)

9505 110


In [5]:
if os.path.isdir('model_attributes'):
    print ('Results exist!')

Results exist!


In [68]:
all_results = glob.glob('model_attributes/*.json')
 # number of results matches brands
assert len(all_results) == n_brands, "Wrong number of brands"

In [69]:
from pandas.io.json import json_normalize
import json

test_fn = 'model_attributes/Acer_models.json'
io = open(test_fn, 'r')
to_json = [json.loads(line) for line in io] # new line per record
test_df = json_normalize(to_json)
# check acer brands
(test_df.shape, all_brands.loc[all_brands.brand == 'Acer'].n_models.values[0]) # matching n.models

((100, 195), 100)

# Combining all data

In [35]:
from functools import reduce

def read_fp(fn): return [json.loads(line) for line in open(fn, 'r') if line]

all_records = list(map(read_fp, all_results))

# Mistakenly hardcoded brand name, stretching to work with zip, making amendments on fly
brand_names = [x.split('/')[-1].replace('_models.json','') for x in all_results]
stretched_brands = [[b] * len(objs) for b,objs in zip(brand_names, all_records)]
stretched_brands = [x for y in stretched_brands for x in y]

all_records = reduce(lambda x,y: x+y, all_records) # combining all lists
all_records = [{**r, **{'brand' : brand}} for r,brand in zip(all_records, stretched_brands)]

In [70]:
normalized = json_normalize(all_records) # flattens dictionaries into df

In [72]:
normalized.loc[normalized.err.notnull()].groupby(['err','brand']).size() # failed requests

err                                                  brand  
<urlopen error [Errno 54] Connection reset by peer>  O          1
                                                     OnePlus    1
                                                     Oppo       2
                                                     Orange     2
                                                     Samsung    1
Remote end closed connection without response        Oppo       1
dtype: int64

In [48]:
results_df = normalized.groupby('brand').size().rename('results_n').reset_index()
final_df = pd.merge(results_df, all_brands, on='brand')[['brand', 'url', 'results_n', 'n_models']]

In [74]:
final_df

Unnamed: 0,brand,url,results_n,n_models
0,AT&T;,https://www.gsmarena.com/at&t-phones-57.php,4,4
1,Acer,https://www.gsmarena.com/acer-phones-59.php,100,100
2,Allview,https://www.gsmarena.com/allview-phones-88.php,134,134
3,Amazon,https://www.gsmarena.com/amazon-phones-76.php,17,17
4,Amoi,https://www.gsmarena.com/amoi-phones-28.php,47,47
5,Apple,https://www.gsmarena.com/apple-phones-48.php,63,63
6,Archos,https://www.gsmarena.com/archos-phones-90.php,39,39
7,Asus,https://www.gsmarena.com/asus-phones-46.php,169,169
8,BLU,https://www.gsmarena.com/blu-phones-67.php,279,279
9,BQ,https://www.gsmarena.com/bq-phones-108.php,20,20


In [75]:
final_df.head()
print (final_df.shape) # all brands accounted for

(110, 4)


# Missing models

In [76]:
final_df.columns

Index(['brand', 'url', 'results_n', 'n_models'], dtype='object')

In [77]:
# fail safe for additional models added, must be something wrong with pages
mask = final_df['results_n'] != final_df['n_models']
final_df.where(mask).dropna(axis=0, how='all')

Unnamed: 0,brand,url,results_n,n_models
35,Huawei,https://www.gsmarena.com/huawei-phones-58.php,288.0,289.0
45,Lenovo,https://www.gsmarena.com/lenovo-phones-73.php,180.0,181.0
55,Motorola,https://www.gsmarena.com/motorola-phones-4.php,459.0,461.0
59,Nokia,https://www.gsmarena.com/nokia-phones-1.php,465.0,467.0
61,O,https://www.gsmarena.com/o2-phones-30.php,45.0,245.0
62,OnePlus,https://www.gsmarena.com/oneplus-phones-95.php,8.0,9.0
65,Palm,https://www.gsmarena.com/palm-phones-27.php,16.0,17.0
77,Samsung,https://www.gsmarena.com/samsung-phones-9.php,1155.0,1157.0
83,Sony,https://www.gsmarena.com/sony-phones-7.php,136.0,137.0
98,XOLO,https://www.gsmarena.com/xolo-phones-85.php,80.0,81.0


# Completed

In [79]:
final_df.where(~mask).dropna(axis=0, how = 'all')

Unnamed: 0,brand,url,results_n,n_models
0,AT&T;,https://www.gsmarena.com/at&t-phones-57.php,4.0,4.0
1,Acer,https://www.gsmarena.com/acer-phones-59.php,100.0,100.0
2,Allview,https://www.gsmarena.com/allview-phones-88.php,134.0,134.0
3,Amazon,https://www.gsmarena.com/amazon-phones-76.php,17.0,17.0
4,Amoi,https://www.gsmarena.com/amoi-phones-28.php,47.0,47.0
5,Apple,https://www.gsmarena.com/apple-phones-48.php,63.0,63.0
6,Archos,https://www.gsmarena.com/archos-phones-90.php,39.0,39.0
7,Asus,https://www.gsmarena.com/asus-phones-46.php,169.0,169.0
8,BLU,https://www.gsmarena.com/blu-phones-67.php,279.0,279.0
9,BQ,https://www.gsmarena.com/bq-phones-108.php,20.0,20.0
