# Tratamento de dados 1

A partir de uma lista de pedalboards (`data/pedalboard-info.csv`), tentaremos obter os pedalboards.

## Lista de pedalboards

In [113]:
import pandas as pd
# Load the pedalboard list, using the CSV's `index` column as the row index
# and ordering the rows by it.
pedalboards = (
    pd.read_csv('data/pedalboard-info.csv', index_col='index')
    .sort_index()
)

pedalboards.head()

Unnamed: 0_level_0,artist,date,has_audio,has_video,link,rating,title,total_downloads,uploader
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
5299,many,2011-08-18,False,False,patches.php?mode=show&unit=G3&ID=5299,4.3,plexi-delay,2540,otter
5300,Boston - Smokin,2011-08-20,False,False,patches.php?mode=show&unit=G3&ID=5300,3.5,Tom Scholz tone,4427,Henky Backer
5301,,2011-08-22,False,False,patches.php?mode=show&unit=G3&ID=5301,1.0,Very Clean,1928,Jimis
5302,Battles,2011-08-22,False,False,patches.php?mode=show&unit=G3&ID=5302,1.0,Pitch Rising Delay,1088,Jimis
5303,2Tone,2011-08-22,False,False,patches.php?mode=show&unit=G3&ID=5303,1.0,2Tone,1265,Jimis


## Total de pedalboards na lista

In [114]:
# Row count of the list loaded above.
pedalboards.shape[0]

794

## Função para obter pedalboards

É esperado que o pedalboard tenha o formato **XML**.

Testaremos com o primeiro e o último pedalboard da lista:

In [None]:
import requests
import xmltodict
import cgi

def read_pedalboard(index, timeout=30):
    """Download one patch file from guitarpatches.com and parse it as XML.

    Args:
        index: Patch ID, substituted into the ``ID`` query parameter of the
            download URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled download would block the caller forever.

    Returns:
        A dict with three keys: ``'index'`` (the ID passed in), ``'filename'``
        (taken from the ``Content-Disposition`` response header) and ``'data'``
        (the result of ``xmltodict.parse`` on the response body).

    Raises:
        requests.HTTPError: The server answered with an error status.
        KeyError: The response has no ``Content-Disposition`` header, or the
            header carries no ``filename`` parameter.
        Exception: Whatever ``xmltodict.parse`` raises when the body is not
            well-formed XML.
    """
    # Function-scope import: the standard-library replacement for
    # ``cgi.parse_header`` (the ``cgi`` module is deprecated and was removed
    # in Python 3.13).
    from email.message import Message

    response = requests.get(
        'http://guitarpatches.com/download.php?unit=G3&mode=download&ID={}'.format(index),
        timeout=timeout,
    )
    # Fail here on 4xx/5xx instead of handing an error page to the XML parser.
    response.raise_for_status()

    header = Message()
    header['Content-Disposition'] = response.headers['Content-Disposition']
    filename = header.get_filename()
    if filename is None:
        # Same exception the previous params['filename'] lookup produced.
        raise KeyError('filename')

    return {
        'index': index,
        'filename': filename,
        'data': xmltodict.parse(response.content)
    }

In [122]:
# Pick two samples: the first and the last index of the sorted list.
index_1, index_2 = pedalboards.index[0], pedalboards.index[-1]

pedalboard_g3v1 = read_pedalboard(index_1)
pedalboard_g3v2 = read_pedalboard(index_2)

# Uncomment one of these to inspect a downloaded sample:
#index_1, pedalboard_g3v1
#index_2, pedalboard_g3v2

## Extrair informações importantes do modelo de dados

Inicialmente queremos somente o nome do pedalboard e o índice dos plugins de áudio utilizados neste pedalboard

In [None]:
def extract_data(pedalboard):
    """Convert one downloaded pedalboard into a list of data rows.

    The text after the last dot of ``pedalboard['filename']`` selects the
    parser: ``g3p`` and ``g3xp`` go to ``extract_data_g3pn`` with 3 and 6
    modules respectively, ``g3xa`` goes to ``extract_data_g3xa``.

    Args:
        pedalboard: Dict with the keys ``'index'``, ``'filename'`` and
            ``'data'``, as returned by ``read_pedalboard``.

    Returns:
        Whatever the selected parser returns (a list of rows).

    Raises:
        Exception: The extension is none of the three known ones.
    """
    index = pedalboard['index']
    _, _, extension = pedalboard['filename'].rpartition('.')

    if extension == 'g3xa':
        return extract_data_g3xa(index, pedalboard['data'])

    # Single-patch formats differ only in how many modules are read.
    modules_by_extension = {'g3p': 3, 'g3xp': 6}
    if extension not in modules_by_extension:
        raise Exception('Unknown format: {}'.format(extension))

    patch = pedalboard['data']['PatchData']
    return extract_data_g3pn(index, patch, modules_by_extension[extension])

def extract_data_g3pn(index, pedalboard, total):
    """Build the data row for a single patch.

    Args:
        index: Identifier stored as the first element of the row.
        pedalboard: Mapping with a ``'Name'`` entry and ``'Module0'`` ...
            ``'Module{total-1}'`` entries, each holding a ``'Prm1'`` value
            (presumably the plugin index of that slot -- confirm against the
            patch format).
        total: Number of modules to read.

    Returns:
        A one-element list containing the row
        ``[index, name, prm1_of_module0, ..., prm1_of_module{total-1}]``.
    """
    row = [index, pedalboard['Name']]
    row.extend(
        pedalboard['Module{}'.format(slot)]['Prm1']
        for slot in range(total)
    )
    return [row]

def extract_data_g3xa(index, pedalboards):
    """Build one data row per patch contained in a patch-set document.

    Args:
        index: Identifier stored as the first element of every row.
        pedalboards: Parsed document; the patches are read from
            ``pedalboards['PatchSet']['Patches']['PatchData']``.

    Returns:
        A list of rows, one per patch, each produced by
        ``extract_data_g3pn(index, patch, 6)``.
    """
    patches = pedalboards['PatchSet']['Patches']['PatchData']

    # xmltodict gives a list only for repeated child elements; a set holding
    # a single patch arrives as one mapping, which must not be iterated
    # key by key.
    if isinstance(patches, dict):
        patches = [patches]

    return [extract_data_g3pn(index, patch, 6)[0] for patch in patches]

# extract_data takes only the pedalboard dict, which already carries its own
# index; passing the index as an extra argument raises TypeError.
extract_data(pedalboard_g3v1) + extract_data(pedalboard_g3v2)

## Processar toda a lista de pedalboards

Processaremos todos os pedalboards listados. Serão geradas duas listas:

 * `data`: Plugins processados
 * `errors`: [index, Causa do erro]

Erros podem ocorrer por:
 * Arquivos compactados.

In [39]:
# Download and parse every listed pedalboard. A failure on one entry is
# recorded in `errors` as (index, exception) and the loop moves on.
data = []
errors = []

for i, index in enumerate(pedalboards.index):
    # Progress prefix without a newline; each branch below ends the line.
    print('{} {}'.format(i, index), end='')

    try:
        pedalboard = read_pedalboard(index)
        data.extend(extract_data(pedalboard))
        print()
    except Exception as e:
        errors.append((index, e))
        print(' {}'.format(e))
    

0 5299
1 5300
2 5301
3 5302
4 5303
5 5304
6 5307
7 5347
8 5350
9 5352
10 5355
11 5357
12 5358
13 5359
14 5360
15 5361
16 5367
17 5368
18 5371
19 5372
20 5375
21 5381
22 5398
23 5401
24 5402
25 5424
26 5429
27 5431
28 5435
29 5438
30 5453
31 5475
32 5476
33 5478
34 5481
35 5482
36 5485
37 5487
38 5655
39 5671
40 5695
41 5696
42 5703
43 5705
44 5730
45 5731
46 5732
47 5734
48 5735
49 5740
50 5743
51 5751
52 5756
53 5757
54 5760
55 5762
56 5763
57 5765
58 5768
59 5815
60 5832
61 5849
62 5853
63 5875
64 5877
65 5878
66 5891
67 5892
68 5893
69 5938
70 5939
71 5940
72 5991
73 6026
74 6070
75 6071
76 6072
77 6075
78 6076
79 6150
80 6168
81 6170
82 6176
83 6177
84 6209
85 6210
86 6230
87 6232
88 6233
89 6234
90 6259
91 6261
92 6262
93 6275
94 6276
95 6277
96 6278
97 6279
98 6280
99 6281
100 6287
101 6288
102 6289
103 6291
104 6292
105 6293
106 6341
107 6342
108 6344
109 6345
110 6350
111 6352
112 6361
113 6380
114 6381
115 6389
116 6413
117 6440
118 6451
119 6452
120 6453
121 6454
122 6455
123

In [57]:
# (number of extracted rows, number of failed downloads/parses)
tuple(map(len, (data, errors)))

(757, 37)

In [11]:
#import json
#json.dumps(data)

'[["plexi*dly*", "74", "102", "55"], ["HB*Boston3", "31", "102", "33"], ["Numb______", "81", "57", "100"], ["_Ethereal", "31", "109", "59"], ["Slash*****", "24", "72", "102"], ["ACDC******", "55", "102", "61"], ["DearPruden", "96", "42", "55"], ["PaganBaby*", "100", "64", "67"], ["WalkOnWate", "96", "64", "67"], ["RambleTamb", "100", "64", "67"], ["Clean", "24", "19", "31"], ["solo*4*b*", "31", "78", "17"], ["Money4Noth", "28", "103", "65"], ["Blvd*of*BD", "108", "77", "39"], ["Lightning*", "72", "40", "96"], ["WickedGame", "96", "64", "57"], ["AC-Solo", "86", "31", "67"], ["Djent*****", "81", "31", "45"], ["Shredding", "73", "103", "66"], ["REM*******", "56", "17", "19"], ["Rocking", "73", "106", "28"], ["barracuda*", "101", "61", "48"], ["Petrucci**", "104", "72", "31"], ["11Th*HOUR*", "104", "28", "17"], ["HB*eVh****", "47", "103", "57"], ["DZ*DRIVE**", "105", "28", "62"], ["JIMI*FUZZ*", "79", "2", "65"], ["Slash*****", "31", "103", "19"], ["HBChevelle", "28", "104", "32"], ["Foo*Fi

In [130]:
# Show the (index, exception) pairs recorded by the processing loop.
errors

[(6776,
  xml.parsers.expat.ExpatError('not well-formed (invalid token): line 1, column 2')),
 (6847,
  xml.parsers.expat.ExpatError('not well-formed (invalid token): line 1, column 2')),
 (6903,
  xml.parsers.expat.ExpatError('not well-formed (invalid token): line 1, column 2')),
 (7074,
  xml.parsers.expat.ExpatError('not well-formed (invalid token): line 1, column 3')),
 (7082,
  xml.parsers.expat.ExpatError('not well-formed (invalid token): line 1, column 2')),
 (7083,
  xml.parsers.expat.ExpatError('not well-formed (invalid token): line 1, column 2')),
 (7313,
  xml.parsers.expat.ExpatError('not well-formed (invalid token): line 1, column 2')),
 (7314,
  xml.parsers.expat.ExpatError('not well-formed (invalid token): line 1, column 2')),
 (7315,
  xml.parsers.expat.ExpatError('not well-formed (invalid token): line 1, column 2')),
 (7316,
  xml.parsers.expat.ExpatError('not well-formed (invalid token): line 1, column 2')),
 (7645, KeyError('PatchData')),
 (7646, KeyError('PatchData'

In [131]:
# Re-download entry 7313, one of the IDs that failed with an ExpatError, to
# inspect its raw payload. The timeout keeps a stalled server from hanging
# the kernel.
example = requests.get(
    'http://guitarpatches.com/download.php?unit=G3&mode=download&ID={}'.format(7313),
    timeout=30,
)

In [152]:
import zipfile
import io

# Open the downloaded payload as an in-memory zip archive.
zipped = zipfile.ZipFile(io.BytesIO(example.content))

# Print the info of every member. The lookup must use the loop variable;
# indexing filelist[0] would print the first member once per entry.
for zipped_file in zipped.filelist:
    print(zipped.getinfo(zipped_file.filename))

<ZipInfo filename='Patches G5 Bank1.g3xa' compress_type=deflate external_attr=0x20 file_size=259689 compress_size=8628>
