# Operace nad daty

## Datová sada Auta

### Raw (surová data)

In [20]:
import aiohttp
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"

async with aiohttp.ClientSession() as session:
    async with session.get(url) as resp:
        # print(resp.status)
        textresponse = await resp.text()
print(textresponse[:1000])

18.0   8   307.0      130.0      3504.      12.0   70  1	"chevrolet chevelle malibu"
15.0   8   350.0      165.0      3693.      11.5   70  1	"buick skylark 320"
18.0   8   318.0      150.0      3436.      11.0   70  1	"plymouth satellite"
16.0   8   304.0      150.0      3433.      12.0   70  1	"amc rebel sst"
17.0   8   302.0      140.0      3449.      10.5   70  1	"ford torino"
15.0   8   429.0      198.0      4341.      10.0   70  1	"ford galaxie 500"
14.0   8   454.0      220.0      4354.       9.0   70  1	"chevrolet impala"
14.0   8   440.0      215.0      4312.       8.5   70  1	"plymouth fury iii"
14.0   8   455.0      225.0      4425.      10.0   70  1	"pontiac catalina"
15.0   8   390.0      190.0      3850.       8.5   70  1	"amc ambassador dpl"
15.0   8   383.0      170.0      3563.      10.0   70  1	"dodge challenger se"
14.0   8   340.0      160.0      3609.       8.0   70  1	"plymouth 'cuda 340"
15.0   8   400.0      150.0      3761.       9.5   70  1	"chevrolet monte ca

### Reformat

In [21]:
import re

# Collapse every run of spaces and/or tabs into one space in a single pass.
# Doing spaces first and tabs second would leave a doubled delimiter wherever
# a tab sits next to a space, which breaks a single-space separated parse.
textresponse = re.sub(r'[ \t]+', ' ', textresponse)
print(textresponse[:1000])

18.0 8 307.0 130.0 3504. 12.0 70 1 "chevrolet chevelle malibu"
15.0 8 350.0 165.0 3693. 11.5 70 1 "buick skylark 320"
18.0 8 318.0 150.0 3436. 11.0 70 1 "plymouth satellite"
16.0 8 304.0 150.0 3433. 12.0 70 1 "amc rebel sst"
17.0 8 302.0 140.0 3449. 10.5 70 1 "ford torino"
15.0 8 429.0 198.0 4341. 10.0 70 1 "ford galaxie 500"
14.0 8 454.0 220.0 4354. 9.0 70 1 "chevrolet impala"
14.0 8 440.0 215.0 4312. 8.5 70 1 "plymouth fury iii"
14.0 8 455.0 225.0 4425. 10.0 70 1 "pontiac catalina"
15.0 8 390.0 190.0 3850. 8.5 70 1 "amc ambassador dpl"
15.0 8 383.0 170.0 3563. 10.0 70 1 "dodge challenger se"
14.0 8 340.0 160.0 3609. 8.0 70 1 "plymouth 'cuda 340"
15.0 8 400.0 150.0 3761. 9.5 70 1 "chevrolet monte carlo"
14.0 8 455.0 225.0 3086. 10.0 70 1 "buick estate wagon (sw)"
24.0 4 113.0 95.00 2372. 15.0 70 3 "toyota corona mark ii"
22.0 6 198.0 95.00 2833. 15.5 70 1 "plymouth duster"
18.0 6 199.0 97.00 2774. 15.5 70 1 "amc hornet"
21.0 6 200.0 85.00 2587. 16.0 70 1 "ford maverick"
27.0 4 97.00 8

### Načtení do Pandas

In [22]:
import pandas as pd
from io import StringIO

# Render at most 7 columns when a frame is displayed.
pd.set_option("display.max_columns", 7)

# The text starts directly with data rows, so the column labels are given explicitly.
column_names = [
    "mpg", "cylinders", "displacement", "horsepower", "weight",
    "acceleration", "year", "origin", "name",
]

# Parse the single-space separated text from an in-memory buffer.
df = pd.read_csv(StringIO(textresponse), sep=" ", names=column_names)
df

Unnamed: 0,mpg,cylinders,displacement,...,year,origin,name
0,18.0,8,307.0,...,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,...,70,1,buick skylark 320
2,18.0,8,318.0,...,70,1,plymouth satellite
3,16.0,8,304.0,...,70,1,amc rebel sst
4,17.0,8,302.0,...,70,1,ford torino
...,...,...,...,...,...,...,...
393,27.0,4,140.0,...,82,1,ford mustang gl
394,44.0,4,97.0,...,82,2,vw pickup
395,32.0,4,135.0,...,82,1,dodge rampage
396,28.0,4,120.0,...,82,1,ford ranger


## Práce s indexem

In [4]:
# Replace the index with a fresh default one; drop=True discards the old
# labels instead of inserting them as a column. Reassigning (rather than
# inplace=True) keeps the cell a plain expression of its input.
df = df.reset_index(drop=True)
display(df)

Unnamed: 0,mpg,cylinders,displacement,...,year,origin,name
0,18.0,8,307.0,...,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,...,70,1,buick skylark 320
2,18.0,8,318.0,...,70,1,plymouth satellite
3,16.0,8,304.0,...,70,1,amc rebel sst
4,17.0,8,302.0,...,70,1,ford torino
...,...,...,...,...,...,...,...
393,27.0,4,140.0,...,82,1,ford mustang gl
394,44.0,4,97.0,...,82,2,vw pickup
395,32.0,4,135.0,...,82,1,dodge rampage
396,28.0,4,120.0,...,82,1,ford ranger


### Sort a Index

In [5]:
# Order the rows by car name; ascending order is the default.
df = df.sort_values("name")
print(f"The first car is: {df['name'].iloc[0]}")

# The index labels travel with their rows, so they are no longer sequential.
display(df)

The first car is: amc ambassador brougham


Unnamed: 0,mpg,cylinders,displacement,...,year,origin,name
96,13.0,8,360.0,...,73,1,amc ambassador brougham
9,15.0,8,390.0,...,70,1,amc ambassador dpl
66,17.0,8,304.0,...,72,1,amc ambassador sst
315,24.3,4,151.0,...,80,1,amc concord
257,19.4,6,232.0,...,78,1,amc concord
...,...,...,...,...,...,...,...
394,44.0,4,97.0,...,82,2,vw pickup
309,41.5,4,98.0,...,80,2,vw rabbit
197,29.0,4,90.0,...,76,2,vw rabbit
325,44.3,4,90.0,...,80,2,vw rabbit c (diesel)


In [6]:
# Renumber the sorted rows from 0 upward; drop=True throws the pre-sort
# labels away instead of keeping them as a column. Reassigning (rather than
# inplace=True) avoids mutating the frame behind the reader's back.
df = df.reset_index(drop=True)
display(df)

Unnamed: 0,mpg,cylinders,displacement,...,year,origin,name
0,13.0,8,360.0,...,73,1,amc ambassador brougham
1,15.0,8,390.0,...,70,1,amc ambassador dpl
2,17.0,8,304.0,...,72,1,amc ambassador sst
3,24.3,4,151.0,...,80,1,amc concord
4,19.4,6,232.0,...,78,1,amc concord
...,...,...,...,...,...,...,...
393,44.0,4,97.0,...,82,2,vw pickup
394,41.5,4,98.0,...,80,2,vw rabbit
395,29.0,4,90.0,...,76,2,vw rabbit
396,44.3,4,90.0,...,80,2,vw rabbit c (diesel)


### Náhodné přeskupení

Náhodné přeskupení má význam při učení neuronových sítí. Na pořadí dat nezáleží, lze je tedy libovolně přeskupit.

In [14]:
import numpy as np

# Seeded generator so the shuffle produces the same order on every run.
rng = np.random.default_rng(42)

# Reorder the rows according to a random permutation of the index labels.
df = df.reindex(rng.permutation(df.index.to_numpy()))
df

Unnamed: 0,mpg,cylinders,displacement,...,year,origin,name
92,18.0,6,250.0,...,75,1,chevrolet nova
221,32.4,4,107.0,...,80,3,honda accord
65,27.0,4,112.0,...,82,1,chevrolet cavalier wagon
264,23.9,8,260.0,...,79,1,oldsmobile cutlass salon brougham
354,26.0,4,97.0,...,77,3,toyota corolla liftback
...,...,...,...,...,...,...,...
269,23.8,4,151.0,...,78,1,oldsmobile starfire sx
211,23.0,4,140.0,...,75,1,ford pinto
69,17.5,8,305.0,...,76,1,chevrolet chevelle malibu classic
199,21.0,6,200.0,...,70,1,ford maverick


### Seskupování a agregace

In [8]:
# Mean mpg for each distinct value of the cylinders column.
g = df.groupby("cylinders")["mpg"].agg("mean")

# Plain dictionary keyed by the cylinders value.
d = g.to_dict()
d

{3: 20.55,
 4: 29.28676470588235,
 5: 27.366666666666664,
 6: 19.985714285714284,
 8: 14.963106796116506}

Připomínka přístupu k prvkům dictionary (slovníku).

In [9]:
# Look up the value stored under key 8 in the dictionary d.
d[8]

14.963106796116506

### Vícenásobná agregace

In [12]:
# Group the mpg values by the cylinders column once, then aggregate.
mpg_by_cylinders = df.groupby("cylinders")["mpg"]

# Each requested aggregate becomes one column of the result.
g = mpg_by_cylinders.agg(["mean", "count", "sum"])

# Outer keys are the aggregate names, inner keys the cylinders values.
d = g.to_dict()
d

{'mean': {3: 20.55,
  4: 29.28676470588235,
  5: 27.366666666666664,
  6: 19.985714285714284,
  8: 14.963106796116506},
 'count': {3: 4, 4: 204, 5: 3, 6: 84, 8: 103},
 'sum': {3: 82.2, 4: 5974.5, 5: 82.1, 6: 1678.8, 8: 1541.2}}

## Map

In [15]:
# Lookup table from the numeric origin code to a readable region name.
continents = {1: "North America", 2: "Europe", 3: "Asia"}

# Translate every origin code through the table and store it as a new column.
df = df.assign(origin_name=df["origin"].map(continents))
df

Unnamed: 0,mpg,cylinders,displacement,...,origin,name,origin_name
92,18.0,6,250.0,...,1,chevrolet nova,North America
221,32.4,4,107.0,...,3,honda accord,Asia
65,27.0,4,112.0,...,1,chevrolet cavalier wagon,North America
264,23.9,8,260.0,...,1,oldsmobile cutlass salon brougham,North America
354,26.0,4,97.0,...,3,toyota corolla liftback,Asia
...,...,...,...,...,...,...,...
269,23.8,4,151.0,...,1,oldsmobile starfire sx,North America
211,23.0,4,140.0,...,1,ford pinto,North America
69,17.5,8,305.0,...,1,chevrolet chevelle malibu classic,North America
199,21.0,6,200.0,...,1,ford maverick,North America


### Lambdas

Připomenutí NaN (IEEE 754)

In [26]:
# Build a float from the string "NaN"; the cell output shows the resulting value.
float("NaN")

nan

Všimněte si typu u položky `horsepower`. Proč?

In [23]:
# Display the horsepower column; note the dtype reported at the bottom of the output.
df["horsepower"]

0      130.0
1      165.0
2      150.0
3      150.0
4      140.0
       ...  
393    86.00
394    52.00
395    84.00
396    79.00
397    82.00
Name: horsepower, Length: 398, dtype: object

In [29]:
# Show only the dtype of the horsepower column.
df["horsepower"].dtype

dtype('O')

In [28]:
# Convert each row's horsepower to a float; the "?" marker becomes NaN.
hp = df.apply(
    lambda row: float("NaN") if row["horsepower"] == "?" else float(row["horsepower"]),
    axis=1,
)
hp

0      130.0
1      165.0
2      150.0
3      150.0
4      140.0
       ...  
393     86.0
394     52.0
395     84.0
396     79.0
397     82.0
Length: 398, dtype: float64

In [30]:
# Swap the horsepower column for the converted float values held in hp.
df = df.assign(horsepower=hp)
df

Unnamed: 0,mpg,cylinders,displacement,...,year,origin,name
0,18.0,8,307.0,...,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,...,70,1,buick skylark 320
2,18.0,8,318.0,...,70,1,plymouth satellite
3,16.0,8,304.0,...,70,1,amc rebel sst
4,17.0,8,302.0,...,70,1,ford torino
...,...,...,...,...,...,...,...
393,27.0,4,140.0,...,82,1,ford mustang gl
394,44.0,4,97.0,...,82,2,vw pickup
395,32.0,4,135.0,...,82,1,dodge rampage
396,28.0,4,120.0,...,82,1,ford ranger


In [31]:
# Row by row: displacement divided by horsepower.
efficiency = df.apply(lambda row: row["displacement"] / row["horsepower"], axis=1)

# Show only the first ten values.
display(efficiency.iloc[:10])

0    2.361538
1    2.121212
2    2.120000
3    2.026667
4    2.157143
5    2.166667
6    2.063636
7    2.046512
8    2.022222
9    2.052632
dtype: float64

In [32]:
# Attach the computed ratio to the frame as a new column.
df = df.assign(efficiency=efficiency)