# Exercise 04 : Enrichment and transformations
## Required data

In [1]:
%ls ../data/auto.json

../data/auto.json


## Imports

In [2]:
import pandas as pd
import numpy as np
import requests

## Read the data

In [3]:
df = pd.read_json('../data/auto.json',
                  orient='records'
                 )

df.set_index('CarNumber',
             inplace=True
            )

df

Unnamed: 0_level_0,Refund,Fines,Make,Model
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Y163O8161RUS,2,3200.000000,Ford,Focus
E432XX77RUS,1,6500.000000,Toyota,Camry
7184TT36RUS,1,2100.000000,Ford,Focus
X582HE161RUS,2,2000.000000,Ford,Focus
92918M178RUS,1,5700.000000,Ford,Focus
...,...,...,...,...
Y163O8161RUS,2,1600.000000,Ford,Focus
M0309X197RUS,1,22300.000000,Ford,Focus
O673E8197RUS,2,600.000000,Ford,Focus
8610T8154RUS,1,2000.000000,Ford,Focus


## Set up float numbers display setting

In [4]:
pd.options.display.float_format = '{:.2f}'.format

df

Unnamed: 0_level_0,Refund,Fines,Make,Model
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Y163O8161RUS,2,3200.00,Ford,Focus
E432XX77RUS,1,6500.00,Toyota,Camry
7184TT36RUS,1,2100.00,Ford,Focus
X582HE161RUS,2,2000.00,Ford,Focus
92918M178RUS,1,5700.00,Ford,Focus
...,...,...,...,...
Y163O8161RUS,2,1600.00,Ford,Focus
M0309X197RUS,1,22300.00,Ford,Focus
O673E8197RUS,2,600.00,Ford,Focus
8610T8154RUS,1,2000.00,Ford,Focus


## Enrich the dataframe using a sample from that dataframe

In [5]:
sample = df.sample(200, random_state=21)

sample['Refund'] = np.random.randint(1, 10, size=sample.shape[0])
sample['Fines'] = np.random.ranf(size=sample.shape[0]) * 30000

sample

Unnamed: 0_level_0,Refund,Fines,Make,Model
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M0299X197RUS,7,23050.50,Ford,Focus
83298C154RUS,8,5116.10,Ford,Focus
H957HY161RUS,3,7463.99,Ford,Focus
T941CC96RUS,4,15556.38,Ford,Focus
H966HY161RUS,1,6778.41,Ford,Focus
...,...,...,...,...
8182XX154RUS,2,14253.34,Ford,Focus
X796TH96RUS,2,3888.26,Ford,Focus
T011MY163RUS,4,7061.17,Ford,Focus
T341CC96RUS,3,17518.26,Volkswagen,Passat


In [6]:
concat_rows = pd.concat([df, sample])
concat_rows

Unnamed: 0_level_0,Refund,Fines,Make,Model
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Y163O8161RUS,2,3200.00,Ford,Focus
E432XX77RUS,1,6500.00,Toyota,Camry
7184TT36RUS,1,2100.00,Ford,Focus
X582HE161RUS,2,2000.00,Ford,Focus
92918M178RUS,1,5700.00,Ford,Focus
...,...,...,...,...
8182XX154RUS,2,14253.34,Ford,Focus
X796TH96RUS,2,3888.26,Ford,Focus
T011MY163RUS,4,7061.17,Ford,Focus
T341CC96RUS,3,17518.26,Volkswagen,Passat


## Enrich the dataframe concat_rows by a new column with the data generated

In [7]:
np.random.seed(21)

In [8]:
years = pd.Series(np.random.randint(1980, 2020, concat_rows.shape[0]), name='Year', index=concat_rows.index)
years

CarNumber
Y163O8161RUS    1989
E432XX77RUS     1995
7184TT36RUS     1984
X582HE161RUS    2015
92918M178RUS    2014
                ... 
8182XX154RUS    1981
X796TH96RUS     1992
T011MY163RUS    2007
T341CC96RUS     2005
T119CT96RUS     1997
Name: Year, Length: 925, dtype: int64

In [9]:
fines = pd.concat([concat_rows, years], axis='columns')
fines

Unnamed: 0_level_0,Refund,Fines,Make,Model,Year
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Y163O8161RUS,2,3200.00,Ford,Focus,1989
E432XX77RUS,1,6500.00,Toyota,Camry,1995
7184TT36RUS,1,2100.00,Ford,Focus,1984
X582HE161RUS,2,2000.00,Ford,Focus,2015
92918M178RUS,1,5700.00,Ford,Focus,2014
...,...,...,...,...,...
8182XX154RUS,2,14253.34,Ford,Focus,1981
X796TH96RUS,2,3888.26,Ford,Focus,1992
T011MY163RUS,4,7061.17,Ford,Focus,2007
T341CC96RUS,3,17518.26,Volkswagen,Passat,2005


## Enrich the dataframe with the data from another dataframe
### Create a new dataframe with the car numbers and their owners
#### Get pupular names from `surname.json`

In [10]:
surnames = pd.read_json('../data/surname.json', orient='values')

surnames.columns = surnames.iloc[0].values
surnames.drop(surnames.index[0], axis='index', inplace=True)

surnames

Unnamed: 0,NAME,COUNT,RANK
1,ADAMS,427865,42
2,ALLEN,482607,33
3,ALVAREZ,233983,92
4,ANDERSON,784404,15
5,BAILEY,277845,72
...,...,...,...
96,WILLIAMS,1625252,3
97,WILSON,801882,14
98,WOOD,250715,84
99,WRIGHT,458980,35


### Get sample of surnames

In [11]:
surnames_sample = surnames.sample(fines.index.unique().shape[0], random_state=21, replace=True)['NAME']
surnames_sample

74    RICHARDSON
80          ROSS
57        MORGAN
5         BAILEY
49         LOPEZ
         ...    
10      CAMPBELL
32          HALL
6          BAKER
21          DIAZ
57        MORGAN
Name: NAME, Length: 531, dtype: object

### Create `owners` dataframe

In [12]:
owners = pd.DataFrame(zip(fines.index.unique(), surnames_sample), columns=['CarNumber', 'SURNAME'])
owners

Unnamed: 0,CarNumber,SURNAME
0,Y163O8161RUS,RICHARDSON
1,E432XX77RUS,ROSS
2,7184TT36RUS,MORGAN
3,X582HE161RUS,BAILEY
4,92918M178RUS,LOPEZ
...,...,...
526,O136HO197RUS,CAMPBELL
527,O22097197RUS,HALL
528,M0309X197RUS,BAKER
529,O673E8197RUS,DIAZ


### Append 5 more observations to the `fines` dataframe (come up with your own ideas of `CarNumber`, etc.)

In [13]:
new_observations = [['X008RUS', 2, 5224.2, 'Toyota', 'Camry', 2014],
                    ['1111RUS', 1, 14.2, 'Toyota', 'Camry', 2014],
                    ['5AAA9RUS', 3, 512524.2, 'Toyota', 'Camry', 2014],
                    ['566RUS', 2, 124.2, 'Toyota', 'Camry', 2014],
                    ['A9999RUS', 1, 3334.2, 'Toyota', 'Camry', 2014]]

new_observations = pd.DataFrame(new_observations, columns=fines.reset_index().columns)
new_observations.set_index('CarNumber', inplace=True)

new_observations

Unnamed: 0_level_0,Refund,Fines,Make,Model,Year
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
X008RUS,2,5224.2,Toyota,Camry,2014
1111RUS,1,14.2,Toyota,Camry,2014
5AAA9RUS,3,512524.2,Toyota,Camry,2014
566RUS,2,124.2,Toyota,Camry,2014
A9999RUS,1,3334.2,Toyota,Camry,2014


In [14]:
fines = pd.concat([fines, new_observations])
fines

Unnamed: 0_level_0,Refund,Fines,Make,Model,Year
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Y163O8161RUS,2,3200.00,Ford,Focus,1989
E432XX77RUS,1,6500.00,Toyota,Camry,1995
7184TT36RUS,1,2100.00,Ford,Focus,1984
X582HE161RUS,2,2000.00,Ford,Focus,2015
92918M178RUS,1,5700.00,Ford,Focus,2014
...,...,...,...,...,...
X008RUS,2,5224.20,Toyota,Camry,2014
1111RUS,1,14.20,Toyota,Camry,2014
5AAA9RUS,3,512524.20,Toyota,Camry,2014
566RUS,2,124.20,Toyota,Camry,2014


### Delete the dataframe last 20 observations from the `owners` and add 3 new observations 
(they are not the same as those you add to the `fines` dataframe)

In [15]:
owners.drop(owners.tail(20).index, inplace=True)
owners

Unnamed: 0,CarNumber,SURNAME
0,Y163O8161RUS,RICHARDSON
1,E432XX77RUS,ROSS
2,7184TT36RUS,MORGAN
3,X582HE161RUS,BAILEY
4,92918M178RUS,LOPEZ
...,...,...
506,T914CT197RUS,HERNANDEZ
507,E41977152RUS,BAKER
508,9464EX178RUS,MARTIN
509,O50197197RUS,WRIGHT


In [16]:
new_data = [['A9999999989', 'IVAN'],
            ['B8888888888', 'OLEG'],
            ['C7777777777', 'STEPAN']]
new_data = pd.DataFrame(new_data, columns=owners.columns)


owners = pd.concat([owners, new_data])
owners

Unnamed: 0,CarNumber,SURNAME
0,Y163O8161RUS,RICHARDSON
1,E432XX77RUS,ROSS
2,7184TT36RUS,MORGAN
3,X582HE161RUS,BAILEY
4,92918M178RUS,LOPEZ
...,...,...
509,O50197197RUS,WRIGHT
510,7608EE777RUS,HILL
0,A9999999989,IVAN
1,B8888888888,OLEG


### Join both dataframes:
* the new dataframe should have only the car numbers that exist in both
dataframes

In [17]:
pd.merge(fines, owners, how='inner', left_index=True, right_on='CarNumber')

Unnamed: 0,Refund,Fines,Make,Model,Year,CarNumber,SURNAME
0,2,3200.00,Ford,Focus,1989,Y163O8161RUS,RICHARDSON
0,2,1600.00,Ford,Focus,1980,Y163O8161RUS,RICHARDSON
1,1,6500.00,Toyota,Camry,1995,E432XX77RUS,ROSS
1,2,13000.00,Toyota,Camry,2018,E432XX77RUS,ROSS
2,1,2100.00,Ford,Focus,1984,7184TT36RUS,MORGAN
...,...,...,...,...,...,...,...
507,2,2400.00,Ford,Focus,1989,E41977152RUS,BAKER
508,2,2100.00,Ford,Focus,1988,9464EX178RUS,MARTIN
509,2,7800.00,Ford,Focus,1992,O50197197RUS,WRIGHT
510,1,4000.00,Skoda,Octavia,2000,7608EE777RUS,HILL


* the new dataframe should have all the car numbers that exist in both
dataframes

In [18]:
pd.merge(fines, owners, how='outer', left_index=True, right_on='CarNumber')

Unnamed: 0,Refund,Fines,Make,Model,Year,CarNumber,SURNAME
0.00,2.00,3200.00,Ford,Focus,1989.00,Y163O8161RUS,RICHARDSON
0.00,2.00,1600.00,Ford,Focus,1980.00,Y163O8161RUS,RICHARDSON
1.00,1.00,6500.00,Toyota,Camry,1995.00,E432XX77RUS,ROSS
1.00,2.00,13000.00,Toyota,Camry,2018.00,E432XX77RUS,ROSS
2.00,1.00,2100.00,Ford,Focus,1984.00,7184TT36RUS,MORGAN
...,...,...,...,...,...,...,...
,2.00,124.20,Toyota,Camry,2014.00,566RUS,
,1.00,3334.20,Toyota,Camry,2014.00,A9999RUS,
0.00,,,,,,A9999999989,IVAN
1.00,,,,,,B8888888888,OLEG


* the new dataframe should have only the car numbers from the fines dataframe

In [19]:
pd.merge(fines, owners, how='left', left_index=True, right_on='CarNumber')

Unnamed: 0,Refund,Fines,Make,Model,Year,CarNumber,SURNAME
0.00,2,3200.00,Ford,Focus,1989,Y163O8161RUS,RICHARDSON
1.00,1,6500.00,Toyota,Camry,1995,E432XX77RUS,ROSS
2.00,1,2100.00,Ford,Focus,1984,7184TT36RUS,MORGAN
3.00,2,2000.00,Ford,Focus,2015,X582HE161RUS,BAILEY
4.00,1,5700.00,Ford,Focus,2014,92918M178RUS,LOPEZ
...,...,...,...,...,...,...,...
,2,5224.20,Toyota,Camry,2014,X008RUS,
,1,14.20,Toyota,Camry,2014,1111RUS,
,3,512524.20,Toyota,Camry,2014,5AAA9RUS,
,2,124.20,Toyota,Camry,2014,566RUS,


* the new dataframe should have only the car numbers from the owners
dataframe

In [20]:
pd.merge(fines, owners, how='right', left_index=True, right_on='CarNumber')

Unnamed: 0,Refund,Fines,Make,Model,Year,CarNumber,SURNAME
0,2.00,3200.00,Ford,Focus,1989.00,Y163O8161RUS,RICHARDSON
0,2.00,1600.00,Ford,Focus,1980.00,Y163O8161RUS,RICHARDSON
1,1.00,6500.00,Toyota,Camry,1995.00,E432XX77RUS,ROSS
1,2.00,13000.00,Toyota,Camry,2018.00,E432XX77RUS,ROSS
2,1.00,2100.00,Ford,Focus,1984.00,7184TT36RUS,MORGAN
...,...,...,...,...,...,...,...
510,1.00,4000.00,Skoda,Octavia,2000.00,7608EE777RUS,HILL
510,8.00,8872.46,Skoda,Octavia,1991.00,7608EE777RUS,HILL
0,,,,,,A9999999989,IVAN
1,,,,,,B8888888888,OLEG


## Create a pivot table from the `fines` dataframe in specified format

In [21]:
pd.pivot_table(fines,
               index=['Make', 'Model'],
               values='Fines',
               columns='Year')

Unnamed: 0_level_0,Year,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
Make,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Ford,Focus,6771.96,19761.36,11475.94,5454.33,10126.94,7877.85,7872.27,8848.8,7262.16,4773.9,...,8629.58,7882.51,6529.29,7144.94,8998.65,12877.2,7277.65,13202.33,22013.42,5723.0
Ford,Mondeo,,,,,,,,,,8600.0,...,,,34400.0,,,,46200.0,,,
Skoda,Octavia,15244.7,,9155.55,3864.86,,3431.53,600.0,5200.0,7253.04,45700.0,...,1550.0,500.0,500.0,4198.2,300.0,23197.29,300.0,24852.54,52066.67,3166.67
Toyota,Camry,12000.0,8594.59,,7200.0,,,,,,22400.0,...,,,8594.59,,88989.91,,,,18946.83,9050.0
Toyota,Corolla,,,2000.0,,,,,15839.4,,4000.0,...,24000.0,8594.59,3274.65,,,,11463.28,9600.0,25101.92,
Volkswagen,Golf,30900.0,,,8594.59,300.0,24000.0,,12397.89,,5800.0,...,,300.0,,24509.41,,2300.0,,,19274.74,
Volkswagen,Jetta,,,,,,,,,,,...,,,,,,,,,,
Volkswagen,Passat,,1600.0,,3200.0,10000.0,5000.0,15000.0,6150.0,,,...,1400.0,,,,,600.0,1050.0,,,
Volkswagen,Touareg,,,,,,5800.0,,,,,...,6300.0,,,,1300.0,500.0,,,,


## Save both the `fines` and `owners` dataframes to CSV files without an index

In [22]:
fines.to_csv('../data/fines.csv')
owners.to_csv('../data/owners.csv', index=False)

In [23]:
%cat ../data/fines.csv

CarNumber,Refund,Fines,Make,Model,Year
Y163O8161RUS,2,3200.0,Ford,Focus,1989
E432XX77RUS,1,6500.0,Toyota,Camry,1995
7184TT36RUS,1,2100.0,Ford,Focus,1984
X582HE161RUS,2,2000.0,Ford,Focus,2015
92918M178RUS,1,5700.0,Ford,Focus,2014
H234YH197RUS,2,6000.0,Ford,Focus,1990
E40577152RUS,1,8594.5864661654,Ford,Focus,1988
707987163RUS,2,2200.0,Ford,Focus,2016
K330T8197RUS,2,8200.0,Skoda,Octavia,2018
X786CO96RUS,1,8594.5864661654,Ford,Focus,2000
C477M7161RUS,1,2500.0,Ford,Focus,2000
O21997197RUS,1,2000.0,Ford,Focus,1992
M592CH197RUS,2,8594.5864661654,Skoda,Octavia,1985
9020YC197RUS,2,145000.0,Skoda,Octavia,2018
8182XX154RUS,1,200.0,Ford,Focus,2017
7830C8197RUS,2,8594.5864661654,Ford,Focus,1998
7066C8197RUS,2,15000.0,Volkswagen,Passat,1986
M298CH161RUS,2,8594.5864661654,Ford,Focus,2014
E445TC197RUS,1,8594.5864661654,Ford,Focus,1985
8440XX154RUS,1,6200.0,Ford,Focus,1996
9371CE154RUS,2,8594.5864661654,Skoda,Octavia,2015
9182CE154RUS,2,19800.0,Ford,Focus,2007
83298C154RUS,2,8594.5864661654,Ford,Focus

In [24]:
%cat ../data/owners.csv

CarNumber,SURNAME
Y163O8161RUS,RICHARDSON
E432XX77RUS,ROSS
7184TT36RUS,MORGAN
X582HE161RUS,BAILEY
92918M178RUS,LOPEZ
H234YH197RUS,HOWARD
E40577152RUS,NELSON
707987163RUS,WRIGHT
K330T8197RUS,RIVERA
X786CO96RUS,REYES
C477M7161RUS,PARKER
O21997197RUS,KING
M592CH197RUS,NGUYEN
9020YC197RUS,LOPEZ
8182XX154RUS,SMITH
7830C8197RUS,MYERS
7066C8197RUS,DIAZ
M298CH161RUS,ROBINSON
E445TC197RUS,MORGAN
8440XX154RUS,JONES
9371CE154RUS,BAKER
9182CE154RUS,CRUZ
83298C154RUS,RAMOS
Y7659C197RUS,MARTINEZ
C922YE197RUS,MORGAN
7364C8197RUS,WRIGHT
M5039X197RUS,LOPEZ
8603T8154RUS,MITCHELL
92928M178RUS,BAKER
O672E8197RUS,RUIZ
E42277152RUS,HOWARD
C903MC161RUS,WALKER
T7318T163RUS,HERNANDEZ
Y7719C197RUS,CHAVEZ
T011MY163RUS,SANDERS
H115YO163RUS,COOK
73467Y96RUS,BENNETT
O134HE197RUS,SMITH
H968HY161RUS,TURNER
Y654E8750RUS,GRAY
H963HY161RUS,CAMPBELL
H232YH197RUS,RUIZ
7843C8197RUS,ADAMS
X791O796RUS,BAKER
H958HY161RUS,RAMIREZ
708587163RUS,MORRIS
Y973O8197RUS,YOUNG
97977H178RUS,BAILEY
H917TC36RUS,ROBINSON
Y7709C197RUS,GUTIE