# Exercise 04 : Enrichment and transformations

In [42]:
import pandas as pd
import numpy as np

## Read the JSON file that you saved in ex02
- one of the columns has the float type, so let us define the format of it in pandas using pd.options.display.float_format: floats should be displayed with two decimals
- there are values missing from the Model, do not do anything with them

In [43]:
df = pd.read_json('../data/auto.json', orient='records')
df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
0,Y163O8161RUS,2,3200.00,Ford,Focus
1,E432XX77RUS,1,6500.00,Toyota,Camry
2,7184TT36RUS,1,2100.00,Ford,Focus
3,X582HE161RUS,2,2000.00,Ford,Focus
4,92918M178RUS,1,5700.00,Ford,Focus
...,...,...,...,...,...
720,Y163O8161RUS,2,1600.00,Ford,Focus
721,M0309X197RUS,1,22300.00,Ford,Focus
722,O673E8197RUS,2,600.00,Ford,Focus
723,8610T8154RUS,1,2000.00,Ford,Focus


In [44]:
pd.options.display.float_format = '{:.2f}'.format
df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
0,Y163O8161RUS,2,3200.00,Ford,Focus
1,E432XX77RUS,1,6500.00,Toyota,Camry
2,7184TT36RUS,1,2100.00,Ford,Focus
3,X582HE161RUS,2,2000.00,Ford,Focus
4,92918M178RUS,1,5700.00,Ford,Focus
...,...,...,...,...,...
720,Y163O8161RUS,2,1600.00,Ford,Focus
721,M0309X197RUS,1,22300.00,Ford,Focus
722,O673E8197RUS,2,600.00,Ford,Focus
723,8610T8154RUS,1,2000.00,Ford,Focus


## Enrich the dataframe using a sample from that dataframe

- create a sample with 200 new observations with random_state = 21
 
 the sample should not have new combinations of the car number, make and model, so the whole dataset will be consistent in these terms
 
 there are no restrictions on the refund and fines, you can take any value from these columns at random and use it towards any car number
- concatenate the sample with the initial dataframe to a new dataframe con- cat_rows

In [45]:
sample = df[['CarNumber', 'Make', 'Model']].sample(n=200, replace=True, random_state=21)

sample['Fines'] = np.random.choice(df['Fines'], size=200)
sample['Refund'] = np.random.choice(df['Refund'], size=200)
sample

Unnamed: 0,CarNumber,Make,Model,Fines,Refund
207,Y351O8197RUS,Ford,Focus,6000.00,1
48,H917TC36RUS,Ford,Focus,1000.00,2
368,C589EY154RUS,Ford,Focus,3600.00,1
120,K846YE77RUS,Volkswagen,Passat,13000.00,2
419,X4108H125RUS,Ford,Focus,1600.00,2
...,...,...,...,...,...
587,M942OT152RUS,Ford,Focus,1000.00,1
595,Y187O8161RUS,Ford,Focus,3000.00,2
365,7064C8197RUS,Volkswagen,Passat,2500.00,1
474,8437XX154RUS,Ford,Focus,12800.00,1


In [46]:
concat_rows = pd.concat([df, sample], ignore_index=True)
concat_rows

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
0,Y163O8161RUS,2,3200.00,Ford,Focus
1,E432XX77RUS,1,6500.00,Toyota,Camry
2,7184TT36RUS,1,2100.00,Ford,Focus
3,X582HE161RUS,2,2000.00,Ford,Focus
4,92918M178RUS,1,5700.00,Ford,Focus
...,...,...,...,...,...
920,M942OT152RUS,1,1000.00,Ford,Focus
921,Y187O8161RUS,2,3000.00,Ford,Focus
922,7064C8197RUS,1,2500.00,Volkswagen,Passat
923,8437XX154RUS,1,12800.00,Ford,Focus


## Enrich the dataframe concat_rows by a new column with the data generated

- create a series with the name Year using random integers from 1980 to 2019
- use np.random.seed(21) before generating the years
- concatenate the series with the dataframe and name it fines

In [47]:
np.random.seed(21)
year = pd.Series()
year = np.random.randint(1980, 2020, size=concat_rows.shape[0])
fines = concat_rows.copy()
fines['Year'] = year
fines['Year'] = fines['Year'].astype(int)
fines

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2,3200.00,Ford,Focus,1989
1,E432XX77RUS,1,6500.00,Toyota,Camry,1995
2,7184TT36RUS,1,2100.00,Ford,Focus,1984
3,X582HE161RUS,2,2000.00,Ford,Focus,2015
4,92918M178RUS,1,5700.00,Ford,Focus,2014
...,...,...,...,...,...,...
920,M942OT152RUS,1,1000.00,Ford,Focus,1981
921,Y187O8161RUS,2,3000.00,Ford,Focus,1992
922,7064C8197RUS,1,2500.00,Volkswagen,Passat,2007
923,8437XX154RUS,1,12800.00,Ford,Focus,2005


## Enrich the dataframe with the data from another dataframe

- create a new dataframe with the car numbers and their owners
- get the most popular surnames (you can find the file surname.json in the attachments) in the US
- create a new series with the surnames (they should not have special char- acters like commas, brackets, etc.) from the data you gathered, the count should be equal to the number of unique car numbers using the sample (use random_state = 21)
- create the dataframe owners with 2 columns: CarNumber and SURNAME
- append 5 more observations to the fines dataframe (come up with your own ideas of CarNumber, etc.)
- delete the dataframe last 20 observations from the owners and add 3 new observations (they are not the same as those you add to the fines dataframe)
- join both dataframes:
- the new dataframe should have only the car numbers that exist in both dataframes
- the new dataframe should have all the car numbers that exist in both dataframes
- the new dataframe should have only the car numbers from the fines dataframe
- the new dataframe should have only the car numbers from the owners dataframe

In [48]:
popular_surnames = pd.read_json('../../datasets/surname.json')
popular_surnames.drop(0, inplace=True)
popular_surnames.columns = ["NAME", "COUNT", "RANK"]
popular_surnames.reset_index(drop=True, inplace=True)
popular_surnames

Unnamed: 0,NAME,COUNT,RANK
0,ADAMS,427865,42
1,ALLEN,482607,33
2,ALVAREZ,233983,92
3,ANDERSON,784404,15
4,BAILEY,277845,72
...,...,...,...
95,WILLIAMS,1625252,3
96,WILSON,801882,14
97,WOOD,250715,84
98,WRIGHT,458980,35


In [49]:
popular_surnames[['COUNT', 'RANK']] = popular_surnames[['COUNT', 'RANK']].astype(int)
popular_surnames.sort_values('RANK', ascending=True, inplace=True)
popular_surnames

Unnamed: 0,NAME,COUNT,RANK
84,SMITH,2442977,1
40,JOHNSON,1932812,2
95,WILLIAMS,1625252,3
8,BROWN,1437026,4
41,JONES,1425470,5
...,...,...,...
59,MYERS,229895,96
47,LONG,229374,97
79,ROSS,229368,98
24,FOSTER,227764,99


In [50]:
count_car_num = fines['CarNumber'].nunique()

surnames = popular_surnames['NAME'].sample(n=count_car_num, replace=True, random_state=21)
surnames

42         KELLY
73    RICHARDSON
18          CRUZ
41         JONES
10        CARTER
         ...    
50      MARTINEZ
99         YOUNG
25        GARCIA
45           LEE
18          CRUZ
Name: NAME, Length: 531, dtype: object

In [51]:
owners = pd.DataFrame({
    'CarNumber': fines['CarNumber'].drop_duplicates(),
    'SURNAME': surnames.values
})
owners

Unnamed: 0,CarNumber,SURNAME
0,Y163O8161RUS,KELLY
1,E432XX77RUS,RICHARDSON
2,7184TT36RUS,CRUZ
3,X582HE161RUS,JONES
4,92918M178RUS,CARTER
...,...,...
715,O136HO197RUS,MARTINEZ
719,O22097197RUS,YOUNG
721,M0309X197RUS,GARCIA
722,O673E8197RUS,LEE


In [52]:
additional_fines = pd.DataFrame({
    'CarNumber': ['R245PO33RUS', 'M345NO90RUS', 'P235QR234RUS', 'S223TU567RUS', 'V285WX890RUS'],
    'Refund': [2.00, 1.00, 2.00, 1.00, 1.00],
    'Fines': [4560.00, 3456.00, 2000.00, 56999.00, 12563.00],
    'Make': ['Kia', 'Ford', 'Nissan', 'Lamborgini', 'Porche'],
    'Model': ['Rio', 'Focus', 'Qashqai', 'Aventador', 'Cayman'],
    'Year': [2011, 2022, 2016, 2000, 2009]
})
fines = pd.concat([fines, additional_fines], ignore_index=True)
fines

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2.00,3200.00,Ford,Focus,1989
1,E432XX77RUS,1.00,6500.00,Toyota,Camry,1995
2,7184TT36RUS,1.00,2100.00,Ford,Focus,1984
3,X582HE161RUS,2.00,2000.00,Ford,Focus,2015
4,92918M178RUS,1.00,5700.00,Ford,Focus,2014
...,...,...,...,...,...,...
925,R245PO33RUS,2.00,4560.00,Kia,Rio,2011
926,M345NO90RUS,1.00,3456.00,Ford,Focus,2022
927,P235QR234RUS,2.00,2000.00,Nissan,Qashqai,2016
928,S223TU567RUS,1.00,56999.00,Lamborgini,Aventador,2000


In [53]:
if len(owners) > 20:
    owners = owners.iloc[:-20]
owners

Unnamed: 0,CarNumber,SURNAME
0,Y163O8161RUS,KELLY
1,E432XX77RUS,RICHARDSON
2,7184TT36RUS,CRUZ
3,X582HE161RUS,JONES
4,92918M178RUS,CARTER
...,...,...
681,T914CT197RUS,KING
682,E41977152RUS,GARCIA
684,9464EX178RUS,ROBERTS
685,O50197197RUS,FOSTER


In [54]:
new_owners = pd.DataFrame({
    'CarNumber': ['Y246ZA123RUS', 'B138CD456RUS', 'F230FG789RUS'],
    'SURNAME': ['BLACK', 'SAND', 'WHITE']
})

owners = pd.concat([owners, new_owners], ignore_index=True)
owners

Unnamed: 0,CarNumber,SURNAME
0,Y163O8161RUS,KELLY
1,E432XX77RUS,RICHARDSON
2,7184TT36RUS,CRUZ
3,X582HE161RUS,JONES
4,92918M178RUS,CARTER
...,...,...
509,O50197197RUS,FOSTER
510,7608EE777RUS,WRIGHT
511,Y246ZA123RUS,BLACK
512,B138CD456RUS,SAND


In [55]:
# Новый DataFrame с только номерами автомобилей, которые существуют в обоих DataFrame
inner_join = pd.merge(fines, owners, on='CarNumber', how='inner')
inner_join

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2.00,3200.00,Ford,Focus,1989,KELLY
1,E432XX77RUS,1.00,6500.00,Toyota,Camry,1995,RICHARDSON
2,7184TT36RUS,1.00,2100.00,Ford,Focus,1984,CRUZ
3,X582HE161RUS,2.00,2000.00,Ford,Focus,2015,JONES
4,92918M178RUS,1.00,5700.00,Ford,Focus,2014,CARTER
...,...,...,...,...,...,...,...
898,M942OT152RUS,1.00,1000.00,Ford,Focus,1981,WILLIAMS
899,Y187O8161RUS,2.00,3000.00,Ford,Focus,1992,THOMAS
900,7064C8197RUS,1.00,2500.00,Volkswagen,Passat,2007,MARTIN
901,8437XX154RUS,1.00,12800.00,Ford,Focus,2005,YOUNG


In [56]:
# Новый DataFrame со всеми номерами автомобилей, которые существуют в обоих DataFrame
outer_join = pd.merge(fines, owners, on='CarNumber', how='outer')
outer_join

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,704687163RUS,2.00,1400.00,Ford,Focus,2004.00,SMITH
1,704787163RUS,2.00,2800.00,Ford,Focus,1992.00,CRUZ
2,704987163RUS,2.00,8594.59,Ford,Focus,1985.00,TURNER
3,705287163RUS,2.00,2000.00,Ford,Focus,1980.00,CLARK
4,705387163RUS,2.00,700.00,Ford,Focus,1987.00,BENNETT
...,...,...,...,...,...,...,...
928,Y969O8197RUS,2.00,7800.00,Ford,Focus,1992.00,CARTER
929,Y973O8197RUS,2.00,8594.59,Ford,Focus,2005.00,JIMENEZ
930,Y973O8197RUS,1.00,34800.00,Ford,Focus,2003.00,JIMENEZ
931,Y973O8197RUS,1.00,69600.00,Ford,Focus,2017.00,JIMENEZ


In [57]:
# Новый DataFrame только с номерами автомобилей из DataFrame fines
fines_only = pd.merge(fines, owners, on='CarNumber', how='left')
fines_only

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2.00,3200.00,Ford,Focus,1989,KELLY
1,E432XX77RUS,1.00,6500.00,Toyota,Camry,1995,RICHARDSON
2,7184TT36RUS,1.00,2100.00,Ford,Focus,1984,CRUZ
3,X582HE161RUS,2.00,2000.00,Ford,Focus,2015,JONES
4,92918M178RUS,1.00,5700.00,Ford,Focus,2014,CARTER
...,...,...,...,...,...,...,...
925,R245PO33RUS,2.00,4560.00,Kia,Rio,2011,
926,M345NO90RUS,1.00,3456.00,Ford,Focus,2022,
927,P235QR234RUS,2.00,2000.00,Nissan,Qashqai,2016,
928,S223TU567RUS,1.00,56999.00,Lamborgini,Aventador,2000,


In [62]:
# Новый DataFrame только с номерами автомобилей из DataFrame owners
owners_only = pd.merge(fines, owners, on='CarNumber', how='right')
owners_only

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2.00,3200.00,Ford,Focus,1989.00,KELLY
1,Y163O8161RUS,2.00,1600.00,Ford,Focus,1980.00,KELLY
2,Y163O8161RUS,2.00,8594.59,Ford,Focus,2019.00,KELLY
3,Y163O8161RUS,2.00,300.00,Ford,Focus,2017.00,KELLY
4,Y163O8161RUS,2.00,800.00,Ford,Focus,2017.00,KELLY
...,...,...,...,...,...,...,...
901,O50197197RUS,2.00,7800.00,Ford,Focus,1992.00,FOSTER
902,7608EE777RUS,1.00,4000.00,Skoda,Octavia,2000.00,WRIGHT
903,Y246ZA123RUS,,,,,,BLACK
904,B138CD456RUS,,,,,,SAND


In [63]:
pivot_table = fines.pivot_table(
    values='Fines', 
    index=['Make', 'Model'],
    columns='Year',           
    aggfunc='sum',            
)
pivot_table

Unnamed: 0_level_0,Year,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2022
Make,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Ford,Focus,63694.59,391989.17,177283.76,61500.0,101094.59,134078.35,85994.59,79494.59,84694.59,57500.0,...,79889.17,99900.0,198689.17,128789.17,236594.59,113794.59,256000.0,341794.59,94894.59,3456.0
Ford,Mondeo,,,,,,,,,,8600.0,...,,34400.0,,,,46200.0,,,,
Kia,Rio,,,,,,,,,,,...,4560.0,,,,,,,,,
Lamborgini,Aventador,,,,,,,,,,,...,,,,,,,,,,
Nissan,Qashqai,,,,,,,,,,,...,,,,,,2000.0,,,,
Porche,Cayman,,,,,,,,,,,...,,,,,,,,,,
Skoda,Octavia,3500.0,,6900.0,11594.59,3600.0,10294.59,600.0,8300.0,,91400.0,...,500.0,500.0,15194.59,20300.0,46394.59,300.0,,156200.0,9500.0,
Toyota,Camry,46000.0,8594.59,,7200.0,,,,,,22400.0,...,,112194.59,,,,2000.0,20800.0,13000.0,18100.0,
Toyota,Corolla,,,2000.0,,,,45000.0,8000.0,,4000.0,...,8594.59,,,,,,9600.0,,27000.0,
Volkswagen,Golf,30900.0,,,8594.59,300.0,24000.0,,9300.0,,7000.0,...,20100.0,,10400.0,,2300.0,,,,,


## Save both the fines and owners dataframes to CSV files without an index

In [64]:
fines.to_csv('../data/fines.csv', index=False)
owners.to_csv('../data/owners.csv', index=False)