In [53]:
import pandas as pd
import numpy as np

read the JSON file that you saved in ex02

In [54]:
enrichment = pd.read_json('../data/auto.json')

one of the columns has the float type, 

so let us define the format of it in pandas using pd.options.display.float_format: 

floats should be displayed with two decimals

In [55]:
pd.set_option('display.float_format',  '{:,.2f}'.format)

checking missing values in Model col

In [56]:
enrichment['Model'].isnull().value_counts()

False    716
True       9
Name: Model, dtype: int64

In [57]:
enrichment

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
0,Y163O8161RUS,2,3200.00,Ford,Focus
1,E432XX77RUS,1,6500.00,Toyota,Camry
2,7184TT36RUS,1,2100.00,Ford,Focus
3,X582HE161RUS,2,2000.00,Ford,Focus
4,92918M178RUS,1,5700.00,Ford,Focus
...,...,...,...,...,...
720,Y163O8161RUS,2,1600.00,Ford,Focus
721,M0309X197RUS,1,22300.00,Ford,Focus
722,O673E8197RUS,2,600.00,Ford,Focus
723,8610T8154RUS,1,2000.00,Ford,Focus


create a sample with 200 new observations with random_state = 21

∗ the sample should not have new combinations of the car number, make
and model, so the whole dataset will be consistent in these terms

∗ there are no restrictions on the refund and fines, you can take any value
from these columns at random and use it towards any car number

In [58]:
sample = enrichment.sample(n=200, random_state=21).reset_index(drop=True)
sample

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
0,M0299X197RUS,2,19200.00,Ford,Focus
1,83298C154RUS,2,3800.00,Ford,Focus
2,H957HY161RUS,1,2000.00,Ford,Focus
3,T941CC96RUS,1,2000.00,Ford,Focus
4,H966HY161RUS,1,500.00,Ford,Focus
...,...,...,...,...,...
195,8182XX154RUS,1,200.00,Ford,Focus
196,X796TH96RUS,1,500.00,Ford,Focus
197,T011MY163RUS,2,4000.00,Ford,Focus
198,T341CC96RUS,2,1000.00,Volkswagen,Passat


In [59]:
sample_for_Refund = enrichment.sample(n=200, random_state=42).reset_index(drop=True)
sample_for_Refund['Refund']

0      1
1      2
2      2
3      1
4      1
      ..
195    1
196    1
197    2
198    1
199    2
Name: Refund, Length: 200, dtype: int64

In [60]:
sample_for_Fines = enrichment.sample(n=200, random_state=30).reset_index(drop=True)
sample_for_Fines['Fines']

0     8,300.00
1     3,400.00
2     1,300.00
3     2,400.00
4     2,400.00
        ...   
195   9,500.00
196   2,000.00
197   4,400.00
198     800.00
199   1,400.00
Name: Fines, Length: 200, dtype: float64

In [61]:
sample['Fines'] = sample_for_Fines['Fines']
sample['Refund'] = sample_for_Refund['Refund']
sample

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
0,M0299X197RUS,1,8300.00,Ford,Focus
1,83298C154RUS,2,3400.00,Ford,Focus
2,H957HY161RUS,2,1300.00,Ford,Focus
3,T941CC96RUS,1,2400.00,Ford,Focus
4,H966HY161RUS,1,2400.00,Ford,Focus
...,...,...,...,...,...
195,8182XX154RUS,1,9500.00,Ford,Focus
196,X796TH96RUS,1,2000.00,Ford,Focus
197,T011MY163RUS,2,4400.00,Ford,Focus
198,T341CC96RUS,1,800.00,Volkswagen,Passat


concatenate the sample with the initial dataframe to a new dataframe concat_rows

In [62]:
concat_rows = pd.concat([enrichment, sample])
concat_rows

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
0,Y163O8161RUS,2,3200.00,Ford,Focus
1,E432XX77RUS,1,6500.00,Toyota,Camry
2,7184TT36RUS,1,2100.00,Ford,Focus
3,X582HE161RUS,2,2000.00,Ford,Focus
4,92918M178RUS,1,5700.00,Ford,Focus
...,...,...,...,...,...
195,8182XX154RUS,1,9500.00,Ford,Focus
196,X796TH96RUS,1,2000.00,Ford,Focus
197,T011MY163RUS,2,4400.00,Ford,Focus
198,T341CC96RUS,1,800.00,Volkswagen,Passat


enrich the dataframe concat_rows by a new column with the data generated

use np.random.seed(21) before generating the years

In [63]:
np.random.seed(21)

create a series with the name Year using random integers from 1980 to 2019

In [64]:
year_Series = pd.Series([np.random.randint(1980,2019) for i in range(len(concat_rows))])
year_Series

0      1989
1      1995
2      1984
3      2015
4      2014
       ... 
920    1996
921    2002
922    1996
923    2012
924    1984
Length: 925, dtype: int64

concatenate the series with the dataframe and name it fines

In [65]:
concat_rows['Year'] = year_Series
fines = concat_rows

fines

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2,3200.00,Ford,Focus,1989
1,E432XX77RUS,1,6500.00,Toyota,Camry,1995
2,7184TT36RUS,1,2100.00,Ford,Focus,1984
3,X582HE161RUS,2,2000.00,Ford,Focus,2015
4,92918M178RUS,1,5700.00,Ford,Focus,2014
...,...,...,...,...,...,...
195,8182XX154RUS,1,9500.00,Ford,Focus,2012
196,X796TH96RUS,1,2000.00,Ford,Focus,1998
197,T011MY163RUS,2,4400.00,Ford,Focus,1983
198,T341CC96RUS,1,800.00,Volkswagen,Passat,1983


create a new dataframe with the car numbers and their owners

get the most popular surnames(you can find the file surname.json in
the attachments) in the US

In [66]:
# немного подшаманить с файлом, чтобы индексы были на месте

surname = pd.read_json('../data/surname.json')
surname.columns = surname.iloc[0]
surname = surname.iloc[1: , :].reset_index(drop=True)

In [67]:
surname

Unnamed: 0,NAME,COUNT,RANK
0,ADAMS,427865,42
1,ALLEN,482607,33
2,ALVAREZ,233983,92
3,ANDERSON,784404,15
4,BAILEY,277845,72
...,...,...,...
95,WILLIAMS,1625252,3
96,WILSON,801882,14
97,WOOD,250715,84
98,WRIGHT,458980,35


create a new series with the surnames (they should not have special characters like commas, brackets, etc.) 

from the data you gathered, the count should be equal to the number of unique car numbers using the sample (use random_state = 21)

In [68]:
# проверим список фамилий на наличие спец символов
surname['NAME'].unique()

array(['ADAMS', 'ALLEN', 'ALVAREZ', 'ANDERSON', 'BAILEY', 'BAKER',
       'BENNETT', 'BROOKS', 'BROWN', 'CAMPBELL', 'CARTER', 'CASTILLO',
       'CHAVEZ', 'CLARK', 'COLLINS', 'COOK', 'COOPER', 'COX', 'CRUZ',
       'DAVIS', 'DIAZ', 'EDWARDS', 'EVANS', 'FLORES', 'FOSTER', 'GARCIA',
       'GOMEZ', 'GONZALEZ', 'GRAY', 'GREEN', 'GUTIERREZ', 'HALL',
       'HARRIS', 'HERNANDEZ', 'HILL', 'HOWARD', 'HUGHES', 'JACKSON',
       'JAMES', 'JIMENEZ', 'JOHNSON', 'JONES', 'KELLY', 'KIM', 'KING',
       'LEE', 'LEWIS', 'LONG', 'LOPEZ', 'MARTIN', 'MARTINEZ', 'MENDOZA',
       'MILLER', 'MITCHELL', 'MOORE', 'MORALES', 'MORGAN', 'MORRIS',
       'MURPHY', 'MYERS', 'NELSON', 'NGUYEN', 'ORTIZ', 'PARKER', 'PATEL',
       'PEREZ', 'PETERSON', 'PHILLIPS', 'PRICE', 'RAMIREZ', 'RAMOS',
       'REED', 'REYES', 'RICHARDSON', 'RIVERA', 'ROBERTS', 'ROBINSON',
       'RODRIGUEZ', 'ROGERS', 'ROSS', 'RUIZ', 'SANCHEZ', 'SANDERS',
       'SCOTT', 'SMITH', 'STEWART', 'TAYLOR', 'THOMAS', 'THOMPSON',
       'TORRES', 'TU

In [69]:
# соберем уникальные номера машин и посчитаем их количество
unique_car_numbers = concat_rows['CarNumber'].drop_duplicates().reset_index(drop=True)
len_owners = len(unique_car_numbers)

In [70]:
# наберем рандомный список фамилий с random_state=21
owner_name = surname['NAME'].sample(n=len_owners, random_state=21, replace=True, ignore_index=True)

∗ create the dataframe owners with 2 columns: CarNumber and SURNAME

In [71]:
owners = pd.DataFrame(data=unique_car_numbers)
owners['SURNAME'] = owner_name

owners

Unnamed: 0,CarNumber,SURNAME
0,Y163O8161RUS,RICHARDSON
1,E432XX77RUS,ROSS
2,7184TT36RUS,MORGAN
3,X582HE161RUS,BAILEY
4,92918M178RUS,LOPEZ
...,...,...
526,O136HO197RUS,CAMPBELL
527,O22097197RUS,HALL
528,M0309X197RUS,BAKER
529,O673E8197RUS,DIAZ


append 5 more observations to the fines dataframe (come up with your own ideas of CarNumber, etc.)

In [72]:
data1 = pd.Series(['X000XX00RUS', 'X111XX11RUS', 'X222XX22RUS','X333XX33RUS', 'X444XX44RUS'], name='CarNumber')
data2 = pd.Series([np.random.choice(fines['Refund']) for i in range(5)], name= 'Refund')
data3 = pd.Series([np.random.choice(fines['Fines']) for i in range(5)], name= 'Fines')
data4 = pd.Series([np.random.choice(fines['Make']) for i in range(5)], name= 'Make')
data5 = pd.Series([np.random.choice(fines['Model']) for i in range(5)], name= 'Model')
data6 = pd.Series([np.random.choice(fines['Year']) for i in range(5)], name= 'Year')


df = pd.DataFrame([data1, data2, data3, data4, data5, data6])
fines = pd.concat([fines, df.T])
fines

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2,3200.00,Ford,Focus,1989
1,E432XX77RUS,1,6500.00,Toyota,Camry,1995
2,7184TT36RUS,1,2100.00,Ford,Focus,1984
3,X582HE161RUS,2,2000.00,Ford,Focus,2015
4,92918M178RUS,1,5700.00,Ford,Focus,2014
...,...,...,...,...,...,...
0,X000XX00RUS,1,5800.00,Toyota,Focus,2017
1,X111XX11RUS,1,41700.00,Ford,Focus,2008
2,X222XX22RUS,1,200.00,Ford,Focus,1985
3,X333XX33RUS,2,800.00,Ford,Focus,2000


delete the dataframe last 20 observations from the owners and add 3 new observations (they are not the same as those you add to the fines dataframe)

In [73]:
owners = owners.drop(owners.tail(20).index)
col1 = pd.Series(['X555XX55RUS', 'X666XX66RUS', 'X777XX77RUS'], name='CarNumber')
col2 = pd.Series([np.random.choice(owners['SURNAME']) for i in range(3)], name= 'SURNAME')
Car_n_Name = pd.DataFrame([col1, col2])
owners = pd.concat([owners, Car_n_Name.T])
owners

Unnamed: 0,CarNumber,SURNAME
0,Y163O8161RUS,RICHARDSON
1,E432XX77RUS,ROSS
2,7184TT36RUS,MORGAN
3,X582HE161RUS,BAILEY
4,92918M178RUS,LOPEZ
...,...,...
509,O50197197RUS,WRIGHT
510,7608EE777RUS,HILL
0,X555XX55RUS,TAYLOR
1,X666XX66RUS,ROSS


 join both dataframes:


the new dataframe should have only the car numbers that exist in both
dataframes

In [74]:
fines_owner_intersection = pd.merge(fines,owners, on='CarNumber',how='inner')
fines_owner_intersection

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2,3200.00,Ford,Focus,1989,RICHARDSON
1,Y163O8161RUS,2,1600.00,Ford,Focus,1999,RICHARDSON
2,E432XX77RUS,1,6500.00,Toyota,Camry,1995,ROSS
3,E432XX77RUS,2,13000.00,Toyota,Camry,1992,ROSS
4,7184TT36RUS,1,2100.00,Ford,Focus,1984,MORGAN
...,...,...,...,...,...,...,...
894,E41977152RUS,2,2400.00,Ford,Focus,2001,BAKER
895,9464EX178RUS,2,2100.00,Ford,Focus,1993,MARTIN
896,O50197197RUS,2,7800.00,Ford,Focus,1986,WRIGHT
897,7608EE777RUS,1,4000.00,Skoda,Octavia,2013,HILL


the new dataframe should have all the car numbers that exist in both dataframes

In [75]:
fines_owner_union = pd.merge(fines, owners, on='CarNumber',how='outer')
fines_owner_union 

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2,3200.00,Ford,Focus,1989,RICHARDSON
1,Y163O8161RUS,2,1600.00,Ford,Focus,1999,RICHARDSON
2,E432XX77RUS,1,6500.00,Toyota,Camry,1995,ROSS
3,E432XX77RUS,2,13000.00,Toyota,Camry,1992,ROSS
4,7184TT36RUS,1,2100.00,Ford,Focus,1984,MORGAN
...,...,...,...,...,...,...,...
928,X333XX33RUS,2,800.00,Ford,Focus,2000,
929,X444XX44RUS,1,7000.00,Ford,Focus,1987,
930,X555XX55RUS,,,,,,TAYLOR
931,X666XX66RUS,,,,,,ROSS


thenewdataframeshouldhaveonlythecarnumbersfromthefinesdataframe

In [76]:
fines_carnumbers_owner = pd.merge(fines, owners, on='CarNumber',how='left')
fines_carnumbers_owner

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2,3200.00,Ford,Focus,1989,RICHARDSON
1,E432XX77RUS,1,6500.00,Toyota,Camry,1995,ROSS
2,7184TT36RUS,1,2100.00,Ford,Focus,1984,MORGAN
3,X582HE161RUS,2,2000.00,Ford,Focus,2015,BAILEY
4,92918M178RUS,1,5700.00,Ford,Focus,2014,LOPEZ
...,...,...,...,...,...,...,...
925,X000XX00RUS,1,5800.00,Toyota,Focus,2017,
926,X111XX11RUS,1,41700.00,Ford,Focus,2008,
927,X222XX22RUS,1,200.00,Ford,Focus,1985,
928,X333XX33RUS,2,800.00,Ford,Focus,2000,


the new dataframe should have only the car numbers from the owners dataframe

In [77]:
owner_carnumbers_fines = pd.merge(fines, owners, on='CarNumber',how='right')
owner_carnumbers_fines

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2,3200.00,Ford,Focus,1989,RICHARDSON
1,Y163O8161RUS,2,1600.00,Ford,Focus,1999,RICHARDSON
2,E432XX77RUS,1,6500.00,Toyota,Camry,1995,ROSS
3,E432XX77RUS,2,13000.00,Toyota,Camry,1992,ROSS
4,7184TT36RUS,1,2100.00,Ford,Focus,1984,MORGAN
...,...,...,...,...,...,...,...
897,7608EE777RUS,1,4000.00,Skoda,Octavia,2013,HILL
898,7608EE777RUS,1,34000.00,Skoda,Octavia,2011,HILL
899,X555XX55RUS,,,,,,TAYLOR
900,X666XX66RUS,,,,,,ROSS


create a pivot table from the fines dataframe

In [78]:
pd.pivot_table(fines,
               columns='Year',
               values='Fines',
               index=['Make', 'Model'],
               aggfunc={'Fines': np.sum})


Unnamed: 0_level_0,Year,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
Make,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Ford,Focus,62600.0,323300.0,138800.0,137800.0,107900.0,470600.0,62200.0,71500.0,37600.0,202000.0,...,219600.0,239200.0,96600.0,182400.0,237600.0,84400.0,348400.0,106300.0,146500.0,103200.0
Ford,Mondeo,,,46200.0,,,,,,,,...,,,,,41100.0,,,,8600.0,
Skoda,Octavia,7000.0,1900.0,36600.0,16300.0,300.0,146400.0,,2000.0,5100.0,15600.0,...,,2500.0,37000.0,1700.0,11800.0,7000.0,27600.0,45300.0,2400.0,153200.0
Toyota,Camry,12000.0,,,,1000.0,,24400.0,,,800.0,...,,22400.0,,7500.0,,,,,,
Toyota,Corolla,,6800.0,,12800.0,2500.0,,,54800.0,,7800.0,...,8718.51,6000.0,,,3600.0,,,,,
Toyota,Focus,,,,,,,,,,,...,,,,,,,,,5800.0,
Volkswagen,Golf,32200.0,2000.0,5000.0,200.0,,168000.0,,,,300.0,...,,,,,,43600.0,,,,
Volkswagen,Jetta,,1000.0,,,,9000.0,,,46000.0,,...,,,,,1100.0,,,,,
Volkswagen,Passat,900.0,4100.0,,1900.0,4200.0,,16000.0,2000.0,10300.0,,...,,9500.0,,,1600.0,6700.0,,,,
Volkswagen,Touareg,,,,,,,,,,,...,5800.0,,,,,,,,,


In [79]:
fines.to_csv('../data/fines.csv', index=False)
owners.to_csv('../data/owners.csv', index=False)