In [1]:
import pandas as pd

## load JSON

In [10]:
# read the fines dataset into a dataframe; orient='records' tells pandas to
# parse the file as a list of row objects
auto_json_path = '../data/auto.json'
df = pd.read_json(auto_json_path, orient='records')

## make the following selections

In [11]:
# preview the first five rows to check how the columns were parsed
df.head(n=5)

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
0,Y163O8161RUS,2,3200.0,Ford,Focus
1,E432XX77RUS,1,6500.0,Toyota,Camry
2,7184TT36RUS,1,2100.0,Ford,Focus
3,X582HE161RUS,2,2000.0,Ford,Focus
4,92918M178RUS,1,5700.0,Ford,Focus


In [12]:
# keep only the rows whose fine is strictly greater than 2,100
fines_over_2100 = df.query('Fines > 2100')
fines_over_2100

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
0,Y163O8161RUS,2,3200.000000,Ford,Focus
1,E432XX77RUS,1,6500.000000,Toyota,Camry
4,92918M178RUS,1,5700.000000,Ford,Focus
5,H234YH197RUS,2,6000.000000,Ford,Focus
6,E40577152RUS,1,8594.586466,Ford,Focus
...,...,...,...,...,...
717,O718MM163RUS,2,8594.586466,Ford,Focus
718,7065C8197RUS,2,11400.000000,Volkswagen,Passat
719,O22097197RUS,1,24300.000000,Ford,Focus
721,M0309X197RUS,1,22300.000000,Ford,Focus


In [13]:
# keep only the rows whose fine is strictly greater than 2,100 and whose refund is 2;
# inside query() the `&` operator binds like `and`, so both comparisons are
# evaluated before they are combined
big_fines_with_refund_2 = df.query('Fines > 2100 & Refund == 2')
big_fines_with_refund_2

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
0,Y163O8161RUS,2,3200.000000,Ford,Focus
5,H234YH197RUS,2,6000.000000,Ford,Focus
7,707987163RUS,2,2200.000000,Ford,Focus
8,K330T8197RUS,2,8200.000000,Skoda,Octavia
12,M592CH197RUS,2,8594.586466,Skoda,Octavia
...,...,...,...,...,...
715,O136HO197RUS,2,7800.000000,Toyota,Corolla
716,O68897197RUS,2,12300.000000,Ford,Focus
717,O718MM163RUS,2,8594.586466,Ford,Focus
718,7065C8197RUS,2,11400.000000,Volkswagen,Passat


In [14]:
# keep only the rows whose model is one of the models listed below;
# the `@` prefix lets the query expression refer to a Python variable
model_list = ['Focus', 'Corolla']
rows_for_listed_models = df.query('Model in @model_list')
rows_for_listed_models

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
0,Y163O8161RUS,2,3200.000000,Ford,Focus
2,7184TT36RUS,1,2100.000000,Ford,Focus
3,X582HE161RUS,2,2000.000000,Ford,Focus
4,92918M178RUS,1,5700.000000,Ford,Focus
5,H234YH197RUS,2,6000.000000,Ford,Focus
...,...,...,...,...,...
720,Y163O8161RUS,2,1600.000000,Ford,Focus
721,M0309X197RUS,1,22300.000000,Ford,Focus
722,O673E8197RUS,2,600.000000,Ford,Focus
723,8610T8154RUS,1,2000.000000,Ford,Focus


In [15]:
# keep only the rows whose car number is one of the five numbers listed below
number_list = ['Y7689C197RUS', '92928M178RUS', '7788KT197RUS', 'H115YO163RUS', 'X758HY197RUS']
rows_for_listed_numbers = df.query('CarNumber in @number_list')
rows_for_listed_numbers

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
28,92928M178RUS,1,8594.586466,Ford,Focus
35,H115YO163RUS,1,2200.0,Ford,Focus
71,7788KT197RUS,2,12000.0,Ford,Focus
98,X758HY197RUS,2,24200.0,Ford,Focus
158,X758HY197RUS,2,72600.0,Ford,Focus
183,Y7689C197RUS,1,27000.0,Ford,Focus
191,92928M178RUS,1,600.0,Ford,Focus
215,H115YO163RUS,1,8594.586466,Ford,Focus
272,H115YO163RUS,2,1100.0,Ford,Focus
283,7788KT197RUS,2,8594.586466,Ford,Focus


## make the aggregations with the make and the model

In [17]:
# median fine for every make
fines_by_make = df.groupby('Make')['Fines']
fines_by_make.median()

Make
Audi          4200.0
BMW           6500.0
Ford          3500.0
Skoda         3250.0
Toyota        7700.0
Volkswagen    4300.0
Volvo         8500.0
Name: Fines, dtype: float64

In [18]:
# median fine for every (make, model) pair
# NOTE(review): Audi, BMW and Volvo show up in the per-make medians but not in
# this result - presumably their rows have no Model value and groupby drops
# missing keys by default; confirm
make_model_keys = ['Make', 'Model']
df.groupby(make_model_keys)['Fines'].median()

Make        Model  
Ford        Focus      3500.0
            Mondeo     7650.0
Skoda       Octavia    3250.0
Toyota      Camry      7700.0
            Corolla    7700.0
Volkswagen  Golf       4800.0
            Jetta      2800.0
            Passat     3500.0
            Touareg    5800.0
Name: Fines, dtype: float64

In [19]:
# number of fines behind every (make, model) pair: a median computed from a
# handful of rows deserves much less trust than one computed from hundreds
fines_by_make_model = df.groupby(['Make', 'Model'])['Fines']
fines_by_make_model.count()

Make        Model  
Ford        Focus      575
            Mondeo       6
Skoda       Octavia     48
Toyota      Camry       16
            Corolla     18
Volkswagen  Golf        20
            Jetta        6
            Passat      22
            Touareg      5
Name: Fines, dtype: int64

In [20]:
# largest and smallest fine for every (make, model) pair, as a first look
# at how widely the fines are spread
fines_by_make_model = df.groupby(['Make', 'Model'])['Fines']
fines_by_make_model.agg(['max', 'min'])

Unnamed: 0_level_0,Unnamed: 1_level_0,max,min
Make,Model,Unnamed: 2_level_1,Unnamed: 3_level_1
Ford,Focus,180000.0,100.0
Ford,Mondeo,46200.0,1100.0
Skoda,Octavia,145000.0,300.0
Toyota,Camry,22400.0,500.0
Toyota,Corolla,34300.0,900.0
Volkswagen,Golf,168000.0,200.0
Volkswagen,Jetta,46000.0,500.0
Volkswagen,Passat,29700.0,100.0
Volkswagen,Touareg,8594.586466,500.0


In [21]:
# standard deviation of the fines for every (make, model) pair, as a second
# look at how widely the fines are spread
fines_by_make_model = df.groupby(['Make', 'Model'])['Fines']
fines_by_make_model.std()

Make        Model  
Ford        Focus      15041.269437
            Mondeo     18987.329108
Skoda       Octavia    24339.742174
Toyota      Camry       6410.250654
            Corolla     9629.325617
Volkswagen  Golf       36950.839950
            Jetta      17743.026799
            Passat      6969.739135
            Touareg     3461.778173
Name: Fines, dtype: float64

## make the aggregations with the car number

In [25]:
# rank the car numbers by how many fines each one received, most fined first,
# to find those who violated the law most often
fines_per_car = df.groupby('CarNumber')['Fines'].count()
fines_per_car.sort_values(ascending=False)

CarNumber
Y7689C197RUS    4
92928M178RUS    4
7788KT197RUS    4
M0299X197RUS    3
O718MM163RUS    3
               ..
M680T9152RUS    1
M6977E152RUS    1
M701T9152RUS    1
M741T9152RUS    1
704687163RUS    1
Name: Fines, Length: 531, dtype: int64

In [29]:
# zoom in on the most fined car: take the car number at the top of the count
# ranking and pull all of its rows from the initial dataframe
# NOTE(review): the count ranking shows three car numbers tied at 4 fines, so
# which of them ends up in top_1 depends on how the sort orders ties - confirm
fines_per_car = df.groupby('CarNumber')['Fines'].count()
top_1 = fines_per_car.sort_values(ascending=False).idxmax()
df.query('CarNumber == @top_1')

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
183,Y7689C197RUS,1,27000.0,Ford,Focus
531,Y7689C197RUS,2,9000.0,Ford,Focus
554,Y7689C197RUS,2,45000.0,Ford,Focus
665,Y7689C197RUS,1,36000.0,Ford,Focus


In [27]:
# rank the car numbers by the total amount of their fines, largest total first,
# to find those who paid the most
total_fines_per_car = df.groupby('CarNumber')['Fines'].sum()
total_fines_per_car.sort_values(ascending=False)

CarNumber
X758HY197RUS    242000.0
9020YC197RUS    217500.0
M0279X197RUS    216000.0
Y352O8197RUS    207200.0
Y778EE197RUS    192000.0
                  ...   
83218C154RUS       100.0
Y166O8161RUS       100.0
705787163RUS       100.0
C58078163RUS       100.0
Y195O8161RUS       100.0
Name: Fines, Length: 531, dtype: float64

In [30]:
# zoom in on the car with the largest total of fines: take the car number at
# the top of the sum ranking and pull all of its rows from the initial dataframe
total_fines_per_car = df.groupby('CarNumber')['Fines'].sum()
top_1 = total_fines_per_car.sort_values(ascending=False).idxmax()
df.query('CarNumber == @top_1')

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
98,X758HY197RUS,2,24200.0,Ford,Focus
158,X758HY197RUS,2,72600.0,Ford,Focus
401,X758HY197RUS,2,145200.0,Ford,Focus


In [49]:
# answer the question: is any car number connected to more than one model (or make)?
# for every car number take the number of distinct models and of distinct makes
# (missing values are not counted) and keep the larger of the two; the
# vectorized row-wise max replaces a Python-level apply(max, axis=1)
number_of_models_for_number = df.groupby('CarNumber')[['Model', 'Make']].nunique().max(axis=1)
# count the car numbers with more than one distinct model/make; `> 1` rather than
# `!= 1` so that a car number with no known model and make is not reported as
# having several; to list the car numbers instead of counting them, index
# number_of_models_for_number with the same `> 1` mask
(number_of_models_for_number > 1).sum()

0

In [50]:
# same selection by model as the earlier query, written with a boolean mask;
# relies on model_list defined in that earlier cell
is_listed_model = df['Model'].isin(model_list)
df.loc[is_listed_model]

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
0,Y163O8161RUS,2,3200.000000,Ford,Focus
2,7184TT36RUS,1,2100.000000,Ford,Focus
3,X582HE161RUS,2,2000.000000,Ford,Focus
4,92918M178RUS,1,5700.000000,Ford,Focus
5,H234YH197RUS,2,6000.000000,Ford,Focus
...,...,...,...,...,...
720,Y163O8161RUS,2,1600.000000,Ford,Focus
721,M0309X197RUS,1,22300.000000,Ford,Focus
722,O673E8197RUS,2,600.000000,Ford,Focus
723,8610T8154RUS,1,2000.000000,Ford,Focus
