# EX02 Preprocessing

In [136]:
import pandas as pd

## Read the CSV file and make `ID` the index column

In [137]:
df = pd.read_csv('../data/auto.csv', index_col='ID')
df.head()

Unnamed: 0_level_0,CarNumber,Make_n_model,Refund,Fines,History
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Y163O8161RUS,Ford Focus,2.0,3200.0,
1,E432XX77RUS,Toyota Camry,1.0,6500.0,
2,7184TT36RUS,Ford Focus,1.0,2100.0,
3,X582HE161RUS,Ford Focus,2.0,2000.0,
4,E34877152RUS,Ford Focus,2.0,6100.0,


## Count the number of observations using the method *count()*

In [138]:
with_dup = df.count()
with_dup

CarNumber       931
Make_n_model    931
Refund          914
Fines           869
History          82
dtype: int64

## Drop the duplicates, taking into account only the following columns: `CarNumber`, `Make_n_model`, `Fines`

In [139]:
df[df['CarNumber'] == 'E34877152RUS']

Unnamed: 0_level_0,CarNumber,Make_n_model,Refund,Fines,History
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
4,E34877152RUS,Ford Focus,2.0,6100.0,
35,E34877152RUS,Ford Focus,2.0,6100.0,
331,E34877152RUS,Ford Focus,2.0,6100.0,
838,E34877152RUS,Ford Focus,2.0,36600.0,3.0


In [140]:
df.drop_duplicates(subset=['CarNumber', 'Make_n_model', 'Fines'], keep='last', inplace=True)
df[df['CarNumber'] == 'E34877152RUS']

Unnamed: 0_level_0,CarNumber,Make_n_model,Refund,Fines,History
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
331,E34877152RUS,Ford Focus,2.0,6100.0,
838,E34877152RUS,Ford Focus,2.0,36600.0,3.0


In [141]:
without_dup = df.count()
without_dup

CarNumber       725
Make_n_model    725
Refund          713
Fines           665
History          65
dtype: int64

In [142]:
with_dup

CarNumber       931
Make_n_model    931
Refund          914
Fines           869
History          82
dtype: int64

In [143]:
with_dup - without_dup

CarNumber       206
Make_n_model    206
Refund          201
Fines           204
History          17
dtype: int64

In [144]:
dup_percent = (1 - without_dup.unique() / with_dup.unique()) * 100
dup_percent.round(2)

array([22.13, 21.99, 23.48, 20.73])

## Work with missing values

In [145]:
df.isna().sum()

CarNumber         0
Make_n_model      0
Refund           12
Fines            60
History         660
dtype: int64

In [146]:
df.dropna(axis=1, thresh=500, inplace=True)
df.isna().sum()

CarNumber        0
Make_n_model     0
Refund          12
Fines           60
dtype: int64

In [147]:
df['Refund'].fillna(method='ffill', inplace=True)
df.isna().sum()

CarNumber        0
Make_n_model     0
Refund           0
Fines           60
dtype: int64

In [148]:
df['Fines'].fillna(df['Fines'].mean(), inplace=True)
df.isna().sum()

CarNumber       0
Make_n_model    0
Refund          0
Fines           0
dtype: int64

In [149]:
df

Unnamed: 0_level_0,CarNumber,Make_n_model,Refund,Fines
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Y163O8161RUS,Ford Focus,2.0,3200.000000
1,E432XX77RUS,Toyota Camry,1.0,6500.000000
2,7184TT36RUS,Ford Focus,1.0,2100.000000
3,X582HE161RUS,Ford Focus,2.0,2000.000000
5,92918M178RUS,Ford Focus,1.0,5700.000000
...,...,...,...,...
926,Y163O8161RUS,Ford Focus,2.0,1600.000000
927,M0309X197RUS,Ford Focus,1.0,22300.000000
928,O673E8197RUS,Ford Focus,2.0,600.000000
929,8610T8154RUS,Ford Focus,1.0,2000.000000


## Split and parse the `make` and `model`


In [150]:
df['Make_n_model'].unique()

array(['Ford Focus', 'Toyota Camry', 'Skoda Octavia', 'Volkswagen Passat',
       'Volkswagen Golf', 'Volkswagen', 'Volkswagen Jetta',
       'Volkswagen Touareg', 'Toyota Corolla', 'Audi', 'Ford Mondeo',
       'Volvo', 'BMW'], dtype=object)

In [151]:
df['Make'] = df['Make_n_model'].apply(lambda x: x.split()[0])
df['Model'] = df['Make_n_model'].apply(lambda x: x.split()[1] if len(x.split()) == 2 else None)
df.drop('Make_n_model', inplace=True, axis=1)

In [152]:
df.count()

CarNumber    725
Refund       725
Fines        725
Make         725
Model        716
dtype: int64

In [153]:
df.head()

Unnamed: 0_level_0,CarNumber,Refund,Fines,Make,Model
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Y163O8161RUS,2.0,3200.0,Ford,Focus
1,E432XX77RUS,1.0,6500.0,Toyota,Camry
2,7184TT36RUS,1.0,2100.0,Ford,Focus
3,X582HE161RUS,2.0,2000.0,Ford,Focus
5,92918M178RUS,1.0,5700.0,Ford,Focus


In [154]:
df.to_json('../data/auto.json', orient='records')
print('File create.')

File create.


In [155]:
!head -c 175 '../data/auto.json'

[{"CarNumber":"Y163O8161RUS","Refund":2.0,"Fines":3200.0,"Make":"Ford","Model":"Focus"},{"CarNumber":"E432XX77RUS","Refund":1.0,"Fines":6500.0,"Make":"Toyota","Model":"Camry"}
