# Exercise 02 : Preprocessing
## Required data

In [1]:
# List the input CSV to confirm it is present at the expected relative path.
%ls ../data/auto.csv

../data/auto.csv


## Imports

In [2]:
import pandas as pd

## Read the data

In [3]:
# Load the raw table from disk; the `ID` column becomes the row index.
df = pd.read_csv('../data/auto.csv', index_col='ID')
df

Unnamed: 0_level_0,CarNumber,Make_n_model,Refund,Fines,History
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Y163O8161RUS,Ford Focus,2.0,3200.0,
1,E432XX77RUS,Toyota Camry,1.0,6500.0,
2,7184TT36RUS,Ford Focus,1.0,2100.0,
3,X582HE161RUS,Ford Focus,2.0,2000.0,
4,E34877152RUS,Ford Focus,2.0,6100.0,
...,...,...,...,...,...
926,Y163O8161RUS,Ford Focus,2.0,1600.0,
927,M0309X197RUS,Ford Focus,1.0,22300.0,
928,O673E8197RUS,Ford Focus,2.0,600.0,
929,8610T8154RUS,Ford Focus,1.0,2000.0,


In [4]:
# Number of non-missing entries in every column of the freshly loaded frame.
df.notna().sum()

CarNumber       931
Make_n_model    931
Refund          914
Fines           869
History          82
dtype: int64

## Drop duplicates by `CarNumber`, `Make_n_model` and `Fines`

In [5]:
# Rows that agree on `CarNumber`, `Make_n_model` and `Fines` are treated as
# duplicates; only the last occurrence of each is kept. `df` is modified in place.
dedup_keys = ['CarNumber', 'Make_n_model', 'Fines']
df.drop_duplicates(subset=dedup_keys, keep='last', inplace=True)

df

Unnamed: 0_level_0,CarNumber,Make_n_model,Refund,Fines,History
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Y163O8161RUS,Ford Focus,2.0,3200.0,
1,E432XX77RUS,Toyota Camry,1.0,6500.0,
2,7184TT36RUS,Ford Focus,1.0,2100.0,
3,X582HE161RUS,Ford Focus,2.0,2000.0,
5,92918M178RUS,Ford Focus,1.0,5700.0,
...,...,...,...,...,...
926,Y163O8161RUS,Ford Focus,2.0,1600.0,
927,M0309X197RUS,Ford Focus,1.0,22300.0,
928,O673E8197RUS,Ford Focus,2.0,600.0,
929,8610T8154RUS,Ford Focus,1.0,2000.0,


In [6]:
# Non-missing entries per column after the duplicate rows were removed.
df.notna().sum()

CarNumber       725
Make_n_model    725
Refund          713
Fines           665
History          65
dtype: int64

## Work with missing values
### Count `NaN` values in each column

In [7]:
# Tally the missing cells of each column (summing down the rows).
df.isnull().sum()

CarNumber         0
Make_n_model      0
Refund           12
Fines            60
History         660
dtype: int64

### Drop columns with more than 500 missing values

In [8]:
# `thresh` is the minimum number of non-missing values a column needs in order
# to be kept. Requiring (row count - 500) of them removes every column that has
# more than 500 NaNs. `df` is modified in place.
min_non_missing = len(df) - 500
df.dropna(axis=1, thresh=min_non_missing, inplace=True)

df

Unnamed: 0_level_0,CarNumber,Make_n_model,Refund,Fines
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Y163O8161RUS,Ford Focus,2.0,3200.0
1,E432XX77RUS,Toyota Camry,1.0,6500.0
2,7184TT36RUS,Ford Focus,1.0,2100.0
3,X582HE161RUS,Ford Focus,2.0,2000.0
5,92918M178RUS,Ford Focus,1.0,5700.0
...,...,...,...,...
926,Y163O8161RUS,Ford Focus,2.0,1600.0
927,M0309X197RUS,Ford Focus,1.0,22300.0
928,O673E8197RUS,Ford Focus,2.0,600.0
929,8610T8154RUS,Ford Focus,1.0,2000.0


In [9]:
# Missing cells per column once the sparse column has been dropped.
df.isnull().sum()

CarNumber        0
Make_n_model     0
Refund          12
Fines           60
dtype: int64

### Replace each missing value in the `Refund` column with the nearest preceding non-missing value in that column (forward fill)

In [10]:
# Forward-fill `Refund`: every NaN takes the closest earlier non-missing value
# in the column (a NaN in the very first row would stay missing).
#
# The result is assigned back to the column instead of calling
# `df['Refund'].fillna(..., inplace=True)`. An in-place call on a selected
# column is chained assignment: recent pandas versions warn about it and, with
# Copy-on-Write enabled, it no longer changes `df` at all. `fillna(method=...)`
# is likewise deprecated in favour of the dedicated `ffill()` method.
df['Refund'] = df['Refund'].ffill()

df

Unnamed: 0_level_0,CarNumber,Make_n_model,Refund,Fines
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Y163O8161RUS,Ford Focus,2.0,3200.0
1,E432XX77RUS,Toyota Camry,1.0,6500.0
2,7184TT36RUS,Ford Focus,1.0,2100.0
3,X582HE161RUS,Ford Focus,2.0,2000.0
5,92918M178RUS,Ford Focus,1.0,5700.0
...,...,...,...,...
926,Y163O8161RUS,Ford Focus,2.0,1600.0
927,M0309X197RUS,Ford Focus,1.0,22300.0
928,O673E8197RUS,Ford Focus,2.0,600.0
929,8610T8154RUS,Ford Focus,1.0,2000.0


In [11]:
# Missing cells per column after `Refund` has been filled.
df.isnull().sum()

CarNumber        0
Make_n_model     0
Refund           0
Fines           60
dtype: int64

### Replace all the missing values in the `Fines` column with the mean value of this column 

In [12]:
# Average of the known `Fines` values; NaNs are skipped by default.
mean_value = df.loc[:, 'Fines'].mean()
mean_value

8594.586466165414

In [13]:
# Replace every missing `Fines` entry with `mean_value`.
#
# The filled Series is assigned back to the column rather than using
# `df['Fines'].fillna(..., inplace=True)`: an in-place call on a selected column
# is chained assignment, which recent pandas versions warn about and which does
# not modify `df` when Copy-on-Write is active.
df['Fines'] = df['Fines'].fillna(mean_value)

df

Unnamed: 0_level_0,CarNumber,Make_n_model,Refund,Fines
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Y163O8161RUS,Ford Focus,2.0,3200.000000
1,E432XX77RUS,Toyota Camry,1.0,6500.000000
2,7184TT36RUS,Ford Focus,1.0,2100.000000
3,X582HE161RUS,Ford Focus,2.0,2000.000000
5,92918M178RUS,Ford Focus,1.0,5700.000000
...,...,...,...,...
926,Y163O8161RUS,Ford Focus,2.0,1600.000000
927,M0309X197RUS,Ford Focus,1.0,22300.000000
928,O673E8197RUS,Ford Focus,2.0,600.000000
929,8610T8154RUS,Ford Focus,1.0,2000.000000


In [14]:
# Final check: missing cells per column after both fills.
df.isnull().sum()

CarNumber       0
Make_n_model    0
Refund          0
Fines           0
dtype: int64

### Split and parse the make and model

In [15]:
# Split `Make_n_model` on whitespace into two new columns.
# `n=1` splits only at the first run of whitespace, so the first word becomes
# `Make` and everything after it becomes `Model`; a multi-word model name
# therefore still yields exactly two columns. The vectorised `.str` accessor
# also passes missing values through as NaN instead of raising, which
# `apply(str.split)` would do on a non-string entry.
# `expand=True` returns a DataFrame aligned on `df.index`.
df[['Make', 'Model']] = df['Make_n_model'].str.split(n=1, expand=True)
df

Unnamed: 0_level_0,CarNumber,Make_n_model,Refund,Fines,Make,Model
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,Y163O8161RUS,Ford Focus,2.0,3200.000000,Ford,Focus
1,E432XX77RUS,Toyota Camry,1.0,6500.000000,Toyota,Camry
2,7184TT36RUS,Ford Focus,1.0,2100.000000,Ford,Focus
3,X582HE161RUS,Ford Focus,2.0,2000.000000,Ford,Focus
5,92918M178RUS,Ford Focus,1.0,5700.000000,Ford,Focus
...,...,...,...,...,...,...
926,Y163O8161RUS,Ford Focus,2.0,1600.000000,Ford,Focus
927,M0309X197RUS,Ford Focus,1.0,22300.000000,Ford,Focus
928,O673E8197RUS,Ford Focus,2.0,600.000000,Ford,Focus
929,8610T8154RUS,Ford Focus,1.0,2000.000000,Ford,Focus


In [16]:
# The combined column is redundant now that `Make` and `Model` exist;
# remove it from `df` in place.
df.drop(columns=['Make_n_model'], inplace=True)
df

Unnamed: 0_level_0,CarNumber,Refund,Fines,Make,Model
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Y163O8161RUS,2.0,3200.000000,Ford,Focus
1,E432XX77RUS,1.0,6500.000000,Toyota,Camry
2,7184TT36RUS,1.0,2100.000000,Ford,Focus
3,X582HE161RUS,2.0,2000.000000,Ford,Focus
5,92918M178RUS,1.0,5700.000000,Ford,Focus
...,...,...,...,...,...
926,Y163O8161RUS,2.0,1600.000000,Ford,Focus
927,M0309X197RUS,1.0,22300.000000,Ford,Focus
928,O673E8197RUS,2.0,600.000000,Ford,Focus
929,8610T8154RUS,1.0,2000.000000,Ford,Focus


## Save the dataframe to a JSON file

In [17]:
# Write the cleaned frame as a JSON array with one object per row
# (the `records` orientation does not include the `ID` index).
df.to_json(path_or_buf='../data/auto.json', orient='records')

In [18]:
# Print the exported JSON file to inspect what was written.
# NOTE(review): `%cat` relies on a shell alias and prints the whole file —
# it may be unavailable on non-POSIX kernels; confirm before sharing.
%cat ../data/auto.json

[{"CarNumber":"Y163O8161RUS","Refund":2.0,"Fines":3200.0,"Make":"Ford","Model":"Focus"},{"CarNumber":"E432XX77RUS","Refund":1.0,"Fines":6500.0,"Make":"Toyota","Model":"Camry"},{"CarNumber":"7184TT36RUS","Refund":1.0,"Fines":2100.0,"Make":"Ford","Model":"Focus"},{"CarNumber":"X582HE161RUS","Refund":2.0,"Fines":2000.0,"Make":"Ford","Model":"Focus"},{"CarNumber":"92918M178RUS","Refund":1.0,"Fines":5700.0,"Make":"Ford","Model":"Focus"},{"CarNumber":"H234YH197RUS","Refund":2.0,"Fines":6000.0,"Make":"Ford","Model":"Focus"},{"CarNumber":"E40577152RUS","Refund":1.0,"Fines":8594.5864661654,"Make":"Ford","Model":"Focus"},{"CarNumber":"707987163RUS","Refund":2.0,"Fines":2200.0,"Make":"Ford","Model":"Focus"},{"CarNumber":"K330T8197RUS","Refund":2.0,"Fines":8200.0,"Make":"Skoda","Model":"Octavia"},{"CarNumber":"X786CO96RUS","Refund":1.0,"Fines":8594.5864661654,"Make":"Ford","Model":"Focus"},{"CarNumber":"C477M7161RUS","Refund":1.0,"Fines":2500.0,"Make":"Ford","Model":"Focus"},{"CarNumber":"O2199719