##### Load libs:

In [1]:
import pandas as pd

##### 1. Load data:

In [2]:
# Read the raw records from CSV; the 'ID' column becomes the row index.
data = pd.read_csv('../data/auto.csv', sep=',', index_col='ID')

# Show the loaded frame (pandas abbreviates long frames when printed).
print(data)

        CarNumber    Make_n_model  Refund    Fines  History
ID                                                         
0    Y163O8161RUS      Ford Focus     2.0   3200.0      NaN
1     E432XX77RUS    Toyota Camry     1.0   6500.0      NaN
2     7184TT36RUS      Ford Focus     1.0   2100.0      NaN
3    X582HE161RUS      Ford Focus     2.0   2000.0      NaN
4    E34877152RUS      Ford Focus     2.0   6100.0      NaN
..            ...             ...     ...      ...      ...
926  Y163O8161RUS      Ford Focus     2.0   1600.0      NaN
927  M0309X197RUS      Ford Focus     1.0  22300.0      NaN
928  O673E8197RUS      Ford Focus     2.0    600.0      NaN
929  8610T8154RUS      Ford Focus     1.0   2000.0      NaN
930  H419XE197RUS  Toyota Corolla     2.0      NaN      2.0

[931 rows x 5 columns]


##### 2. Count elements:

In [3]:
# Report the number of rows. len() counts rows directly, whereas
# data.count().max() is the largest per-column non-null count and would
# under-report if every column contained at least one NaN.
print(f"Count all elements: {len(data)}")

Count all elements: 931


##### 3. Delete duplicates:

In [4]:
# Remove rows that repeat the same (CarNumber, Make_n_model, Fines)
# combination, keeping the last occurrence of each. The result is reassigned
# rather than using inplace=True, which is the form pandas recommends.
data = data.drop_duplicates(
    subset=['CarNumber', 'Make_n_model', 'Fines'],
    keep='last',
)

# len() gives the row count directly; data.count().max() would only match it
# while at least one column has no NaN.
print(f"Count all elements: {len(data)}")

Count all elements: 725


##### 4. Work with NaN:

In [5]:
def _print_nan_counts(frame):
    """Print the number of missing values in each column of ``frame``."""
    for col in frame.columns:
        print(f"Count NaN in {col}: {frame[col].isna().sum()}")


# A column is dropped when it has more than this many missing values, i.e.
# fewer than len(data) - MAX_NAN_PER_COLUMN non-null values.
MAX_NAN_PER_COLUMN = 500

print("Before:")
_print_nan_counts(data)

data = data.dropna(axis=1, thresh=len(data) - MAX_NAN_PER_COLUMN)

print("After 1:")
_print_nan_counts(data)

# Both means default to None so the summary line below still prints if the
# corresponding column was removed by the dropna() step above.
mean_fine = None
mean_refund = None

if 'Refund' in data.columns:
    # Forward-fill: each missing refund takes the previous row's value.
    # .ffill() replaces the deprecated fillna(method="ffill") spelling.
    # NOTE: a NaN in the very first row has no predecessor and stays NaN.
    data['Refund'] = data['Refund'].ffill()
    mean_refund = data['Refund'].mean()

if 'Fines' in data.columns:
    # The mean is taken before filling, so it reflects observed fines only.
    mean_fine = data['Fines'].mean()
    data['Fines'] = data['Fines'].fillna(mean_fine)

print(f"mean: {mean_fine}, {mean_refund}")

print("After 2:")
_print_nan_counts(data)

print(data.head())

Before:
Count NaN in CarNumber: 0
Count NaN in Make_n_model: 0
Count NaN in Refund: 12
Count NaN in Fines: 60
Count NaN in History: 660
After 1:
Count NaN in CarNumber: 0
Count NaN in Make_n_model: 0
Count NaN in Refund: 12
Count NaN in Fines: 60
mean: 8594.586466165414, 1.5172413793103448
After 2:
Count NaN in CarNumber: 0
Count NaN in Make_n_model: 0
Count NaN in Refund: 0
Count NaN in Fines: 0
       CarNumber  Make_n_model  Refund   Fines
ID                                            
0   Y163O8161RUS    Ford Focus     2.0  3200.0
1    E432XX77RUS  Toyota Camry     1.0  6500.0
2    7184TT36RUS    Ford Focus     1.0  2100.0
3   X582HE161RUS    Ford Focus     2.0  2000.0
5   92918M178RUS    Ford Focus     1.0  5700.0


  data['Refund'] = data['Refund'].fillna(method="ffill")


##### 5. Split and parse:

In [6]:
# Split "Make Model" on the first space only, so a model name that itself
# contains spaces stays in one piece and the result always has two columns.
# The vectorised .str accessor also avoids building one Series per row.
data[['Make', 'Model']] = data['Make_n_model'].str.split(' ', n=1, expand=True)
data = data.drop(columns='Make_n_model')

# orient='records' writes a list of row objects; the ID index is not included.
data.to_json('../data/auto.json', orient='records')

print(data)

        CarNumber  Refund         Fines    Make    Model
ID                                                      
0    Y163O8161RUS     2.0   3200.000000    Ford    Focus
1     E432XX77RUS     1.0   6500.000000  Toyota    Camry
2     7184TT36RUS     1.0   2100.000000    Ford    Focus
3    X582HE161RUS     2.0   2000.000000    Ford    Focus
5    92918M178RUS     1.0   5700.000000    Ford    Focus
..            ...     ...           ...     ...      ...
926  Y163O8161RUS     2.0   1600.000000    Ford    Focus
927  M0309X197RUS     1.0  22300.000000    Ford    Focus
928  O673E8197RUS     2.0    600.000000    Ford    Focus
929  8610T8154RUS     1.0   2000.000000    Ford    Focus
930  H419XE197RUS     2.0   8594.586466  Toyota  Corolla

[725 rows x 5 columns]


In [7]:
# Show only the rows whose model is one of the listed names.
wanted_models = ["Focus", "Corolla"]
model_mask = data['Model'].isin(wanted_models)
print(data.loc[model_mask])

        CarNumber  Refund         Fines    Make    Model
ID                                                      
0    Y163O8161RUS     2.0   3200.000000    Ford    Focus
2     7184TT36RUS     1.0   2100.000000    Ford    Focus
3    X582HE161RUS     2.0   2000.000000    Ford    Focus
5    92918M178RUS     1.0   5700.000000    Ford    Focus
10   H234YH197RUS     2.0   6000.000000    Ford    Focus
..            ...     ...           ...     ...      ...
926  Y163O8161RUS     2.0   1600.000000    Ford    Focus
927  M0309X197RUS     1.0  22300.000000    Ford    Focus
928  O673E8197RUS     2.0    600.000000    Ford    Focus
929  8610T8154RUS     1.0   2000.000000    Ford    Focus
930  H419XE197RUS     2.0   8594.586466  Toyota  Corolla

[593 rows x 5 columns]


In [8]:
# Count the non-null 'Fines' values for each (Make, Model) pair.
# The column is selected with ['Fines']; .agg('Fines') expects the name of an
# aggregation function and only worked through an attribute-lookup fallback.
fines_per_model = data.groupby(['Make', 'Model'])['Fines'].count()
print(fines_per_model)

Make        Model  
Ford        Focus      575
            Mondeo       6
Skoda       Octavia     48
Toyota      Camry       16
            Corolla     18
Volkswagen  Golf        20
            Jetta        6
            Passat      22
            Touareg      5
Name: Fines, dtype: int64
