##### Load libs:

In [6]:
import pandas as pd
import numpy as np
import requests

##### 1.Load data:

In [7]:
pd.options.display.float_format = '{:.2f}'.format
data = pd.read_json('../data/auto.json')
print(data)

        CarNumber  Refund    Fines    Make    Model
0    Y163O8161RUS       2  3200.00    Ford    Focus
1     E432XX77RUS       1  6500.00  Toyota    Camry
2     7184TT36RUS       1  2100.00    Ford    Focus
3    X582HE161RUS       2  2000.00    Ford    Focus
4    92918M178RUS       1  5700.00    Ford    Focus
..            ...     ...      ...     ...      ...
720  Y163O8161RUS       2  1600.00    Ford    Focus
721  M0309X197RUS       1 22300.00    Ford    Focus
722  O673E8197RUS       2   600.00    Ford    Focus
723  8610T8154RUS       1  2000.00    Ford    Focus
724  H419XE197RUS       2  8594.59  Toyota  Corolla

[725 rows x 5 columns]


##### 2.New DataFrame:

In [8]:
np.random.seed(21)
sample = data.sample(n=200, random_state=21)

sample['Refund'] = np.random.choice(data['Refund'].dropna(), size=200)
sample['Fines'] = np.random.choice(data['Fines'].dropna(), size=200)

concat_rows = pd.concat([data, sample], ignore_index=True)
print(f"\nConcat DataFrame: {concat_rows}")
print(concat_rows.info())


Concat DataFrame:         CarNumber  Refund    Fines        Make   Model
0    Y163O8161RUS       2  3200.00        Ford   Focus
1     E432XX77RUS       1  6500.00      Toyota   Camry
2     7184TT36RUS       1  2100.00        Ford   Focus
3    X582HE161RUS       2  2000.00        Ford   Focus
4    92918M178RUS       1  5700.00        Ford   Focus
..            ...     ...      ...         ...     ...
920  8182XX154RUS       1   500.00        Ford   Focus
921   X796TH96RUS       2 40600.00        Ford   Focus
922  T011MY163RUS       1  2300.00        Ford   Focus
923   T341CC96RUS       2  7100.00  Volkswagen  Passat
924   T119CT96RUS       2   200.00        Ford   Focus

[925 rows x 5 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 925 entries, 0 to 924
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   CarNumber  925 non-null    object 
 1   Refund     925 non-null    int64  
 2   Fines      925 non-null    f

##### 3.DataFrame with year:

In [9]:
years = pd.Series(np.random.randint(1980, 2020, size=len(concat_rows)), name='Year')
fines = pd.concat([concat_rows, years], axis=1)
print(fines)

        CarNumber  Refund    Fines        Make   Model  Year
0    Y163O8161RUS       2  3200.00        Ford   Focus  2013
1     E432XX77RUS       1  6500.00      Toyota   Camry  2000
2     7184TT36RUS       1  2100.00        Ford   Focus  2000
3    X582HE161RUS       2  2000.00        Ford   Focus  1982
4    92918M178RUS       1  5700.00        Ford   Focus  1994
..            ...     ...      ...         ...     ...   ...
920  8182XX154RUS       1   500.00        Ford   Focus  1980
921   X796TH96RUS       2 40600.00        Ford   Focus  2001
922  T011MY163RUS       1  2300.00        Ford   Focus  2015
923   T341CC96RUS       2  7100.00  Volkswagen  Passat  2013
924   T119CT96RUS       2   200.00        Ford   Focus  1997

[925 rows x 6 columns]


##### 4.New DataFrame with help surname.json:

Load data surname.json:

In [10]:
np.random.seed(21)
surnames_data = pd.read_json(
	'../../datasets/surname.json',
	
)
surnames_data.columns=surnames_data.iloc[0].to_list()
surnames_data=surnames_data.drop(0)
print(f"---Most popular surname---\n{surnames_data.sort_values('COUNT', key=lambda x: x.astype(int), ascending=False).head(1)}")

selected_surnames = np.random.choice(surnames_data['NAME'].to_list(), size=len(set(data['CarNumber'].to_list())), replace=True)

owners = pd.DataFrame({"CarNumber": list(set(data['CarNumber'].to_list())), "SURNAME": selected_surnames})
print(f"\n---OWNERS---\n{owners}")

---Most popular surname---
     NAME    COUNT RANK
85  SMITH  2442977    1

---OWNERS---
        CarNumber     SURNAME
0    T399KX197RUS  RICHARDSON
1    9491H7178RUS        ROSS
2     E811HE77RUS      MORGAN
3    7065C8197RUS      BAILEY
4    7368C8197RUS       LOPEZ
..            ...         ...
526  T018OX197RUS    CAMPBELL
527  7363C8197RUS        HALL
528   Y156E877RUS       BAKER
529  8165XX154RUS        DIAZ
530   9502XX38RUS      MORGAN

[531 rows x 2 columns]


Add 5 string:

In [11]:
fines = pd.concat(
	[
		pd.DataFrame(
			[
				{'CarNumber': 'ABC123RUS', 'Refund': 1.0, 'Fines': 2500.0, 'Make': 'Toyota', 'Model': 'Camry', 'Year': 2015},
				{'CarNumber': 'XYZ789RUS', 'Refund': 2.0, 'Fines': 1800.0, 'Make': 'Honda', 'Model': 'Civic', 'Year': 2018},
				{'CarNumber': 'DEF456RUS', 'Refund': 1.0, 'Fines': 3200.0, 'Make': 'Ford', 'Model': 'Focus', 'Year': 2016},
				{'CarNumber': 'GHI789RUS', 'Refund': 2.0, 'Fines': 1500.0, 'Make': 'Nissan', 'Model': 'Altima', 'Year': 2019},
				{'CarNumber': 'JKL012RUS', 'Refund': 1.0, 'Fines': 2800.0, 'Make': 'Chevrolet', 'Model': 'Malibu', 'Year': 2017}
			]
		),
		fines
	],
	ignore_index=True
)
print(fines)

        CarNumber  Refund    Fines        Make   Model  Year
0       ABC123RUS    1.00  2500.00      Toyota   Camry  2015
1       XYZ789RUS    2.00  1800.00       Honda   Civic  2018
2       DEF456RUS    1.00  3200.00        Ford   Focus  2016
3       GHI789RUS    2.00  1500.00      Nissan  Altima  2019
4       JKL012RUS    1.00  2800.00   Chevrolet  Malibu  2017
..            ...     ...      ...         ...     ...   ...
925  8182XX154RUS    1.00   500.00        Ford   Focus  1980
926   X796TH96RUS    2.00 40600.00        Ford   Focus  2001
927  T011MY163RUS    1.00  2300.00        Ford   Focus  2015
928   T341CC96RUS    2.00  7100.00  Volkswagen  Passat  2013
929   T119CT96RUS    2.00   200.00        Ford   Focus  1997

[930 rows x 6 columns]


Del 20 and add 3 string in owners:

In [12]:
owners = owners.iloc[:-20]
owners = pd.concat(
    [
        pd.DataFrame([
            {'CarNumber': 'MNO555RUS', 'SURNAME': 'Smith'},
            {'CarNumber': 'PQR999RUS', 'SURNAME': 'Johnson'},
            {'CarNumber': 'STU111RUS', 'SURNAME': 'Williams'}
        ]),
        owners
    ],
    ignore_index=True
)
print(owners)

        CarNumber     SURNAME
0       MNO555RUS       Smith
1       PQR999RUS     Johnson
2       STU111RUS    Williams
3    T399KX197RUS  RICHARDSON
4    9491H7178RUS        ROSS
..            ...         ...
509   Y7129Y50RUS   HERNANDEZ
510  7831C8197RUS       BAKER
511  T6428M116RUS      MARTIN
512  9470EX178RUS      WRIGHT
513   T6439O50RUS        HILL

[514 rows x 2 columns]


Merge DataFrames:

In [13]:
inner_merge = pd.merge(fines, owners, on='CarNumber', how='inner')
print(f"---Inner join---\n{inner_merge.shape}")

outer_merge = pd.merge(fines, owners, on='CarNumber', how='outer')
print(f"\n---Full join---\n{outer_merge.shape}")

left_merge = pd.merge(fines, owners, on='CarNumber', how='left')
print(f"\n---Left join---\n{left_merge.shape}")

right_merge = pd.merge(fines, owners, on='CarNumber', how='right')
print(f"\nRight join---\n{right_merge.shape}")

---Inner join---
(890, 7)

---Full join---
(933, 7)

---Left join---
(930, 7)

Right join---
(893, 7)


##### 5.New DataFrame by year:

In [14]:
pivot_table = fines.pivot_table(
    values='Fines',
    index=['Make','Model'],
    columns='Year',
    aggfunc='sum',
    fill_value=0
).replace(0.0, 'nan')

print(pivot_table)

Year                    1980     1981      1982      1983     1984      1985  \
Make       Model                                                               
Chevrolet  Malibu        nan      nan       nan       nan      nan       nan   
Ford       Focus   188794.59 56994.59 117000.00 157800.00 80700.00 153378.35   
           Mondeo        nan      nan       nan       nan      nan       nan   
Honda      Civic         nan      nan       nan       nan      nan       nan   
Nissan     Altima        nan      nan       nan       nan      nan       nan   
Skoda      Octavia       nan 10094.59       nan   8200.00 13694.59   3900.00   
Toyota     Camry    19800.00   800.00       nan       nan      nan   9494.59   
           Corolla  16000.00 12100.00       nan       nan      nan  24000.00   
Volkswagen Golf          nan      nan   4600.00   2600.00      nan       nan   
           Jetta         nan  9000.00       nan       nan      nan   1000.00   
           Passat        nan  8594.59   

##### 6.Save DataFrames:

In [15]:
print(fines)
print(owners)

fines.to_csv('../data/fines.csv',index=False)
owners.to_csv('../data/owners.csv',index=False)

        CarNumber  Refund    Fines        Make   Model  Year
0       ABC123RUS    1.00  2500.00      Toyota   Camry  2015
1       XYZ789RUS    2.00  1800.00       Honda   Civic  2018
2       DEF456RUS    1.00  3200.00        Ford   Focus  2016
3       GHI789RUS    2.00  1500.00      Nissan  Altima  2019
4       JKL012RUS    1.00  2800.00   Chevrolet  Malibu  2017
..            ...     ...      ...         ...     ...   ...
925  8182XX154RUS    1.00   500.00        Ford   Focus  1980
926   X796TH96RUS    2.00 40600.00        Ford   Focus  2001
927  T011MY163RUS    1.00  2300.00        Ford   Focus  2015
928   T341CC96RUS    2.00  7100.00  Volkswagen  Passat  2013
929   T119CT96RUS    2.00   200.00        Ford   Focus  1997

[930 rows x 6 columns]
        CarNumber     SURNAME
0       MNO555RUS       Smith
1       PQR999RUS     Johnson
2       STU111RUS    Williams
3    T399KX197RUS  RICHARDSON
4    9491H7178RUS        ROSS
..            ...         ...
509   Y7129Y50RUS   HERNANDEZ
510 

In [16]:
print(fines.count())

CarNumber    930
Refund       930
Fines        930
Make         930
Model        919
Year         930
dtype: int64


In [18]:
print(owners)

        CarNumber     SURNAME
0       MNO555RUS       Smith
1       PQR999RUS     Johnson
2       STU111RUS    Williams
3    T399KX197RUS  RICHARDSON
4    9491H7178RUS        ROSS
..            ...         ...
509   Y7129Y50RUS   HERNANDEZ
510  7831C8197RUS       BAKER
511  T6428M116RUS      MARTIN
512  9470EX178RUS      WRIGHT
513   T6439O50RUS        HILL

[514 rows x 2 columns]


In [19]:
print(fines.count())

CarNumber    930
Refund       930
Fines        930
Make         930
Model        919
Year         930
dtype: int64
