In [53]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

Dataset Info

In [54]:
# Load the payment-practices reports.
# 'Company number' is an identifier, not a quantity (the file contains values
# such as '00748096' and 'OC364213'), so it is read as text for every row
# instead of being left to type inference, which can produce a column that
# mixes integers and strings. Reading as text also keeps any leading zeros.
data = pd.read_csv('payment-practices.csv', dtype={'Company number': str})

# Preview the first rows.
data.head()

Unnamed: 0,Report Id,Start date,End date,Filing date,Company,Company number,Payments made in the reporting period,Average time to pay,% Invoices paid within 30 days,% Invoices paid between 31 and 60 days,...,Longest standard payment period,Maximum contractual payment period,Payment terms have changed,Suppliers notified of changes,Participates in payment codes,E-Invoicing offered,Supply-chain financing offered,Policy covers charges for remaining on supplier list,Charges have been made for remaining on supplier list,URL
0,2,2017-04-29,2017-10-28,2017-11-07,MEDTRONIC LIMITED,1070807,,25.0,77.0,20.0,...,60.0,60.0,False,,False,True,False,False,False,https://check-payment-practices.service.gov.uk...
1,3,2017-05-01,2017-10-31,2017-11-08,SEBDEN STEEL SERVICE CENTRES LIMITED,2553464,,69.0,7.0,28.0,...,,75.0,False,,False,False,False,False,False,https://check-payment-practices.service.gov.uk...
2,4,2017-04-15,2017-10-14,2017-11-08,GREENERGY FLEXIGRID LIMITED,7581532,,21.0,89.0,9.0,...,30.0,30.0,False,,False,False,False,True,True,https://check-payment-practices.service.gov.uk...
3,5,2017-04-15,2017-10-14,2017-11-08,GREENERGY FUELS LIMITED,4058825,,14.0,90.0,8.0,...,30.0,30.0,False,,False,False,False,True,True,https://check-payment-practices.service.gov.uk...
4,6,2017-04-15,2017-10-14,2017-11-08,GREENERGY INTERNATIONAL LIMITED,2809935,,15.0,96.0,2.0,...,30.0,30.0,False,,False,False,False,True,False,https://check-payment-practices.service.gov.uk...


In [55]:
# Print the column names, non-null counts and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77681 entries, 0 to 77680
Data columns (total 23 columns):
 #   Column                                                 Non-Null Count  Dtype  
---  ------                                                 --------------  -----  
 0   Report Id                                              77681 non-null  int64  
 1   Start date                                             77681 non-null  object 
 2   End date                                               77681 non-null  object 
 3   Filing date                                            77681 non-null  object 
 4   Company                                                77681 non-null  object 
 5   Company number                                         77681 non-null  object 
 6   Payments made in the reporting period                  63008 non-null  object 
 7   Average time to pay                                    70822 non-null  float64
 8   % Invoices paid within 30 days                

In [56]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,Report Id,Average time to pay,% Invoices paid within 30 days,% Invoices paid between 31 and 60 days,% Invoices paid later than 60 days,% Invoices not paid within agreed terms,Shortest (or only) standard payment period,Longest standard payment period,Maximum contractual payment period
count,77681.0,70822.0,70822.0,70822.0,70822.0,70973.0,70973.0,47119.0,70973.0
mean,39683.118935,36.6813,55.978029,30.119553,13.896134,27.819875,20.522241,70.820073,73.439266
std,22812.737305,23.010175,28.461688,20.59025,16.762054,23.69779,21.603915,46.30543,86.101648
min,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,19956.0,24.0,32.0,13.0,3.0,9.0,1.0,60.0,45.0
50%,39765.0,34.0,58.0,28.0,7.0,22.0,14.0,60.0,60.0
75%,59430.0,46.0,81.0,45.0,18.0,41.0,30.0,90.0,90.0
max,79107.0,1120.0,100.0,100.0,100.0,100.0,1000.0,1264.0,5475.0


In [57]:
# Count the missing values in each column of the freshly loaded frame.
missing_mask = data.isna()
missing_mask.sum()

Report Id                                                    0
Start date                                                   0
End date                                                     0
Filing date                                                  0
Company                                                      0
Company number                                               0
Payments made in the reporting period                    14673
Average time to pay                                       6859
% Invoices paid within 30 days                            6859
% Invoices paid between 31 and 60 days                    6859
% Invoices paid later than 60 days                        6859
% Invoices not paid within agreed terms                   6708
Shortest (or only) standard payment period                6708
Longest standard payment period                          30562
Maximum contractual payment period                        6708
Payment terms have changed                             

Preprocessing

In [58]:
date_columns = ['Start date', 'End date', 'Filing date']

# Remember how many values are present before parsing: errors='coerce' turns
# anything that does not match the format into NaT without raising, so the
# only way to notice lost dates is to compare the counts afterwards.
present_before = data[date_columns].notna().sum()

# Parse the three date columns as ISO 'YYYY-MM-DD' dates.
data[date_columns] = data[date_columns].apply(pd.to_datetime, format='%Y-%m-%d', errors='coerce')

# Show, per column, how many previously present values were coerced to NaT.
# (Re-running this cell on already parsed columns reports 0.)
(present_before - data[date_columns].notna().sum()).rename('unparseable dates')

In [59]:
# Numeric feature columns: every column stored as int64 or float64.
numeric_frame = data.select_dtypes(include=['int64', 'float64'])
numerical_cols = numeric_frame.columns

# Columns to be treated as categorical (listed by hand).
categorical_cols = [
    'Company number',
    'Payment terms have changed',
    'Suppliers notified of changes',
    'E-Invoicing offered',
    'Supply-chain financing offered',
]

In [60]:
# DataFrame.dropna() returns a new frame and does not modify `data`, so the
# result is kept under its own name instead of being discarded after display.
# `data` itself still contains every row, including those with missing values.
complete_rows = data.dropna()

# Rows that have a value in every column.
complete_rows

Unnamed: 0,Report Id,Start date,End date,Filing date,Company,Company number,Payments made in the reporting period,Average time to pay,% Invoices paid within 30 days,% Invoices paid between 31 and 60 days,...,Longest standard payment period,Maximum contractual payment period,Payment terms have changed,Suppliers notified of changes,Participates in payment codes,E-Invoicing offered,Supply-chain financing offered,Policy covers charges for remaining on supplier list,Charges have been made for remaining on supplier list,URL
8941,9213,2018-06-01,2018-12-01,2018-12-20,PZ CUSSONS BEAUTY LLP,OC364213,True,68.0,22.0,30.0,...,75.0,75.0,True,True,False,False,False,False,False,https://check-payment-practices.service.gov.uk...
8942,9214,2018-06-01,2018-12-01,2018-12-20,PZ CUSSONS (UK) LIMITED,00748096,True,73.0,12.0,32.0,...,75.0,75.0,True,True,False,False,False,False,False,https://check-payment-practices.service.gov.uk...
9040,9319,2018-06-01,2018-11-30,2019-01-02,AMINO COMMUNICATIONS LIMITED,03490180,True,24.0,74.0,23.0,...,60.0,60.0,True,True,False,False,False,False,False,https://check-payment-practices.service.gov.uk...
9175,9457,2018-06-30,2018-12-29,2019-01-14,CREED FOODSERVICE LIMITED,01065559,True,37.0,45.0,44.0,...,75.0,75.0,True,True,False,False,False,False,False,https://check-payment-practices.service.gov.uk...
9214,9496,2018-07-01,2018-12-31,2019-01-15,ROCHE DIAGNOSTICS LIMITED,00571546,True,34.0,66.0,21.0,...,60.0,90.0,True,True,False,False,False,False,False,https://check-payment-practices.service.gov.uk...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77161,78587,2023-07-01,2023-12-31,2024-01-17,EURO CAR PARTS LIMITED,02680212,True,51.0,42.0,30.0,...,180.0,270.0,True,True,False,True,True,False,False,https://check-payment-practices.service.gov.uk...
77267,78694,2023-07-01,2023-12-31,2024-01-18,ASTON MARTIN LAGONDA LIMITED,01199255,True,58.0,14.0,61.0,...,120.0,120.0,True,True,False,False,True,False,False,https://check-payment-practices.service.gov.uk...
77380,78807,2023-07-01,2023-12-31,2024-01-22,LEICA BIOSYSTEMS NEWCASTLE LIMITED,02163063,True,59.0,17.0,50.0,...,90.0,90.0,True,True,False,False,True,False,False,https://check-payment-practices.service.gov.uk...
77590,79017,2023-07-01,2023-12-31,2024-01-23,BELMONT GREEN FINANCE LIMITED,09837692,True,17.0,97.0,3.0,...,45.0,30.0,True,False,False,False,False,False,False,https://check-payment-practices.service.gov.uk...


In [61]:
# Re-count the missing values per column in `data` after the preprocessing cells above.
remaining_missing = data.isna()
remaining_missing.sum()

Report Id                                                    0
Start date                                                   9
End date                                                     0
Filing date                                                  0
Company                                                      0
Company number                                               0
Payments made in the reporting period                    14673
Average time to pay                                       6859
% Invoices paid within 30 days                            6859
% Invoices paid between 31 and 60 days                    6859
% Invoices paid later than 60 days                        6859
% Invoices not paid within agreed terms                   6708
Shortest (or only) standard payment period                6708
Longest standard payment period                          30562
Maximum contractual payment period                        6708
Payment terms have changed                             