In [42]:
import pandas as pd 
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [43]:
data = pd.read_csv('payment-practices.csv')

In [44]:
data.drop(columns=['Report Id','Start date', 'End date', 'Filing date', 'Company',
       'Company number','URL','Payments made in the reporting period'],axis=1,inplace=True)

In [45]:
data.columns

Index(['Average time to pay', '% Invoices paid within 30 days',
       '% Invoices paid between 31 and 60 days',
       '% Invoices paid later than 60 days',
       '% Invoices not paid within agreed terms',
       'Shortest (or only) standard payment period',
       'Longest standard payment period', 'Maximum contractual payment period',
       'Payment terms have changed', 'Suppliers notified of changes',
       'Participates in payment codes', 'E-Invoicing offered',
       'Supply-chain financing offered',
       'Policy covers charges for remaining on supplier list',
       'Charges have been made for remaining on supplier list'],
      dtype='object')

In [46]:
data

Unnamed: 0,Average time to pay,% Invoices paid within 30 days,% Invoices paid between 31 and 60 days,% Invoices paid later than 60 days,% Invoices not paid within agreed terms,Shortest (or only) standard payment period,Longest standard payment period,Maximum contractual payment period,Payment terms have changed,Suppliers notified of changes,Participates in payment codes,E-Invoicing offered,Supply-chain financing offered,Policy covers charges for remaining on supplier list,Charges have been made for remaining on supplier list
0,25.0,77.0,20.0,3.0,89.0,30.0,60.0,60.0,False,,False,True,False,False,False
1,69.0,7.0,28.0,65.0,0.0,30.0,,75.0,False,,False,False,False,False,False
2,21.0,89.0,9.0,2.0,47.0,2.0,30.0,30.0,False,,False,False,False,True,True
3,14.0,90.0,8.0,2.0,31.0,2.0,30.0,30.0,False,,False,False,False,True,True
4,15.0,96.0,2.0,2.0,14.0,2.0,30.0,30.0,False,,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77676,27.0,62.0,37.0,1.0,52.0,0.0,,60.0,False,,False,False,False,False,False
77677,15.0,96.0,2.0,2.0,7.0,1.0,45.0,40.0,False,,False,False,False,False,False
77678,17.0,91.0,9.0,0.0,9.0,30.0,,60.0,False,,False,True,False,False,False
77679,27.0,93.0,6.0,1.0,7.0,30.0,,60.0,False,,True,True,False,False,False


In [47]:
missing_data = pd.DataFrame({'total_missing': data.isnull().sum(), 'perc_missing': (data.isnull().sum()/77681)*100})
missing_data

Unnamed: 0,total_missing,perc_missing
Average time to pay,6859,8.829701
% Invoices paid within 30 days,6859,8.829701
% Invoices paid between 31 and 60 days,6859,8.829701
% Invoices paid later than 60 days,6859,8.829701
% Invoices not paid within agreed terms,6708,8.635316
Shortest (or only) standard payment period,6708,8.635316
Longest standard payment period,30562,39.342954
Maximum contractual payment period,6708,8.635316
Payment terms have changed,6708,8.635316
Suppliers notified of changes,76016,97.856619


In [48]:
data.describe()

Unnamed: 0,Average time to pay,% Invoices paid within 30 days,% Invoices paid between 31 and 60 days,% Invoices paid later than 60 days,% Invoices not paid within agreed terms,Shortest (or only) standard payment period,Longest standard payment period,Maximum contractual payment period
count,70822.0,70822.0,70822.0,70822.0,70973.0,70973.0,47119.0,70973.0
mean,36.6813,55.978029,30.119553,13.896134,27.819875,20.522241,70.820073,73.439266
std,23.010175,28.461688,20.59025,16.762054,23.69779,21.603915,46.30543,86.101648
min,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,24.0,32.0,13.0,3.0,9.0,1.0,60.0,45.0
50%,34.0,58.0,28.0,7.0,22.0,14.0,60.0,60.0
75%,46.0,81.0,45.0,18.0,41.0,30.0,90.0,90.0
max,1120.0,100.0,100.0,100.0,100.0,1000.0,1264.0,5475.0


In [49]:
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = ['Company_number', 'Company_number', 'Company', 'Payment_terms_have_changed', 'Suppliers_notified_of_changes', 'E-Invoicing_offered', 'Supply-chain_financing_offered']

In [51]:
for var in numerical_cols:
    data[var].fillna(data[var].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[var].fillna(data[var].mean(), inplace=True)


In [52]:
missing_data = pd.DataFrame({'total_missing': data.isnull().sum(), 'perc_missing': (data.isnull().sum()/77681)*100})
missing_data

Unnamed: 0,total_missing,perc_missing
Average time to pay,0,0.0
% Invoices paid within 30 days,0,0.0
% Invoices paid between 31 and 60 days,0,0.0
% Invoices paid later than 60 days,0,0.0
% Invoices not paid within agreed terms,0,0.0
Shortest (or only) standard payment period,0,0.0
Longest standard payment period,0,0.0
Maximum contractual payment period,0,0.0
Payment terms have changed,6708,8.635316
Suppliers notified of changes,76016,97.856619
