In [362]:
import pandas as pd 
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from matplotlib import pyplot as pyplot

Data Preprocessing

In [363]:
data = pd.read_csv('payment-practices.csv')

In [372]:
data.head()

Unnamed: 0,Average time to pay,% Invoices paid within 30 days,% Invoices paid between 31 and 60 days,% Invoices paid later than 60 days,% Invoices not paid within agreed terms,Shortest (or only) standard payment period,Longest standard payment period,Maximum contractual payment period,Payment terms have changed,Suppliers notified of changes,Participates in payment codes,E-Invoicing offered,Supply-chain financing offered,Policy covers charges for remaining on supplier list,Charges have been made for remaining on supplier list
0,25.0,77.0,20.0,3.0,89.0,30.0,60.0,60.0,False,True,False,True,False,False,False
1,69.0,7.0,28.0,65.0,0.0,30.0,70.820073,75.0,False,True,False,False,False,False,False
2,21.0,89.0,9.0,2.0,47.0,2.0,30.0,30.0,False,True,False,False,False,True,True
3,14.0,90.0,8.0,2.0,31.0,2.0,30.0,30.0,False,True,False,False,False,True,True
4,15.0,96.0,2.0,2.0,14.0,2.0,30.0,30.0,False,False,False,False,False,True,False


In [373]:
data.describe()

Unnamed: 0,Average time to pay,% Invoices paid within 30 days,% Invoices paid between 31 and 60 days,% Invoices paid later than 60 days,% Invoices not paid within agreed terms,Shortest (or only) standard payment period,Longest standard payment period,Maximum contractual payment period
count,77681.0,77681.0,77681.0,77681.0,77681.0,77681.0,77681.0,77681.0
mean,36.6813,55.978029,30.119553,13.896134,27.819875,20.522241,70.820073,73.439266
std,21.970824,27.176096,19.660205,16.004926,22.651489,20.650062,36.063737,82.300103
min,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,25.0,34.0,14.0,3.0,10.0,3.0,60.0,45.0
50%,36.0,55.978029,30.119553,9.0,25.0,20.522241,70.820073,60.0
75%,45.0,79.0,43.0,17.0,39.0,30.0,70.820073,90.0
max,1120.0,100.0,100.0,100.0,100.0,1000.0,1264.0,5475.0


In [364]:
data.drop(columns=['Report Id','Start date', 'End date', 'Filing date', 'Company',
       'Company number','URL','Payments made in the reporting period'],axis=1,inplace=True)

In [365]:
missing_data = pd.DataFrame({'total_missing': data.isnull().sum(), 'perc_missing': (data.isnull().mean())*100})
missing_data

Unnamed: 0,total_missing,perc_missing
Average time to pay,6859,8.829701
% Invoices paid within 30 days,6859,8.829701
% Invoices paid between 31 and 60 days,6859,8.829701
% Invoices paid later than 60 days,6859,8.829701
% Invoices not paid within agreed terms,6708,8.635316
Shortest (or only) standard payment period,6708,8.635316
Longest standard payment period,30562,39.342954
Maximum contractual payment period,6708,8.635316
Payment terms have changed,6708,8.635316
Suppliers notified of changes,76016,97.856619


In [366]:
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = ['Payment_terms_have_changed', 'Suppliers_notified_of_changes','Participates in payment codes' , 'E-Invoicing_offered', 'Supply-chain_financing_offered','Policy covers charges for remaining on supplier list','Charges have been made for remaining on supplier list']
data = data.astype({'Participates in payment codes':'bool'})

In [367]:
for var in numerical_cols:
    data[var].fillna(data[var].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[var].fillna(data[var].mean(), inplace=True)


In [368]:
missing_data = pd.DataFrame({'total_missing': data.isnull().sum(), 'perc_missing': (data.isnull().mean())*100})
missing_data

Unnamed: 0,total_missing,perc_missing
Average time to pay,0,0.0
% Invoices paid within 30 days,0,0.0
% Invoices paid between 31 and 60 days,0,0.0
% Invoices paid later than 60 days,0,0.0
% Invoices not paid within agreed terms,0,0.0
Shortest (or only) standard payment period,0,0.0
Longest standard payment period,0,0.0
Maximum contractual payment period,0,0.0
Payment terms have changed,6708,8.635316
Suppliers notified of changes,76016,97.856619


In [369]:
missing_data_cols = ['Payment terms have changed', 'Suppliers notified of changes', 'E-Invoicing offered', 'Supply-chain financing offered','Policy covers charges for remaining on supplier list','Charges have been made for remaining on supplier list']

In [370]:
for i in missing_data_cols:
    data_complete = data.drop(columns=missing_data_cols,axis=1)
    data_complete[i] = data[i]
    data_incomplete = data_complete[data_complete.isnull().any(axis=1)]
    data_complete.dropna(inplace=True)
    data_complete = data_complete.astype({i:'bool'})
    X_train = data_complete.drop(columns=[i])
    y_train = data_complete[i]
    model = LogisticRegression()
    model.fit(X_train, y_train)
    predicted_values = model.predict(data_incomplete.drop(columns=[i]))
    data_incomplete[i] = predicted_values
    data.update(data_incomplete)

  data.update(data_incomplete)
  data.update(data_incomplete)
  data.update(data_incomplete)
  data.update(data_incomplete)
  data.update(data_incomplete)
  data.update(data_incomplete)


In [371]:
print(data.isnull().sum())

Average time to pay                                      0
% Invoices paid within 30 days                           0
% Invoices paid between 31 and 60 days                   0
% Invoices paid later than 60 days                       0
% Invoices not paid within agreed terms                  0
Shortest (or only) standard payment period               0
Longest standard payment period                          0
Maximum contractual payment period                       0
Payment terms have changed                               0
Suppliers notified of changes                            0
Participates in payment codes                            0
E-Invoicing offered                                      0
Supply-chain financing offered                           0
Policy covers charges for remaining on supplier list     0
Charges have been made for remaining on supplier list    0
dtype: int64
