In [1]:
#Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import plot_tree
from sklearn.inspection import PartialDependenceDisplay
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
import category_encoders as ce
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier






# I. Read dataset

In [None]:
# Load the normalized bid / not-bid dataset (comma-separated) and preview the first rows
df_normalized = pd.read_csv('03_df_normalized_bidnotbid.csv', sep=',')
df_normalized.head()

Unnamed: 0,Result_offer,Amount_DE,Amount_HD,Amount_ID,Amount_OTHER,Amount_PP,Amount_PS,Amount_SP,Total_Amount,SR_codigo,Bid_Notbid,Resolution_time,Client_type,Market_segment,Client_BU,Country,Client_importance,Coverage_model,Bid_Notbid_binary
0,Lost,0.001945,0.0,0.0,0.0,0.0,0.0,0.0,0.000718,SR-2468496,Bid,0.078611,Contractor,BDZ,Power Products,ES,MA3,Sales Rep,1
1,Won,0.002045,0.0,0.1,0.0,0.02803,0.020478,0.0,0.021526,SR-2468507,Bid,0.235466,Contractor,E34,INDUSTRIAL AUTOMATION,ES,MS3,Sales Rep,1
2,Lost,0.003382,0.0,0.0,0.0,0.0,5e-06,0.0,0.001252,SR-2468656,Bid,0.032176,Contractor,I9Z,Power Products,ES,MS3,Sales Rep,1
3,Lost,0.003382,0.0,0.0,0.0,0.0,5e-06,0.0,0.001252,SR-2468663,Bid,0.025229,Contractor,I9Z,Power Products,ES,MS3,Sales Rep,1
4,Lost,0.003382,0.0,0.0,0.0,0.0,5e-06,0.0,0.001252,SR-2468668,Bid,0.025229,Contractor,I9Z,Power Products,ES,MS3,Sales Rep,1


# II. Encoding - Predictors / exclude variables

## 1. Encoding

In [3]:
# List the distinct values of every categorical column so that a suitable
# encoding can be chosen for each one.
columns_to_check = [
    'Result_offer', 'Bid_Notbid', 'Client_type',
    'Market_segment', 'Client_BU', 'Country',
    'Client_importance', 'Coverage_model', 'Bid_Notbid_binary',
]

for col_name in columns_to_check:
    col_values = df_normalized[col_name]
    # nunique() gives the number of distinct categories, unique() the categories themselves
    print(f"* Column '{col_name}':")
    print(f"  - Unique values ({col_values.nunique()}): {col_values.unique()}\n")

* Column 'Result_offer':
  - Unique values (3): ['Lost' 'Won' 'Pipeline']

* Column 'Bid_Notbid':
  - Unique values (2): ['Bid' 'Not_bid']

* Column 'Client_type':
  - Unique values (13): ['Contractor' 'B2B Distribution' 'End User' 'Panel Builder' 'Design Firm'
 'Digital & Service Provider' 'Not informed'
 'Original Equipment Manufacturer' 'Electrician' 'System Integrator'
 'IT Channel Partner' 'Internal Schneider Entity' 'DIY & RETAILER']

* Column 'Market_segment':
  - Unique values (129): ['BDZ' 'E34' 'I9Z' 'I44' 'I20' 'IDZ' 'Not informed' 'E42' 'I91' 'E33'
 'E08' 'E07' 'B41' 'I4F' 'E9F' 'I24' 'B47' 'B12' 'B15' 'EN2' 'E09' 'I46'
 'E37' 'BD1' 'I9Y' 'D91' 'EN1' 'B44' 'B17' 'E04' 'E9B' 'E46' 'I37' 'D11'
 'EN3' 'E9D' 'B14' 'I51' 'E05' 'B9A' 'I4B' 'BG1' 'I4A' 'E38' 'R02' 'ID4'
 'R01' 'I31' 'I34' 'IT3' 'BD4' 'I50' 'I40' 'ID2' 'I25' 'B38' 'B9B' 'ID3'
 'B13' 'ID1' 'B93' 'PW1' 'BG2' 'I43' 'E14' 'I28' 'B45' 'E9C' 'BDR' 'I52'
 'I21' 'E01' 'B91' 'B90' 'I60' 'BD3' 'BR4' 'D95' 'I41' 'I48' 'I93' '

In [4]:
# ENCODING
# Turn every categorical column of the normalized dataset into a numeric
# feature. All changes are applied to a copy, so df_normalized stays intact.
df_encoded = df_normalized.copy()

# A. Label Encoding for low cardinality columns.
# The fitted encoders are kept so the integer codes can be inspected later.
label_columns = ['Country', 'Coverage_model']
label_encoders = {}
for col in label_columns:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])
    label_encoders[col] = le

# B. Ordinal Encoding for 'Client_importance'.
# Only the trailing level (1-3) is kept, so the MA / MG / MS prefixes share
# one scale; 'Not informed' is coded as 0.
client_importance_mapping = {
    'MA3': 3, 'MA2': 2, 'MA1': 1,
    'MG3': 3, 'MG2': 2, 'MG1': 1,
    'MS3': 3, 'MS2': 2, 'MS1': 1,
    'Not informed': 0
}
# Series.map turns categories missing from the mapping into NaN without any
# error, so report them instead of letting them disappear silently.
unmapped_importance = sorted(
    set(df_encoded['Client_importance'].dropna().unique()) - set(client_importance_mapping),
    key=str,
)
if unmapped_importance:
    print(f"WARNING: 'Client_importance' values without an ordinal code (encoded as NaN): {unmapped_importance}")
df_encoded['Client_importance'] = df_encoded['Client_importance'].map(client_importance_mapping)

# C. Target Encoding for medium cardinality columns.
# NOTE(review): the encoder is fit on every row using the target itself, so
# any model evaluated on these same rows has already seen target information
# through 'Client_type'.
target_encoder = ce.TargetEncoder(cols=['Client_type'])
df_encoded = target_encoder.fit_transform(df_encoded, df_encoded['Bid_Notbid_binary'])

# D. Frequency Encoding for high cardinality columns: each category is
# replaced by the number of rows in which it appears.
frequency_columns = ['Market_segment', 'Client_BU']
frequency_encodings = {}
for col in frequency_columns:
    frequency_encodings[col] = df_encoded[col].value_counts().to_dict()
    df_encoded[col] = df_encoded[col].map(frequency_encodings[col])

# E. Encode the 'Result_offer' column as a signed outcome
df_encoded['Result_offer'] = df_encoded['Result_offer'].map({'Won': 1, 'Lost': -1, 'Pipeline': 0})

# F. Drop the unique identifier 'SR_codigo' and the text label 'Bid_Notbid'
# (the target is already available as 'Bid_Notbid_binary').
# 'Result_offer' is kept as a predictor.
df_encoded = df_encoded.drop(columns=['SR_codigo', 'Bid_Notbid'])

# Display the encoded dataframe
df_encoded.head()

Unnamed: 0,Result_offer,Amount_DE,Amount_HD,Amount_ID,Amount_OTHER,Amount_PP,Amount_PS,Amount_SP,Total_Amount,Resolution_time,Client_type,Market_segment,Client_BU,Country,Client_importance,Coverage_model,Bid_Notbid_binary
0,-1,0.001945,0.0,0.0,0.0,0.0,0.0,0.0,0.000718,0.078611,0.914457,1674,5697,1,3,6,1
1,1,0.002045,0.0,0.1,0.0,0.02803,0.020478,0.0,0.021526,0.235466,0.914457,238,4519,1,3,6,1
2,-1,0.003382,0.0,0.0,0.0,0.0,5e-06,0.0,0.001252,0.032176,0.914457,401,5697,1,3,6,1
3,-1,0.003382,0.0,0.0,0.0,0.0,5e-06,0.0,0.001252,0.025229,0.914457,401,5697,1,3,6,1
4,-1,0.003382,0.0,0.0,0.0,0.0,5e-06,0.0,0.001252,0.025229,0.914457,401,5697,1,3,6,1


In [5]:
# SEE WHAT WAS ENCODED - retrieve the mapping applied by each encoding step

# A. Label Encoding mappings
print("Label Encoding Mappings:")
for col, le in label_encoders.items():
    # LabelEncoder codes each class by its position in the classes_ array
    mapping = {category: code for code, category in enumerate(le.classes_)}
    print(f"Column '{col}': {mapping}")

# B. Ordinal Encoding mappings
print("\nOrdinal Encoding Mapping for 'Client_importance':")
print(client_importance_mapping)

# C. Target Encoding (Client_type)
print("\nTarget Encoding: Client_type")
# Raw share of bids per client type, computed on the un-encoded data
original_client_type_mapping = df_normalized.groupby('Client_type')['Bid_Notbid_binary'].mean().to_dict()
print("Raw target mean per category:", original_client_type_mapping)
# ce.TargetEncoder smooths each category mean towards the global mean, so the
# stored value can differ from the raw mean for rare categories. Pair every
# original category with the value actually written to df_encoded.
# The pairing is positional: the encoding cell neither drops nor reorders rows.
applied_client_type_mapping = (
    pd.DataFrame({
        'category': df_normalized['Client_type'].to_numpy(),
        'encoded': df_encoded['Client_type'].to_numpy(),
    })
    .drop_duplicates()
    .set_index('category')['encoded']
    .sort_index()
    .to_dict()
)
print("Encoded value stored in df_encoded:", applied_client_type_mapping)

# D. Frequency Encoding mappings (row counts per category in the original data)
print("\nFrequency Encoding Mappings:")
# Market_segment
market_segment_freq = df_normalized['Market_segment'].value_counts().to_dict()
print("Market_segment:", market_segment_freq)

# Client_BU
client_bu_freq = df_normalized['Client_BU'].value_counts().to_dict()
print("Client_BU:", client_bu_freq)

# E. Confirm dropped columns
print("\nDropped Columns: ['SR_codigo', 'Bid_Notbid']")


Label Encoding Mappings:
Column 'Country': {'AD': 0, 'ES': 1, 'GB': 2, 'Not informed': 3, 'PT': 4}
Column 'Coverage_model': {'Delegated to Partner': 0, 'Inside Sales': 1, 'Marketing': 2, 'Not Covered': 3, 'Not informed': 4, 'Outsourced Sales': 5, 'Sales Rep': 6}

Ordinal Encoding Mapping for 'Client_importance':
{'MA3': 3, 'MA2': 2, 'MA1': 1, 'MG3': 3, 'MG2': 2, 'MG1': 1, 'MS3': 3, 'MS2': 2, 'MS1': 1, 'Not informed': 0}

Target Encoding: Client_type
{'B2B Distribution': 0.9310535024820739, 'Contractor': 0.9144569288389514, 'DIY & RETAILER': 0.9629629629629629, 'Design Firm': 0.9088507265521797, 'Digital & Service Provider': 0.9402173913043478, 'Electrician': 0.9366197183098591, 'End User': 0.9393063583815029, 'IT Channel Partner': 0.9516129032258065, 'Internal Schneider Entity': 0.8333333333333334, 'Not informed': 0.9208791208791208, 'Original Equipment Manufacturer': 0.9215017064846417, 'Panel Builder': 0.9522058823529411, 'System Integrator': 0.9328358208955224}

Frequency Encoding M

## 2. Correlation Check:

After encoding, check the correlation of predictors with Bid_Notbid_binary to validate the effectiveness of the encodings.

In [6]:
# Correlation of every encoded column with the target, strongest positive first
correlation_matrix = df_encoded.corr()
target_corr = (
    correlation_matrix
    .loc[:, 'Bid_Notbid_binary']
    .sort_values(ascending=False)
)
print(target_corr)

Bid_Notbid_binary    1.000000
Resolution_time      0.201763
Result_offer         0.153617
Client_type          0.049485
Country              0.020019
Amount_SP            0.014360
Amount_PP            0.012579
Coverage_model       0.011462
Amount_HD            0.002521
Market_segment       0.002371
Amount_OTHER         0.000435
Total_Amount        -0.001972
Amount_DE           -0.002777
Client_BU           -0.004583
Amount_PS           -0.009678
Amount_ID           -0.009994
Client_importance   -0.015368
Name: Bid_Notbid_binary, dtype: float64


## 3. Feature Importance Analysis

Use a Random Forest or XGBoost model to analyze feature importance, ensuring the encodings lead to meaningful predictors.

In [7]:
# Fit an XGBoost classifier on all encoded predictors and rank the features
# by the importance the fitted model assigns to them.
# XGBClassifier comes from the notebook's import cell; the seed is fixed so
# that re-running the notebook reproduces the same ranking.
model = XGBClassifier(random_state=42)
X = df_encoded.drop(columns=['Bid_Notbid_binary'])
y = df_encoded['Bid_Notbid_binary']
model.fit(X, y)

# One row per feature, most important first
importance = (
    pd.DataFrame({'Feature': X.columns, 'Importance': model.feature_importances_})
    .sort_values(by='Importance', ascending=False)
)
print(importance)

              Feature  Importance
9     Resolution_time    0.899748
0        Result_offer    0.031924
13            Country    0.012032
6           Amount_PS    0.006047
10        Client_type    0.005377
8        Total_Amount    0.005197
3           Amount_ID    0.005082
1           Amount_DE    0.004826
14  Client_importance    0.004593
12          Client_BU    0.004586
11     Market_segment    0.004430
5           Amount_PP    0.003799
15     Coverage_model    0.003671
2           Amount_HD    0.003658
7           Amount_SP    0.003032
4        Amount_OTHER    0.002000


## 4. Cross-Validation

Ensure the model performance is robust by using cross-validation:

In [8]:
# 5-fold cross-validated ROC AUC of the classifier on the encoded predictors.
# cross_val_score is imported in the notebook's import cell.
# NOTE(review): 'Client_type' was target-encoded on the full dataset before
# the folds are created, so every validation fold has already contributed to
# that feature and the score may be optimistic.
scores = cross_val_score(model, X, y, cv=5, scoring='roc_auc')
print(f"Average AUC: {scores.mean():.4f}")

Average AUC: 0.9814


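**Analysis: interpretation of the AUC score**

AUC measures the ability of the model to distinguish between the positive and negative classes. It ranges from 0 to 1:

- AUC = 0.5: the model is no better than random guessing.
- AUC > 0.7: indicates a good model.
- AUC > 0.8: indicates a strong model.
- AUC > 0.9: indicates an excellent model.

The average AUC of 0.9814 obtained above therefore falls in the "excellent" range. Note that `Resolution_time` alone accounts for about 90% of the feature importance, so it is worth confirming that this variable is actually known at the moment the bid / no-bid decision is made.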


In [10]:
# Persist the encoded dataset for the next notebook; the row index is not written
df_encoded.to_csv(path_or_buf='04_df_encoded_bidnotbid.csv', index=False)