In [17]:
#Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import plot_tree
from sklearn.inspection import PartialDependenceDisplay
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
import category_encoders as ce
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# I. Read dataset

In [18]:
df_normalized_notdecided = pd.read_csv('04_df_normalized_PRED_notdecided.csv', delimiter=',')
df_normalized_notdecided.head()

Unnamed: 0,Result_offer,Amount_DE,Amount_HD,Amount_ID,Amount_OTHER,Amount_PP,Amount_PS,Amount_SP,Total_Amount,SR_codigo,Bid_Notbid,Resolution_time,Client_type,Market_segment,Client_BU,Country,Client_importance,Coverage_model
0,Pipeline,0.0,0.0,0.0,0.0012,0.0,0.0,0.0,0.0001006678,SR-3449381,Not_decided_yet,0.024497,Contractor,BD4,Field Services,ES,MA1,Sales Rep
1,Pipeline,9.090909e-09,0.0,0.0,0.0,0.0,0.0,0.0,3.355592e-09,SR-3449749,Not_decided_yet,0.024497,End User,E9B,Power Products,ES,MA3,Sales Rep
2,Pipeline,0.0,0.0,0.0,0.0,0.0,2.7e-05,0.0,1.677796e-05,SR-3449844,Not_decided_yet,0.024497,End User,EN3,Power Systems,ES,Not informed,Marketing
3,Pipeline,2.727273e-05,0.0,0.000188,0.0,0.0,0.0,0.0,2.013355e-05,SR-3465617,Not_decided_yet,0.024497,B2B Distribution,Not informed,Home & Distribution,ES,MG1,Inside Sales
4,Lost,1.818182e-05,0.0,0.0,0.0,0.0,0.0,0.0,6.711184e-06,SR-3511658,Not_decided_yet,0.024497,Contractor,BDZ,Power Products,ES,MG2,Marketing


# II. Encoding using same techniques for df used for ML training

In [19]:
# Load the dataset
df_encoded = df_normalized_notdecided.copy()

# Encoding mappings
country_mapping = {'AD': 0, 'ES': 1, 'GB': 2, 'Not informed': 3, 'PT': 4}
coverage_model_mapping = {'Delegated to Partner': 0, 'Inside Sales': 1, 'Marketing': 2, 'Not Covered': 3,
                          'Not informed': 4, 'Outsourced Sales': 5, 'Sales Rep': 6}
client_importance_mapping = {'MA3': 3, 'MA2': 2, 'MA1': 1, 'MG3': 3, 'MG2': 2, 'MG1': 1,
                             'MS3': 3, 'MS2': 2, 'MS1': 1, 'Not informed': 0}
client_type_mapping = {'B2B Distribution': 0.5342105263157895, 'Contractor': 0.45616605616605616,
                       'DIY & RETAILER': 0.6111111111111112, 'Design Firm': 0.5272727272727272,
                       'Digital & Service Provider': 0.6106194690265486, 'Electrician': 0.47555555555555556,
                       'End User': 0.7135789832821563, 'IT Channel Partner': 0.5948275862068966,
                       'Internal Schneider Entity': 0.4, 'Not informed': 0.5458333333333333,
                       'Original Equipment Manufacturer': 0.5647668393782384, 'Panel Builder': 0.5777202072538861,
                       'System Integrator': 0.58203125}
market_segment_mapping = {'Not informed': 2245, 'IDZ': 1069, 'BDZ': 1037, 'EN2': 436, 'B41': 304}
client_BU_mapping = {'Power Products': 3473, 'INDUSTRIAL AUTOMATION': 2861, 'Power Systems': 2146,
                     'Home & Distribution': 1643, 'Digital Energy': 1080, 'Secure Power': 699,
                     'Field Services': 563, 'Energy Management': 367, 'Not informed': 240}

# Apply mappings to the DataFrame
df_encoded['Country'] = df_encoded['Country'].map(country_mapping)
df_encoded['Coverage_model'] = df_encoded['Coverage_model'].map(coverage_model_mapping)
df_encoded['Client_importance'] = df_encoded['Client_importance'].map(client_importance_mapping)
df_encoded['Client_type'] = df_encoded['Client_type'].map(client_type_mapping)
df_encoded['Market_segment'] = df_encoded['Market_segment'].map(market_segment_mapping)
df_encoded['Client_BU'] = df_encoded['Client_BU'].map(client_BU_mapping)

# Handle missing values with alternate encoding methods

# A. Label Encoding for 'Country' and 'Coverage_model'
label_columns = ['Country', 'Coverage_model']
label_encoders = {}
for col in label_columns:
    if df_encoded[col].isnull().any():
        le = LabelEncoder()
        df_encoded[col] = le.fit_transform(df_encoded[col].astype(str))
        label_encoders[col] = le

# B. Target Encoding for 'Client_type'
if df_encoded['Client_type'].isnull().any():
    target_encoder = ce.TargetEncoder(cols=['Client_type'])
    df_encoded = target_encoder.fit_transform(df_encoded, df_encoded['Result_offer_binary'])

# C. Frequency Encoding for 'Market_segment'
if df_encoded['Market_segment'].isnull().any():
    freq_encoding = df_encoded['Market_segment'].value_counts().to_dict()
    df_encoded['Market_segment'] = df_encoded['Market_segment'].map(freq_encoding)

# D. Frequency Encoding for 'Client_BU'
if df_encoded['Client_BU'].isnull().any():
    client_bu_freq = df_encoded['Client_BU'].value_counts().to_dict()
    df_encoded['Client_BU'] = df_encoded['Client_BU'].map(client_bu_freq)


# E1. Encoding SR_codigo

# Create a dictionary to encode 'SR_codigo'
unique_values = df_encoded['SR_codigo'].unique()
sr_codigo_encoder = {value: idx for idx, value in enumerate(unique_values)}
 
# Replace the values in the 'SR_codigo' column with their encoded values
df_encoded['SR_codigo'] = df_encoded['SR_codigo'].map(sr_codigo_encoder)

# Drop unnecessary columns
columns_to_drop = ['Result_offer', 'Bid_Notbid']
df_encoded = df_encoded.drop(columns=columns_to_drop, errors='ignore')

# Check for remaining missing values
df_encoded = df_encoded.fillna(0)  # Handle any remaining missing values

# Display the encoded DataFrame
print(df_encoded.head())


      Amount_DE  Amount_HD  Amount_ID  Amount_OTHER  Amount_PP  Amount_PS  \
0  0.000000e+00        0.0   0.000000        0.0012        0.0   0.000000   
1  9.090909e-09        0.0   0.000000        0.0000        0.0   0.000000   
2  0.000000e+00        0.0   0.000000        0.0000        0.0   0.000027   
3  2.727273e-05        0.0   0.000188        0.0000        0.0   0.000000   
4  1.818182e-05        0.0   0.000000        0.0000        0.0   0.000000   

   Amount_SP  Total_Amount  SR_codigo  Resolution_time  Client_type  \
0        0.0  1.006678e-04          0         0.024497     0.456166   
1        0.0  3.355592e-09          1         0.024497     0.713579   
2        0.0  1.677796e-05          2         0.024497     0.713579   
3        0.0  2.013355e-05          3         0.024497     0.534211   
4        0.0  6.711184e-06          4         0.024497     0.456166   

   Market_segment  Client_BU  Country  Client_importance  Coverage_model  
0             0.0        7.0       

In [21]:
#Export to CSV
df_encoded.to_csv('04_df_encoded_PRED_notdecided.csv', index=False)