In [23]:
#Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

# I. Read dataset

In [24]:
# Load the cleaned dataset from '01_df_cleaned.csv' (comma-separated, which is
# the read_csv default, so no explicit delimiter is needed).
df_cleaned = pd.read_csv('01_df_cleaned.csv')

# Print the schema summary: column names, non-null counts and dtypes.
# A bare `df_cleaned.head()` placed before this line was never rendered
# (a notebook only displays the last expression of a cell), so it was dropped.
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21471 entries, 0 to 21470
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Result_offer       21471 non-null  object 
 1   Amount_DE          21471 non-null  float64
 2   Amount_HD          21471 non-null  float64
 3   Amount_ID          21471 non-null  float64
 4   Amount_OTHER       21471 non-null  float64
 5   Amount_PP          21471 non-null  float64
 6   Amount_PS          21471 non-null  float64
 7   Amount_SP          21471 non-null  float64
 8   Total_Amount       21471 non-null  float64
 9   SR_codigo          21471 non-null  object 
 10  Bid_Notbid         21471 non-null  object 
 11  Resolution_time    21471 non-null  float64
 12  Client_type        21471 non-null  object 
 13  Market_segment     21471 non-null  object 
 14  Client_BU          21471 non-null  object 
 15  Country            21471 non-null  object 
 16  Client_importance  214

# II. Normalization for ML

In [25]:
# Numeric features that get rescaled to the [0, 1] range.
columns_to_normalize = [
    'Amount_DE',
    'Amount_HD',
    'Amount_ID',
    'Amount_OTHER',
    'Amount_PP',
    'Amount_PS',
    'Amount_SP',
    'Total_Amount',
    'Resolution_time',
]

# Fit the min-max scaler on the numeric columns and compute the scaled values.
# NOTE(review): the scaler is fitted on every row of df_cleaned, so the learned
# min/max also reflect rows that are later exported as prediction sets --
# confirm this is intended.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df_cleaned[columns_to_normalize])

# Write the scaled values into a copy so df_cleaned itself stays untouched.
df_normalized = df_cleaned.copy()
df_normalized[columns_to_normalize] = scaled_values

df_normalized.head()

Unnamed: 0,Result_offer,Amount_DE,Amount_HD,Amount_ID,Amount_OTHER,Amount_PP,Amount_PS,Amount_SP,Total_Amount,SR_codigo,Bid_Notbid,Resolution_time,Client_type,Market_segment,Client_BU,Country,Client_importance,Coverage_model
0,Lost,0.001945,0.0,0.0,0.0,0.0,0.0,0.0,0.000718,SR-2468496,Bid,0.078611,Contractor,BDZ,Power Products,ES,MA3,Sales Rep
1,Won,0.002045,0.0,0.1,0.0,0.02803,0.020478,0.0,0.021526,SR-2468507,Bid,0.235466,Contractor,E34,INDUSTRIAL AUTOMATION,ES,MS3,Sales Rep
2,Lost,0.003382,0.0,0.0,0.0,0.0,5e-06,0.0,0.001252,SR-2468656,Bid,0.032176,Contractor,I9Z,Power Products,ES,MS3,Sales Rep
3,Lost,0.003382,0.0,0.0,0.0,0.0,5e-06,0.0,0.001252,SR-2468663,Bid,0.025229,Contractor,I9Z,Power Products,ES,MS3,Sales Rep
4,Lost,0.003382,0.0,0.0,0.0,0.0,5e-06,0.0,0.001252,SR-2468668,Bid,0.025229,Contractor,I9Z,Power Products,ES,MS3,Sales Rep


# III. Target variables

## Bid-Notbid

In [26]:
# Keep only the rows whose bid decision is already known ('Bid' or 'Not_bid').
decided_mask = df_normalized['Bid_Notbid'].isin(['Bid', 'Not_bid'])
df_normalized_bidnotbid = df_normalized.loc[decided_mask]

In [27]:
# Work on an explicit copy so the new column is added to an independent frame
# rather than to a slice of df_normalized.
df_normalized_bidnotbid = df_normalized_bidnotbid.copy()

# Binary target for the bid / not-bid model: 1 where Bid_Notbid == 'Bid', else 0.
# A vectorized comparison replaces the per-row Python lambda; the explicit
# int64 cast keeps the same integer dtype the lambda produced.
df_normalized_bidnotbid['Bid_Notbid_binary'] = (
    df_normalized_bidnotbid['Bid_Notbid'].eq('Bid').astype('int64')
)

# Verify the transformation by printing how many rows fall in each class.
print(df_normalized_bidnotbid['Bid_Notbid_binary'].value_counts())


Bid_Notbid_binary
1    19746
0     1504
Name: count, dtype: int64


## Won-Lost

In [28]:
# Keep only offers that were actually bid on and already have a final
# outcome ('Won' or 'Lost').
has_outcome = df_normalized['Result_offer'].isin(['Won', 'Lost'])
was_bid = df_normalized['Bid_Notbid'].isin(['Bid'])
df_normalized_wonlost = df_normalized.loc[has_outcome & was_bid]

In [29]:
# Work on an explicit copy so the new column is added to an independent frame
# rather than to a slice of df_normalized.
df_normalized_wonlost = df_normalized_wonlost.copy()

# Binary target for the won / lost model: 1 where Result_offer == 'Won', else 0.
# A vectorized comparison replaces the per-row Python lambda; the explicit
# int64 cast keeps the same integer dtype the lambda produced.
df_normalized_wonlost['Result_offer_binary'] = (
    df_normalized_wonlost['Result_offer'].eq('Won').astype('int64')
)

# Verify the transformation by printing how many rows fall in each class.
print(df_normalized_wonlost['Result_offer_binary'].value_counts())

Result_offer_binary
1    7853
0    5414
Name: count, dtype: int64


In [30]:
# Class balance of the won/lost target, as percentages.
# The frame only holds 'Won' and 'Lost' offers, so 1 = Won and 0 = Lost.
class_shares = df_normalized_wonlost['Result_offer_binary'].value_counts(normalize=True)
class_counts = class_shares.mul(100)

print("Class distribution (%):")
print(class_counts)


Class distribution (%):
Result_offer_binary
1    59.19198
0    40.80802
Name: proportion, dtype: float64


# IV. For predictions

In [31]:
# Rows whose bid decision is still open ('Not_decided_yet').
undecided_mask = df_normalized['Bid_Notbid'].isin(['Not_decided_yet'])
df_normalized_notdecided = df_normalized.loc[undecided_mask]

In [32]:
# Offers that were bid on but whose result is still 'Pipeline'.
in_pipeline = df_normalized['Result_offer'].isin(['Pipeline'])
was_bid = df_normalized['Bid_Notbid'].isin(['Bid'])
df_normalized_pipeline = df_normalized.loc[in_pipeline & was_bid]

# V. Exporting to CSV

Final dataset for ML training, Bid/Not-bid model:
- target variable: `Bid_Notbid_binary` (1 = 'Bid', 0 = 'Not_bid')
- keeps only rows where `Bid_Notbid` is 'Bid' or 'Not_bid' (excludes 'Not_decided_yet')

Final dataset for ML training, Won/Lost model:
- target variable: `Result_offer_binary` (1 = 'Won', 0 = 'Lost')
- filtered to rows where `Result_offer` is 'Won' or 'Lost' only (used to train the model)
- filtered to rows where `Bid_Notbid` is 'Bid'


Final datasets for prediction with ML:
- Not decided yet: rows where `Bid_Notbid` is 'Not_decided_yet'
- Pipeline: rows where `Bid_Notbid` is 'Bid' and `Result_offer` is 'Pipeline'

In [35]:
# Export every prepared frame to its own CSV file, without the index column.
export_targets = {
    # Training sets (these carry the binary target columns)
    '03_df_normalized_bidnotbid.csv': df_normalized_bidnotbid,
    '03_df_normalized_wonlost.csv': df_normalized_wonlost,
    # Prediction sets (rows still awaiting a decision or a result)
    '03_df_normalized_PRED_notdecided.csv': df_normalized_notdecided,
    '03_df_normalized_PRED_pipeline.csv': df_normalized_pipeline,
}

for csv_name, frame in export_targets.items():
    frame.to_csv(csv_name, index=False)
