In [20]:
import pandas as pd
import numpy as np
import joblib
import seaborn as sns
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [21]:
# Load the pre-processed train and test splits.
# Forward slashes are used as the separator: they resolve on Windows, Linux and
# macOS alike and keep backslash escape sequences out of the string literals.
train_df = pd.read_csv('DataSets/Processed_train_data.csv')
test_df = pd.read_csv('DataSets/Processed_test_data.csv')

# Name of the binary label column to predict
target = 'is_hot_lead'

# Separate the feature matrix from the label for each split
X_train = train_df.drop(columns=[target])
y_train = train_df[target]

X_test = test_df.drop(columns=[target])
y_test = test_df[target]


In [22]:
# Oversample the minority class of the training split with SMOTE.
# Only the training data is resampled; the test split is left untouched.
# NOTE(review): company_name and industry look like integer category codes in
# the feature previews; plain SMOTE interpolates them as continuous values.
# Consider SMOTENC for those columns — confirm how they were encoded.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Class counts before and after resampling, as {label: count} dictionaries
counts_before = y_train.value_counts().to_dict()
counts_after = y_train_smote.value_counts().to_dict()
print(f" SMOTE Applied: Before ({counts_before}), After ({counts_after})")

 SMOTE Applied: Before ({0: 16980, 1: 2985}), After ({0: 16980, 1: 16980})


In [23]:
# Standardize features: the scaler learns its statistics from the
# SMOTE-resampled training features only, then the same fitted scaler is
# applied to both the training and the test features.
scaler = StandardScaler().fit(X_train_smote)
X_train_scaled = scaler.transform(X_train_smote)
X_test_scaled = scaler.transform(X_test)

# Persist the fitted scaler so later data can be transformed identically
joblib.dump(scaler, 'PKl_files/scaler.pkl')

['PKl_files/scaler.pkl']

In [24]:
# Preview the first 10 rows of the (unscaled) test features
X_test.head(10)

Unnamed: 0,company_name,industry,funding_rounds,total_funding,job_postings_30d,employee_growth_pct,hiring_roles,industry_growth_rate,regional_employment_trend,funding_per_employee,days_since_last_funding,growth_momentum,funding_year,funding_month,funding_day,funding_weekday,funding_quarter
0,-1,2,3.265246,2425223.28,7.201083,5.569041,2,8.0,8.4,3566.504824,1280,16976562.96,2021,8,20,4,3
1,-1,3,1.912502,3988620.41,14.562457,46.348625,0,7.1,4.3,890.317056,1729,55840685.74,2020,5,28,3,2
2,-1,4,5.425373,2645234.46,14.79794,34.12194,1,16.0,14.8,669.67961,1449,37033282.44,2021,3,4,3,1
3,-1,2,0.987882,3194492.15,14.946632,4.969312,3,21.1,5.9,5414.393475,849,41528397.95,2022,10,25,1,4
4,-1,2,3.583654,884497.73,9.714369,-7.111839,2,8.1,-4.3,-1499.148695,1572,9729475.03,2020,11,1,6,4
5,-1,3,2.340446,6076529.42,12.14248,0.27895,3,6.7,0.9,46742.534,216,72918353.04,2024,7,19,4,3
6,14877,1,3.421293,2256189.13,12.677383,23.829309,2,15.1,-4.8,951.978536,422,27074269.56,2023,12,26,1,4
7,10897,0,2.047862,6243040.88,10.374001,29.645961,3,11.9,12.3,2190.54066,1570,68673449.68,2020,11,3,1,4
8,-1,2,4.473383,4776184.28,7.923417,-1.367018,3,7.0,-4.7,-79603.071333,1189,42985658.52,2021,11,19,4,4
9,-1,4,1.002375,10849688.67,7.450955,30.147126,1,11.5,-3.6,4033.341513,345,75947820.69,2024,3,12,1,1


In [25]:
# Preview the first 10 rows of the SMOTE-resampled (unscaled) training features
X_train_smote.head(10)

Unnamed: 0,company_name,industry,funding_rounds,total_funding,job_postings_30d,employee_growth_pct,hiring_roles,industry_growth_rate,regional_employment_trend,funding_per_employee,days_since_last_funding,growth_momentum,funding_year,funding_month,funding_day,funding_weekday,funding_quarter
0,9973,2,1.911533,3795597.04,10.995177,-8.355216,2,11.5,-0.2,-5930.620375,1317,45547164.48,2021,7,14,2,3
1,9938,1,3.221005,5792195.29,5.052385,20.602718,1,23.9,10.5,2106.252833,1734,34753171.74,2020,5,23,5,2
2,14014,4,3.975966,191991.0,5.008945,0.456013,2,20.5,-1.5,1371.364286,1750,959955.0,2020,5,7,3,2
3,12831,0,2.842382,890042.57,9.882709,1.202142,2,23.2,13.9,4238.297952,681,8010383.13,2023,4,11,1,2
4,3203,2,2.264359,3369725.77,11.461475,22.633126,1,9.0,8.6,1567.314312,330,40436709.24,2024,3,27,2,1
5,14637,1,4.875184,2881335.43,6.869437,32.667976,1,21.0,1.0,752.306901,1252,20169348.01,2021,9,17,4,3
6,9405,0,0.900992,7941005.0,7.892017,22.698968,1,14.5,-3.6,3214.981781,1085,63528040.0,2022,3,3,3,1
7,3444,0,1.170316,1716573.56,11.931091,8.556498,0,29.4,-1.2,1928.734337,1437,24032029.84,2021,3,16,1,1
8,6539,3,3.337948,178278.14,7.100843,30.058273,0,25.2,10.4,52.745012,173,1426225.12,2024,8,31,5,3
9,10332,4,3.864427,2266813.43,9.548882,-2.022682,0,14.4,9.2,-25186.815889,75,27201761.16,2024,12,7,5,4


# Model Training and Evaluation

In [26]:
# Candidate classifiers, keyed by the display name used in the results table.
# Every model is seeded with random_state=42 so reruns are comparable.
# `use_label_encoder` is intentionally not passed to XGBClassifier: the installed
# XGBoost reports it as an unused parameter and emits a warning on every fit.
models = {
    "Logistic Regression": LogisticRegression(class_weight='balanced', random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=200, class_weight='balanced', random_state=42),
    "XGBoost": XGBClassifier(n_estimators=200, learning_rate=0.05, eval_metric='logloss', random_state=42),
    "Decision Tree": DecisionTreeClassifier(class_weight='balanced', random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, random_state=42)
}

# List of per-model metric records; turned into a DataFrame once, after the loop
results = []

# Fit each model on the resampled + scaled training data, score it on the
# untouched (scaled) test split, and persist the fitted estimator.
for name, model in models.items():
    print(f"\n🔹 Training {name}...")

    # Train on the SMOTE-balanced, standardized training data
    model.fit(X_train_scaled, y_train_smote)

    # Predict labels for the test split
    y_pred = model.predict(X_test_scaled)

    # Test-set metrics (precision/recall/F1 use scikit-learn's default binary
    # averaging with positive label 1)
    acc = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)

    results.append({
        "Model": name,
        "Accuracy": acc,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "Confusion Matrix": conf_matrix
    })

    # Save the fitted model; spaces in the display name become underscores.
    # Assumes the PKl_files/ directory already exists.
    joblib.dump(model, f"PKl_files/{name.replace(' ', '_')}.pkl")

# One row per model, columns in the order of the record keys above
results_df = pd.DataFrame(results)


🔹 Training Logistic Regression...

🔹 Training Random Forest...

🔹 Training XGBoost...


Parameters: { "use_label_encoder" } are not used.




🔹 Training Decision Tree...

🔹 Training Gradient Boosting...


In [27]:
# Display the test-set metrics collected for each baseline model
results_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,Confusion Matrix
0,Logistic Regression,0.879607,0.599179,0.587131,0.593094,"[[3953, 293], [308, 438]]"
1,Random Forest,0.950521,0.798802,0.894102,0.84377,"[[4078, 168], [79, 667]]"
2,XGBoost,0.959135,0.820331,0.930295,0.871859,"[[4094, 152], [52, 694]]"
3,Decision Tree,0.92528,0.746367,0.757373,0.75183,"[[4054, 192], [181, 565]]"
4,Gradient Boosting,0.946314,0.766147,0.922252,0.836983,"[[4036, 210], [58, 688]]"


# Hyperparameter Tuning

Training the model using randomized search

In [28]:
# Search space sampled by the randomized search (each entry is a discrete grid)
param_distributions = {
    'n_estimators': np.arange(100, 500, 50),
    'learning_rate': np.linspace(0.01, 0.2, 10),
    'max_depth': np.arange(3, 10, 2),
    'subsample': np.linspace(0.6, 1.0, 5),
    'colsample_bytree': np.linspace(0.6, 1.0, 5),
    'gamma': np.linspace(0, 1, 5),
    'reg_lambda': np.linspace(0, 2, 5),
    'reg_alpha': np.linspace(0, 2, 5),
}

# Base estimator cloned for every candidate.
# `use_label_encoder` is not passed: the installed XGBoost reports it as an
# unused parameter and warns on each fit.
xgb_model = xgb.XGBClassifier(eval_metric='logloss', random_state=42)

# Randomized search: 50 sampled configurations x 5-fold CV, ranked by F1.
# NOTE(review): the folds are cut from data that was already SMOTE-resampled,
# so synthetic points derived from a validation fold's neighbours can sit in
# the training folds and the CV scores are likely optimistic. Resampling inside
# each fold (e.g. an imblearn Pipeline) would avoid this — confirm and revisit.
# NOTE(review): a previous run reported 1 failed fit out of 250; pass
# error_score='raise' to surface the underlying error when debugging.
random_search = RandomizedSearchCV(
    xgb_model,
    param_distributions,
    n_iter=50,  # Number of random combinations to try
    scoring='f1',  # Optimizing for F1-score
    cv=5,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

# Run the search on the resampled, standardized training data
random_search.fit(X_train_scaled, y_train_smote)

# Best configuration found by the search
best_params = random_search.best_params_
print(f"✅ Best Hyperparameters for XGBoost: {best_params}")

# With the default refit=True the search has already retrained the winning
# configuration on the full training data, so reuse that estimator instead of
# fitting an identical model a second time.
best_xgb = random_search.best_estimator_

# Persist the tuned model
joblib.dump(best_xgb, "PKl_files/best_xgboost_model.pkl")


Fitting 5 folds for each of 50 candidates, totalling 250 fits


1 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "d:\DataScience\GUVI\DataScience_GUVI_Projects\Customer_Conversion_Analysis_MLOps\customer_Convesion_Analysis_VENV\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\DataScience\GUVI\DataScience_GUVI_Projects\Customer_Conversion_Analysis_MLOps\customer_Convesion_Analysis_VENV\lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
  File "d:\DataScience\GUVI\DataScience_GUVI_Projects\Customer_Conversion_Analysis_MLOps\customer_Convesion_Analysis_VENV\l

✅ Best Hyperparameters for XGBoost: {'subsample': 0.7, 'reg_lambda': 2.0, 'reg_alpha': 0.5, 'n_estimators': 400, 'max_depth': 9, 'learning_rate': 0.1788888888888889, 'gamma': 0.0, 'colsample_bytree': 0.7}


Parameters: { "use_label_encoder" } are not used.



['PKl_files/best_xgboost_model.pkl']

Testing the model tuned by randomized search

In [29]:
# Reload the tuned model from disk, so the persisted artifact itself is what
# gets evaluated, and predict on the scaled test split.
best_xgb = joblib.load("PKl_files/best_xgboost_model.pkl")
y_test_pred = best_xgb.predict(X_test_scaled)

# Scalar test-set metrics, computed in one pass over the metric functions
xgb_best_acc, xgb_best_precision, xgb_best_recall, xgb_best_f1 = (
    metric(y_test, y_test_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
xgb_best_conf_matrix = confusion_matrix(y_test, y_test_pred)

# Single metrics record for the tuned model, using the same keys as the
# baseline comparison table
best_model_results = {
    "Model": "XGBoost (Tuned)",
    "Accuracy": xgb_best_acc,
    "Precision": xgb_best_precision,
    "Recall": xgb_best_recall,
    "F1 Score": xgb_best_f1,
    "Confusion Matrix": xgb_best_conf_matrix
}

# Kept separate from results_df: a one-row table holding only the tuned model
Hypertuned_results = [best_model_results]
Hypetuned_df = pd.DataFrame(Hypertuned_results)

In [30]:
# Display the test-set metrics of the tuned XGBoost model
Hypetuned_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,Confusion Matrix
0,XGBoost (Tuned),0.969351,0.863804,0.9437,0.901986,"[[4135, 111], [42, 704]]"


# Predicting 'is_hot_lead' for Holdout Data

In [31]:
# Load the pre-processed holdout set to be scored.
# A forward slash is used as the separator so the path resolves on Windows,
# Linux and macOS alike.
df_holdout = pd.read_csv('DataSets/Processed_holdout_data.csv')

In [32]:
# Preview the first 10 rows of the holdout data (still includes company_id)
df_holdout.head(10)

Unnamed: 0,company_id,company_name,industry,funding_rounds,total_funding,job_postings_30d,employee_growth_pct,hiring_roles,industry_growth_rate,regional_employment_trend,funding_per_employee,days_since_last_funding,growth_momentum,funding_year,funding_month,funding_day,funding_weekday,funding_quarter
0,COMP_000000,-1,3,7,4189415.46,9,-2.0,3,17.3,12.9,-41894.1546,1793,37704739.14,2020,3,25,2,1
1,COMP_000001,14332,3,1,3964281.97,5,20.1,1,8.8,-2.8,1878.806621,523,19821409.85,2023,9,16,5,3
2,COMP_000002,-1,0,2,7548375.72,9,7.0,2,15.9,-2.8,9435.46965,259,67935381.48,2024,6,6,3,2
3,COMP_000003,3412,0,3,8418569.83,4,39.7,3,17.5,2.3,2068.444676,1155,33674279.32,2021,12,23,3,4
4,COMP_000004,5732,1,2,2449746.19,13,34.2,0,27.0,4.4,695.950622,1769,31846700.47,2020,4,18,5,2
5,COMP_000005,-1,4,4,3463349.02,8,27.4,0,24.1,5.5,1219.489092,678,27706792.16,2023,4,14,4,2
6,COMP_000006,-1,4,2,2820737.83,15,38.1,1,26.2,8.1,721.416325,586,42311067.45,2023,7,15,5,3
7,COMP_000007,-1,2,3,1988942.7,13,46.9,0,22.1,6.4,415.228121,528,25856255.1,2023,9,11,0,3
8,COMP_000008,11329,4,4,6413583.09,10,1.5,2,25.3,-3.4,25654.33236,867,64135830.9,2022,10,7,4,4
9,COMP_000009,-1,1,2,8342077.23,8,33.6,1,29.5,-0.3,2411.00498,941,66736617.84,2022,7,25,0,3


In [33]:
# Summary statistics for the numeric holdout columns
df_holdout.describe()

Unnamed: 0,company_name,industry,funding_rounds,total_funding,job_postings_30d,employee_growth_pct,hiring_roles,industry_growth_rate,regional_employment_trend,funding_per_employee,days_since_last_funding,growth_momentum,funding_year,funding_month,funding_day,funding_weekday,funding_quarter
count,498.0,498.0,498.0,498.0,498.0,498.0,498.0,498.0,498.0,498.0,498.0,498.0,498.0,498.0,498.0,498.0,498.0
mean,1960.791165,2.058233,3.110442,4118849.0,9.726908,20.67751,1.578313,17.439157,4.848594,971.644017,908.676707,41211330.0,2022.14257,6.620482,15.985944,2.861446,2.518072
std,4176.186818,1.425064,1.409172,2927568.0,3.152993,17.291727,1.167506,7.213606,5.784153,42600.759002,511.02099,34740480.0,1.396228,3.499719,8.666281,1.997705,1.138619
min,-1.0,0.0,1.0,107354.5,1.0,-9.6,0.0,5.0,-5.0,-416600.634,2.0,377714.0,2020.0,1.0,1.0,0.0,1.0
25%,-1.0,1.0,2.0,1740622.0,8.0,5.8,0.0,11.425,-0.5,357.761146,485.25,14565930.0,2021.0,4.0,9.0,1.0,2.0
50%,-1.0,2.0,3.0,3595327.0,10.0,21.45,2.0,17.7,4.9,1140.484994,872.5,31381210.0,2022.0,6.0,16.0,3.0,2.0
75%,-1.0,3.0,4.0,5979098.0,12.0,35.875,3.0,23.875,9.6,2718.000973,1348.0,61164540.0,2023.0,10.0,23.0,5.0,4.0
max,16211.0,4.0,8.0,15216800.0,20.0,50.0,3.0,29.9,15.0,564662.3375,1823.0,197818400.0,2025.0,12.0,31.0,6.0,4.0


In [34]:
# Count missing values per holdout column
df_holdout.isna().sum()

company_id                   0
company_name                 0
industry                     0
funding_rounds               0
total_funding                0
job_postings_30d             0
employee_growth_pct          0
hiring_roles                 0
industry_growth_rate         0
regional_employment_trend    0
funding_per_employee         0
days_since_last_funding      0
growth_momentum              0
funding_year                 0
funding_month                0
funding_day                  0
funding_weekday              0
funding_quarter              0
dtype: int64

In [35]:
# Count fully duplicated holdout rows
df_holdout.duplicated().sum()

0

In [None]:
# company_id is an identifier, not a model feature: remove it to obtain the
# feature matrix. df_holdout itself is left intact because the ids are needed
# again when the predictions are written out.
X_data = df_holdout.drop(columns='company_id')

In [37]:
# Preview the first 10 rows of the holdout features after dropping company_id
X_data.head(10)

Unnamed: 0,company_name,industry,funding_rounds,total_funding,job_postings_30d,employee_growth_pct,hiring_roles,industry_growth_rate,regional_employment_trend,funding_per_employee,days_since_last_funding,growth_momentum,funding_year,funding_month,funding_day,funding_weekday,funding_quarter
0,-1,3,7,4189415.46,9,-2.0,3,17.3,12.9,-41894.1546,1793,37704739.14,2020,3,25,2,1
1,14332,3,1,3964281.97,5,20.1,1,8.8,-2.8,1878.806621,523,19821409.85,2023,9,16,5,3
2,-1,0,2,7548375.72,9,7.0,2,15.9,-2.8,9435.46965,259,67935381.48,2024,6,6,3,2
3,3412,0,3,8418569.83,4,39.7,3,17.5,2.3,2068.444676,1155,33674279.32,2021,12,23,3,4
4,5732,1,2,2449746.19,13,34.2,0,27.0,4.4,695.950622,1769,31846700.47,2020,4,18,5,2
5,-1,4,4,3463349.02,8,27.4,0,24.1,5.5,1219.489092,678,27706792.16,2023,4,14,4,2
6,-1,4,2,2820737.83,15,38.1,1,26.2,8.1,721.416325,586,42311067.45,2023,7,15,5,3
7,-1,2,3,1988942.7,13,46.9,0,22.1,6.4,415.228121,528,25856255.1,2023,9,11,0,3
8,11329,4,4,6413583.09,10,1.5,2,25.3,-3.4,25654.33236,867,64135830.9,2022,10,7,4,4
9,-1,1,2,8342077.23,8,33.6,1,29.5,-0.3,2411.00498,941,66736617.84,2022,7,25,0,3


In [39]:
# Summary statistics of the holdout features after dropping company_id
X_data.describe()

Unnamed: 0,company_name,industry,funding_rounds,total_funding,job_postings_30d,employee_growth_pct,hiring_roles,industry_growth_rate,regional_employment_trend,funding_per_employee,days_since_last_funding,growth_momentum,funding_year,funding_month,funding_day,funding_weekday,funding_quarter
count,498.0,498.0,498.0,498.0,498.0,498.0,498.0,498.0,498.0,498.0,498.0,498.0,498.0,498.0,498.0,498.0,498.0
mean,1960.791165,2.058233,3.110442,4118849.0,9.726908,20.67751,1.578313,17.439157,4.848594,971.644017,908.676707,41211330.0,2022.14257,6.620482,15.985944,2.861446,2.518072
std,4176.186818,1.425064,1.409172,2927568.0,3.152993,17.291727,1.167506,7.213606,5.784153,42600.759002,511.02099,34740480.0,1.396228,3.499719,8.666281,1.997705,1.138619
min,-1.0,0.0,1.0,107354.5,1.0,-9.6,0.0,5.0,-5.0,-416600.634,2.0,377714.0,2020.0,1.0,1.0,0.0,1.0
25%,-1.0,1.0,2.0,1740622.0,8.0,5.8,0.0,11.425,-0.5,357.761146,485.25,14565930.0,2021.0,4.0,9.0,1.0,2.0
50%,-1.0,2.0,3.0,3595327.0,10.0,21.45,2.0,17.7,4.9,1140.484994,872.5,31381210.0,2022.0,6.0,16.0,3.0,2.0
75%,-1.0,3.0,4.0,5979098.0,12.0,35.875,3.0,23.875,9.6,2718.000973,1348.0,61164540.0,2023.0,10.0,23.0,5.0,4.0
max,16211.0,4.0,8.0,15216800.0,20.0,50.0,3.0,29.9,15.0,564662.3375,1823.0,197818400.0,2025.0,12.0,31.0,6.0,4.0


In [None]:
# Standardize the holdout features with the scaler fitted earlier on the
# training data (transform only; the scaler is not refit here).
# NOTE(review): the holdout preview shows integer funding_rounds and
# job_postings_30d, while the train/test previews show fractional values for
# those columns — confirm the holdout went through the same preprocessing.
X_data_scaled = scaler.transform(X_data)

Predicting 'is_hot_lead' for the processed holdout data

In [40]:
# Predict the is_hot_lead label for every holdout row with the tuned model
predictions = best_xgb.predict(X_data_scaled)

# Pair each prediction with its company_id (taken from the unmodified holdout
# frame; rows are in the same order as the scaled feature matrix)
output = pd.DataFrame({"company_id": df_holdout['company_id'], "Is Hot Lead": predictions})

# Write the submission next to the input data. The directory name uses the same
# casing ('DataSets') as the paths the data was read from, so the write also
# succeeds on case-sensitive filesystems.
output.to_csv("DataSets/submission.csv", index=False)