In [2]:
import pandas as pd 
import joblib

In [6]:
df = pd.read_csv('data/processed_leads.csv')

In [8]:
df.sample(3)

Unnamed: 0,email,phone_number,age,age_group,credit_score,family_background,income,intent,comments
1962,sedwards@example.org,+91 6544489921,63,56-65,771,Married,1345445.32,High,Looking for high-quality solutions for my busi...
2776,hernandezalan@example.net,+91 9995596369,32,26-35,652,Married with Children,733267.04,High,"Attended your webinar, convinced about the val..."
5707,egregory@example.com,+91 9683062989,52,46-55,765,Single Parent,1513162.55,High,"Impressed with your product features, let's di..."


In [10]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib

In [12]:
X = df.drop(columns=['email','phone_number','intent'])

In [14]:
y = df['intent']

In [18]:
# Define column types
categorical_cols = ['age_group', 'family_background']
numeric_cols = ['credit_score', 'income', 'age']

# Build Column Transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
    ]
)

# Apply transformer
X_transformed = preprocessor.fit_transform(X)

ohe = preprocessor.named_transformers_['cat']
ohe_feature_names = ohe.get_feature_names_out(categorical_cols)
final_feature_names = numeric_cols + list(ohe_feature_names)

# Save preprocessor pipeline
joblib.dump(preprocessor, "backend/model/preprocessor_pipeline.pkl")


['backend/model/preprocessor_pipeline.pkl']

In [20]:
X = pd.DataFrame(X_transformed, columns=final_feature_names)

In [25]:
X.columns

Index(['credit_score', 'income', 'age', 'age_group_18-25', 'age_group_26-35',
       'age_group_36-45', 'age_group_46-55', 'age_group_56-65',
       'age_group_65+', 'family_background_Divorced',
       'family_background_Married', 'family_background_Married with Children',
       'family_background_Single', 'family_background_Single Parent'],
      dtype='object')

In [27]:
['num__credit_score' 'num__income' 'num__age' 'cat__age_group_18-25'
 'cat__age_group_26-35' 'cat__age_group_36-45' 'cat__age_group_46-55'
 'cat__age_group_56-65' 'cat__age_group_65+'
 'cat__family_background_Divorced' 'cat__family_background_Married'
 'cat__family_background_Married with Children'
 'cat__family_background_Single' 'cat__family_background_Single Parent']

1

In [113]:
y= y.map({'High': 1, 'Low': 0})

In [115]:
y

0       0
1       0
2       0
3       0
4       1
       ..
9995    0
9996    0
9997    1
9998    1
9999    0
Name: intent, Length: 10000, dtype: int64

In [117]:
from sklearn.model_selection import train_test_split 


In [119]:

X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


In [121]:
X_train

Unnamed: 0,credit_score,income,age,age_group_18-25,age_group_26-35,age_group_36-45,age_group_46-55,age_group_56-65,age_group_65+,family_background_Divorced,family_background_Married,family_background_Married with Children,family_background_Single,family_background_Single Parent
9036,0.461766,-0.527946,-0.481957,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
7053,-0.532213,-0.565408,-0.620278,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5749,-1.156636,0.317126,-0.412797,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
9803,-0.787080,-1.309908,-1.588522,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7779,1.277339,1.524844,0.140485,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7244,1.277339,0.668117,1.177890,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
8045,-0.659647,-0.179626,-1.381041,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3577,-0.149914,-1.393214,-1.381041,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
6323,-0.200887,-0.665766,1.523692,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [205]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

param_grid = {
    'n_estimators': [100, 150, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 1.0]
}

grid = GridSearchCV(GradientBoostingClassifier(random_state=42),
                    param_grid, cv=3, scoring='f1')
grid.fit(X_train, y_train)
print("Best parameters:", grid.best_params_)


Best parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}


In [213]:
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier(n_estimators=150, learning_rate=0.01,subsample=0.8,max_depth=4, random_state=42)
model.fit(X_train, y_train)


In [225]:
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report, confusion_matrix

In [214]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.78      0.80       830
           1       0.85      0.88      0.86      1170

    accuracy                           0.84      2000
   macro avg       0.83      0.83      0.83      2000
weighted avg       0.84      0.84      0.84      2000

[[ 650  180]
 [ 145 1025]]


In [192]:
accuracy_score(y_test,y_pred)

0.8345

In [227]:
import joblib
joblib.dump(model, "model/intent_model.pkl")


['model/intent_model.pkl']

In [235]:
probs = model.predict_proba(X_test)[:, 1]  # Probability of class 1 (High Intent)
scores = (probs * 100).astype(int)
scores

array([99, 42, 99, ..., 74,  6, 99])