In [1]:
# --- Smart Lead AI: Model Training ---
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import joblib

# --- 1. Load the Processed Data ---
# Read the cleaned, model-ready lead table written out by Module 2.
df = pd.read_csv('../data/lead_data_processed.csv')

print("Loaded processed data:")
print(df.head())

# --- 2. Define Features (X) and Target (y) ---

# Features: every column other than 'converted' is used as a model input.
X = df.drop(columns='converted')

# Target: the 'converted' column is the label the model learns to predict.
y = df['converted']

# --- 3. Create Training and Testing Sets ---
# Hold out 20% of the rows for testing and train on the remaining 80%.
# A fixed random_state makes the shuffle, and hence the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"\nTraining set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")

Loaded processed data:
   pages_visited  time_on_site  email_opens  interaction_score  \
0             13         10.09            6                 90   
1              2         18.55            9                 17   
2             15         29.43            6                 84   
3             13         19.82            9                 96   
4             14          1.84            7                 59   

   profile_complete  previous_purchases  converted  lead_source_Facebook  \
0                52                   1          1                 False   
1                77                   1          1                 False   
2                56                   0          1                 False   
3                93                   1          1                 False   
4                82                   1          1                  True   

   lead_source_Google Ads  lead_source_Instagram  lead_source_LinkedIn  \
0                   False                   True 

In [2]:
# --- 4. Train the Model ---

# Build an unfitted logistic-regression classifier.
# random_state pins any solver randomness; max_iter caps the optimizer at 1000 iterations.
model = LogisticRegression(
    max_iter=1000,
    random_state=42,
)

# Fit the classifier on the training split only.
print("Training the model...")
model.fit(X_train, y_train)

print("Model training complete!")

# --- 5. Evaluate the Model ---

# Predict labels for the held-out test rows, which were not used for fitting.
y_pred = model.predict(X_test)

# Accuracy = fraction of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"\nModel Accuracy: {accuracy * 100:.2f}%")

Training the model...
Model training complete!

Model Accuracy: 91.00%


In [None]:
# --- 6. Save the Model ---
import os

# Single source of truth for where the artifact goes, so the folder that is
# created, the file that is written, and the path that is reported cannot drift apart.
MODEL_PATH = '../model/model.pkl'

# Create the destination folder if it doesn't exist (no error if it already does)
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

# Serialize the trained model to disk with joblib
joblib.dump(model, MODEL_PATH)

# Report the path that was actually written (relative to the working directory)
print(f"Model saved successfully to '{MODEL_PATH}'!")