In [None]:
!pip install pandas scikit-learn matplotlib seaborn joblib

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import os

sns.set(style="whitegrid")
# Ensure plotly is installed for consistency with other notebooks
try:
    import google.colab
    !pip install plotly
except ImportError:
    print('Running in Jupyter. Ensure plotly is installed with `pip install plotly`')

### Step 1: Generate Synthetic Leads Data (if not already present)

In [None]:
import pandas as pd
import numpy as np
import os

# The directory creation, the existence check and the write must all point at
# the same location. Paths are relative to the notebook's working directory.
data_dir = 'data'
leads_csv = f'{data_dir}/leads_data.csv'
os.makedirs(data_dir, exist_ok=True)

# Generate synthetic leads data only when the file is missing, so that an
# existing dataset is reused instead of being rewritten on every run.
if not os.path.exists(leads_csv):
    np.random.seed(42)  # fixed seed: the same synthetic dataset is produced each time
    n_samples = 1000
    data = {
        'lead_id': range(1, n_samples + 1),
        'source': np.random.choice(['Advertisement', 'Social Media', 'Website', 'Referral'], n_samples),
        'industry': np.random.choice(['Finance', 'Healthcare', 'Tech', 'Retail'], n_samples),
        'engagement_score': np.random.randint(0, 100, n_samples),
        'time_spent': np.random.uniform(0, 50, n_samples),
        # NOTE: the label is drawn independently of every other column
        # (roughly 20% positives), so it carries no learnable signal.
        'converted': np.random.choice([0, 1], n_samples, p=[0.8, 0.2])
    }
    df_leads = pd.DataFrame(data)
    df_leads.to_csv(leads_csv, index=False)
    print('✅ Generated and saved leads_data.csv to notebooks/data/')
else:
    print('✅ leads_data.csv already exists in notebooks/data/')


### Step 2: Load Leads Data

In [None]:
# Read the leads CSV from the data/ folder (relative to the working directory)
leads_csv_path = 'data/leads_data.csv'
df = pd.read_csv(leads_csv_path)
print(f'✅ Dataset Loaded. Total Records: {df.shape[0]}')

# Show the first rows as the cell output
df.head()


### Step 3: Data Preprocessing

In [None]:
# One-hot encode the categorical columns; drop_first=True drops the first
# level of each column so the dummies are not perfectly collinear.
df_encoded = pd.get_dummies(df, columns=['source', 'industry'], drop_first=True)

# Features are every column except the row identifier and the label
X = df_encoded.drop(columns=['lead_id', 'converted'])
y = df_encoded['converted']

# Hold out 20% of the rows for evaluation. The split is stratified on the
# label so both partitions keep the same converted / not-converted ratio,
# which matters because the positive class is the minority.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print('✅ Data preprocessed and split into train/test sets')
print(f'Features used: {X.columns.tolist()}')


### Step 4: Train Random Forest Model

In [None]:
# Train Random Forest model (fixed random_state for reproducible results)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions on the held-out rows
y_pred = model.predict(X_test)

# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print(f'✅ Model Accuracy: {accuracy:.2f}')
print('\nClassification Report:')
# zero_division=0: a class that is never predicted gets precision 0 instead
# of a misleading perfect score of 1.
print(classification_report(y_test, y_pred, zero_division=0))

# Confusion Matrix
# Pin rows/columns to the classes the model was fitted on, so the matrix has
# a fixed shape and labelled axes even if a class is absent from y_test/y_pred.
class_labels = list(model.classes_)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()


### Step 5: Feature Importance

In [None]:
# Pair each feature column with its importance score from the fitted forest,
# then order the table from the highest score to the lowest.
feature_names = X.columns
importances = model.feature_importances_
feature_imp_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances}
)
feature_imp_df = feature_imp_df.sort_values(by='Importance', ascending=False)

# One horizontal bar per feature, in the sorted order
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_imp_df, ax=ax)
ax.set_title('Feature Importance')
plt.show()


### Step 6: Save Model

In [None]:
# Resolve <parent of the working directory>/marketsense_ai/backend/models
# and create it if needed.
# NOTE(review): this presumably expects the notebook to run from a folder
# directly under the project root — confirm before running elsewhere.
root_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
model_subdirs = ('marketsense_ai', 'backend', 'models')
models_dir = os.path.join(root_dir, *model_subdirs)
os.makedirs(models_dir, exist_ok=True)

# Serialize the fitted model with joblib
model_path = os.path.join(models_dir, 'lead_scoring_model.joblib')
joblib.dump(model, model_path)

print(f'✅ Model saved to {model_path}')

### Step 7: Save Data with Predictions

In [None]:
# Score every lead with the trained model and store the result in a NEW
# frame. Step 3 builds its feature matrix from `df`, so adding the prediction
# column to `df` in place would turn the predictions into a model input if
# that step were re-run; leaving `df` untouched keeps every cell re-runnable.
# NOTE: X also contains the rows the model was trained on, so these
# predictions are not an out-of-sample estimate for those rows.
df_with_predictions = df.assign(predicted_conversion=model.predict(X))
df_with_predictions.to_csv('data/leads_with_predictions.csv', index=False)
print('✅ Data with predictions saved to notebooks/data/leads_with_predictions.csv')
