In [1]:
from google.colab import files

# Upload the single joined CSV you exported from Snowflake
uploaded = files.upload()


Saving 2025-06-16 11_46am.csv to 2025-06-16 11_46am.csv


In [2]:
import pandas as pd

df = pd.read_csv('2025-06-16 11_46am.csv')  # Change the filename if needed

print(df.head())         # Preview data
print(df.columns.tolist())  # Check column names


   LEAD_ID FIRST_NAME LAST_NAME                    EMAIL  \
0        1      Betty      Ross     xbrooks@anderson.net   
1       37    Darlene   Gilmore   nicolerivera@baker.org   
2       39     Amanda  Anderson           lkey@gmail.com   
3       76       Gina      Kirk   walterford@hotmail.com   
4       84      Tyler   Burnett  rebeccamoore@reeves.com   

                                               TITLE CREATED_DATE  \
0                        Civil engineer, contracting   2025-06-02   
1                              Clinical embryologist   2025-02-28   
2                                       Metallurgist   2025-02-28   
3                        Community education officer   2025-05-26   
4  Clinical scientist, histocompatibility and imm...   2025-04-29   

   LEAD_AGE_DAYS                  COMPANY_NAME       INDUSTRY  EMPLOYEE_COUNT  \
0             14                    Potter Inc         Retail            3863   
1            108  Taylor, Newman and Whitehead        Finance 

In [4]:
print(df.columns.tolist())

['LEAD_ID', 'FIRST_NAME', 'LAST_NAME', 'EMAIL', 'TITLE', 'CREATED_DATE', 'LEAD_AGE_DAYS', 'COMPANY_NAME', 'INDUSTRY', 'EMPLOYEE_COUNT', 'EMPLOYEE_COUNT_BUCKET', 'ANNUAL_REVENUE', 'ANNUAL_REVENUE_BUCKET', 'TECH_STACK', 'HQ_LOCATION', 'TOTAL_ENGAGEMENTS', 'UNIQUE_ENGAGEMENT_TYPES', 'LAST_ENGAGEMENT_DATE']


In [5]:
df.columns = df.columns.str.strip().str.lower().str.replace('-', '_')


In [6]:
df['total_engagements'] = df['total_engagements'].fillna(0)
df['target'] = (df['total_engagements'] > 3).astype(int)

print(df[['lead_id', 'total_engagements', 'target']].head())


   lead_id  total_engagements  target
0        1                  1       0
1       37                  2       0
2       39                  4       1
3       76                  3       0
4       84                  3       0


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Select features and target
features = ['industry', 'employee_count', 'annual_revenue', 'tech_stack', 'title', 'total_engagements']
X = df[features]
y = df['target']

# One-hot encode categorical features
X_encoded = pd.get_dummies(X, columns=['industry', 'tech_stack', 'title'], drop_first=True)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)


In [8]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00       135
           1       1.00      1.00      1.00        65

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200



In [9]:
# Predict probabilities for all rows
df['score'] = model.predict_proba(X_encoded)[:, 1]  # Probability of class 1

# Preview
df[['lead_id', 'score', 'target']].head()


Unnamed: 0,lead_id,score,target
0,1,0.0,0
1,37,0.03,0
2,39,0.98,1
3,76,0.0,0
4,84,0.0,0


In [10]:
# Save as CSV
df.to_csv('scored_leads.csv', index=False)

# Download from Colab
from google.colab import files
files.download('scored_leads.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>