In [240]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import sklearn.linear_model
import warnings
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import joblib
from sklearn.ensemble import GradientBoostingClassifier

In [241]:
df_leads = pd.read_csv('cluster_data.csv')
null_columns = list(df_leads.columns[df_leads.isna().all()])
df_leads = df_leads.drop(columns=null_columns)

  df_leads = pd.read_csv('cluster_data.csv')


In [242]:
df_leads.dropna(subset=["Cluster"],inplace=True)
df_leads = df_leads.replace("-", pd.NA)

##### Merging both the instances of "Installation Partner Allocation"

In [243]:
df_leads['Last Updated Stage'] = df_leads['Last Updated Stage'].replace("Installation partner Allocation", "Installation Partner Allocation")


In [244]:
loss_cases = ['Order Lost', 'Design Infeasible'] #Assign 0
success = ['Site Survey'] #Assign 1
process_phases = ['Lead Generation','Sales Partner Assignment','Communication'] #Don't Consider these phases


## Date Attributes

In [245]:
date_attributes = list(df_leads.filter(regex='creation date|deadline date', axis=1).columns)

## List of Phases

In [246]:
deadline_loss_phases = list(df_leads.filter(regex='cold_lead|design_infeasible|order_lost|deadline', axis=1).columns)
all_phases = [item for item in date_attributes if item not in deadline_loss_phases]
pre_survey_no_deadline = list(df_leads.filter(regex='lead_generation - deadline date|sales_partner_assignment - deadline date|communication - deadline date', axis=1).columns) #Phases that precede site survey
post_survey_no_deadline = [item for item in date_attributes if item not in pre_survey_no_deadline]

loss_reason = ['Cold Lead', 'Design Infeasible', 'Order Lost'] # All loss reasons
loss_reason_timestamp = ['cold_lead - creation date','design_infeasible - creation date','order_lost - creation date']

##### Mapping the Last updated stage names to the columns names

In [247]:
def convert_phase_name(phase_name):
    """Convert a phase name to its corresponding format in 'Preceding Phase' column."""
    converted_name = phase_name.replace(" ", "_").lower() + " - creation date"
    return converted_name

def create_mapping_dict(phase_list):
    """Create a mapping dictionary from a list of phase names."""
    mapping_dict = {phase: convert_phase_name(phase) for phase in phase_list}
    return mapping_dict

# Use the function to create the mapping dictionary 
mapping_dict = create_mapping_dict(list(df_leads['Last Updated Stage'].unique()))
print(mapping_dict)

{'Lead Generation': 'lead_generation - creation date', 'Communication': 'communication - creation date', 'Sales Partner Assignment': 'sales_partner_assignment - creation date', 'Cold Lead': 'cold_lead - creation date', 'Technical Design Assessment': 'technical_design_assessment - creation date', 'Proposal': 'proposal - creation date', 'Order Lost': 'order_lost - creation date', 'Design & Material Planning': 'design_&_material_planning - creation date', 'Installation Partner Allocation': 'installation_partner_allocation - creation date', 'New Project': 'new_project - creation date', 'Site Survey': 'site_survey - creation date', 'Material Dispatch': 'material_dispatch - creation date', 'DISCOM NOC Received': 'discom_noc_received - creation date', 'Final Commercial': 'final_commercial - creation date', 'Material Received at Site': 'material_received_at_site - creation date', 'Net Metering Application': 'net_metering_application - creation date', 'Customer KYC Verification': 'customer_kyc_

## Converting to DateTime

In [249]:
df_leads[date_attributes] = df_leads[date_attributes].applymap(lambda x: pd.to_datetime(x))
df_leads[['Last Updated On','Created On']] = df_leads[['Last Updated On','Created On']].applymap(lambda x: pd.to_datetime(x))

  df_leads[date_attributes] = df_leads[date_attributes].applymap(lambda x: pd.to_datetime(x))
  df_leads[['Last Updated On','Created On']] = df_leads[['Last Updated On','Created On']].applymap(lambda x: pd.to_datetime(x))


## Get phases in order

In [250]:
def move_to_index(lst, target, index):
    """Move target element in lst to the specified index."""
    if target in lst:
        lst.remove(target)
        lst.insert(index, target)
    return lst

move_to_index(post_survey_no_deadline, 'technical_assessment - creation date', 1)
move_to_index(all_phases, 'technical_assessment - creation date', 4)
move_to_index(all_phases, 'sales_partner_assignment - creation date', 1)



['lead_generation - creation date',
 'sales_partner_assignment - creation date',
 'communication - creation date',
 'site_survey - creation date',
 'technical_assessment - creation date',
 'proposal - creation date',
 'final_commercial - creation date',
 'customer_kyc - creation date',
 'kyc_verification - creation date',
 'net_metering_application - creation date',
 'design_and_material_planning - creation date',
 'discom_noc_received - creation date',
 'installation_partner_allocation - creation date',
 'material_dispatch - creation date',
 'material_received_at_site - creation date',
 'installation - creation date',
 'site_qc - creation date',
 'meter_installation - creation date',
 'logger_data_config - creation date',
 'site_commissioned - creation date']

## Function for preceding phase

In [251]:
# Create a dictionary mapping loss reasons to their timestamp columns
loss_reason_to_timestamp = dict(zip(loss_reason, loss_reason_timestamp))

def get_preceding_phase(row):
    if row['Last Updated Stage'] in loss_reason:
        loss_timestamp_col = loss_reason_to_timestamp[row['Last Updated Stage']]
        loss_timestamp = pd.Timestamp(row[loss_timestamp_col])
        
        # Filter out the phase columns only and drop the NaNs
        phase_timestamps = row[all_phases].dropna()   #Just change this to pre_survey_no_deadline 
        
        # Get the most recent phase timestamp just before the loss timestamp
        preceding_phase_timestamp = max([ts for ts in phase_timestamps if pd.Timestamp(ts) < loss_timestamp], default=None)
        
        if preceding_phase_timestamp:
            return phase_timestamps[phase_timestamps == preceding_phase_timestamp].index[0]
    return None

df_leads['Preceding Phase'] = df_leads.apply(get_preceding_phase, axis=1)



# Creating the Target Attribute

In [252]:
# Filter the dataframe for rows where 'Last Updated Stage' is 'Cold Lead'
# Get the 'Preceding Phase' column values for these rows
cold_lead_preceding_phases = df_leads[df_leads['Last Updated Stage'] == 'Cold Lead']['Preceding Phase']
dead_lead_preceding_phases = df_leads[df_leads['Last Updated Stage'].isin(['Order Lost', 'Design Infeasible'])]['Preceding Phase']

## Assign Dead Leads

In [253]:
# Filter the original dataframe using the conditions of dead_lead_preceding_phases
dead_lead_rows = df_leads[df_leads['Last Updated Stage'].isin(["Order Lost", "Design Infeasible"])]

# Get value counts of dead_lead_preceding_phases
dead_lead_counts_pre = dead_lead_preceding_phases.value_counts()

# Filter counts based on the mapped names from survey_onwards_phases
pre_dead_leads = dead_lead_counts_pre[dead_lead_counts_pre.index.isin(mapping_dict.values())]

# Filter the rows based on post_dead_leads criteria using the mapping_dict
relevant_dead_rows = dead_lead_rows[dead_lead_rows['Preceding Phase'].isin(pre_dead_leads.index)]

# Capture the indices of these rows
relevant_dead_indices = relevant_dead_rows.index


# Use dead_indices to set the 'Target' value to 'Dead' in the original dataframe
df_leads.loc[relevant_dead_indices, 'Target'] = 'Dead'


  df_leads.loc[relevant_dead_indices, 'Target'] = 'Dead'


## Assign Colds

In [254]:
# Assuming the mapping function and phase list are already defined:
mapping_dict = create_mapping_dict(['Lead Generation', 'Communication', 'Sales Partner Assignment'])

# Get value counts of cold_lead_preceding_phases
cold_lead_counts_pre = cold_lead_preceding_phases.value_counts()

# Filter counts based on the mapped names from survey_onwards_phases
pre_cold_leads = cold_lead_counts_pre[cold_lead_counts_pre.index.isin(mapping_dict.values())]

# Filter the original dataframe using the conditions of cold_lead_preceding_phases
cold_lead_rows = df_leads[df_leads['Last Updated Stage'] == "Cold Lead"]

# Filter the rows based on post_cold_leads criteria using the mapping_dict
relevant_cold_rows = cold_lead_rows[cold_lead_rows['Preceding Phase'].isin(pre_cold_leads.index)]

# Capture the indices of these rows
relevant_cold_indices = relevant_cold_rows.index


# Use cold_indices to set the 'Target' value to 'Cold' in the original dataframe
df_leads.loc[relevant_cold_indices, 'Target'] = 'Cold'

In [255]:
relevant_phases = ['Lead Generation',
 'Sales Partner Assignment',
 'Communication',
 'Site Survey',
 'Cold Lead',
 'Design Infeasible',
 'Order Lost']
phases = list(df_leads['Last Updated Stage'].unique())
post_survey_process = [phase for phase in phases if phase not in relevant_phases]
post_survey_process = post_survey_process + ["Site Survey"]

## Colds as Success

In [256]:
# Assuming the mapping function and phase list are already defined:
mapping_dict = create_mapping_dict(post_survey_process)

# Get value counts of cold_lead_preceding_phases
cold_lead_counts_post = cold_lead_preceding_phases.value_counts()

# Filter counts based on the mapped names from survey_onwards_phases
post_cold_leads = cold_lead_counts_post[cold_lead_counts_post.index.isin(mapping_dict.values())]

# print(post_cold_leads)
### Assign Colds
# Filter the original dataframe using the conditions of cold_lead_preceding_phases
cold_lead_rows = df_leads[df_leads['Last Updated Stage'] == "Cold Lead"]

# Filter the rows based on post_cold_leads criteria using the mapping_dict
relevant_cold_rows = cold_lead_rows[cold_lead_rows['Preceding Phase'].isin(post_cold_leads.index)]

# Capture the indices of these rows
relevant_cold_indices = relevant_cold_rows.index


# Use cold_indices to set the 'Target' value to 'Cold' in the original dataframe
df_leads.loc[relevant_cold_indices, 'Target'] = 'Success'

## Deads as Success

In [257]:

# Assuming you already have the mapping function and the phase list:
mapping_dict = create_mapping_dict(post_survey_process)

# Get value counts of dead_lead_preceding_phases
dead_lead_counts_post = dead_lead_preceding_phases.value_counts()

# Filter counts based on the mapped names from survey_onwards_phases
post_dead_leads = dead_lead_counts_post[dead_lead_counts_post.index.isin(mapping_dict.values())]

# print(post_dead_leads)
### Assign as Colds

# Filter the original dataframe using the conditions of dead_lead_preceding_phases
dead_lead_rows = df_leads[df_leads['Last Updated Stage'].isin(["Order Lost", "Design Infeasible"])]

# Get value counts of dead_lead_preceding_phases
dead_lead_counts_post = dead_lead_preceding_phases.value_counts()

# Filter counts based on the mapped names from survey_onwards_phases
post_dead_leads = dead_lead_counts_post[dead_lead_counts_post.index.isin(mapping_dict.values())]

# Filter the rows based on post_dead_leads criteria using the mapping_dict
relevant_dead_rows = dead_lead_rows[dead_lead_rows['Preceding Phase'].isin(post_dead_leads.index)]

# Capture the indices of these rows
relevant_dead_indices = relevant_dead_rows.index



# Use dead_indices to set the 'Target' value to 'Dead' in the original dataframe
df_leads.loc[relevant_dead_indices, 'Target'] = 'Success'




## Post Survey processes as Success

In [258]:
# Filter rows where 'Last Updated Stage' is in process_phases list
condition_in_process = df_leads['Last Updated Stage'].isin(post_survey_process)

# Update 'Target' column based on the condition
df_leads.loc[condition_in_process, 'Target'] = 'Success'

## In-Process Variable

In [259]:
# Filter rows where 'Last Updated Stage' is in process_phases list
condition_in_process = df_leads['Last Updated Stage'].isin(['Lead Generation', 'Sales Partner Assignment', 'Communication'])

# Update 'Target' column based on the condition
df_leads.loc[condition_in_process, 'Target'] = 'in-process'



In [260]:
df_leads["Target"].value_counts()

Target
Cold          12702
in-process     6959
Success        1280
Dead            915
Name: count, dtype: int64

## Drop the cold entries, Assign 1 to Success and 0 to others

In [261]:
# Drop rows where 'Target' is 'Cold'
df_leads = df_leads[df_leads["Target"] != 'Cold']

# Map 'Success' to 1 and all other values to 0
df_leads["Target"] = df_leads["Target"].map({'Success': 1, 'in-process': 0, 'Dead': 0})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_leads["Target"] = df_leads["Target"].map({'Success': 1, 'in-process': 0, 'Dead': 0})


In [262]:
df_leads["Target"].value_counts()

Target
0.0    7874
1.0    1280
Name: count, dtype: int64

## Get Lead to Comm time

In [263]:
# Assuming both columns are already in datetime format
df_leads['lead to comm time'] = (df_leads['communication - creation date'] - df_leads['lead_generation - creation date']).dt.days

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_leads['lead to comm time'] = (df_leads['communication - creation date'] - df_leads['lead_generation - creation date']).dt.days


## Making City Tiers

In [264]:
# Define a mapping dictionary for cities to be merged
city_merge_dict = {
    'Allahabad': 'Prayagraj/Allahabad',
    'Prayagraj': 'Prayagraj/Allahabad',
    'Bengaluru': 'Bengaluru/ Bangalore',
    'Bangalore': 'Bengaluru/ Bangalore',
    'Delhi': 'Delhi_NCR',
    'New Delhi': 'Delhi_NCR',
}

# Replace city names in the 'City' column based on the dictionary
df_leads['City'].replace(city_merge_dict, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_leads['City'].replace(city_merge_dict, inplace=True)


In [265]:
Tier1_cities =  ['Ahmedabad', 'Bengaluru', 'Bangalore', 'Chennai', 'Delhi', 'Hyderabad', 'Kolkata', 'Mumbai', 'Pune']
Tier2_cities =  ["Agra", "Ajmer", "Aligarh", "Amravati", "Amritsar", "Anand", "Asansol", "Aurangabad", "Bareilly", "Belagavi",
                  "Brahmapur", "Bhavnagar", "Bhiwandi", "Bhopal", "Bhubaneswar", "Bikaner", "Bilaspur", "Bokaro Steel City", "Burdwan", 
                  "Chandigarh", "Coimbatore", "Cuttack", "Dahod", "Dehradun", "Dombivli", "Dhanbad", "Bhilai", "Durgapur", "Erode", "Faridabad", "Ghaziabad", "Gorakhpur", "Guntur", "Gurgaon","Gurugram", "Guwahati", "Gwalior", "Hamirpur", "Hubballi–Dharwad", "Indore", "Jabalpur", "Jaipur", "Jalandhar", "Jalgaon", "Jammu", 
                 "Jamshedpur", "Jhansi", "Jodhpur", "Kalaburagi", "Kakinada", "Kannur", "Kanpur", "Karnal", "Kochi", "Kolhapur", "Kollam", "Kota", "Kozhikode", "Kumbakonam", "Kurnool", "Ludhiana", "Lucknow", "Madurai", "Malappuram", "Mathura", "Mangaluru", "Meerut", "Moradabad", "Mysuru", "Nagpur", "Nanded", "Nadiad", "Nashik", "Nellore", "Noida", "Patna", "Puducherry", "Purulia", "Prayagraj", "Raipur", "Rajkot", "Rajamahendravaram", "Ranchi", "Rourkela", "Ratlam", "Saharanpur", "Salem", "Sangli", "Shimla", "Siliguri", "Solapur", "Srinagar", "Surat", "Thanjavur", "Thiruvananthapuram", "Thrissur", "Tiruchirappalli", "Tirunelveli", "Tiruvannamalai", "Ujjain", "Vijayapura", "Vadodara", "Varanasi", "Vasai-Virar", "Vijayawada", "Visakhapatnam", "Vellore", "Warangal"]

In [266]:
from fuzzywuzzy import process


def assign_tier(city):
    # Convert the city to a string
    city = str(city)
    
    # Check match with Tier 1 cities
    if process.extractOne(city, Tier1_cities)[1] >= 85:
        return 3
    # Check match with Tier 2 cities
    elif process.extractOne(city, Tier2_cities)[1] >= 85:
        return 2
    # If not a match in either, assign Tier 3
    else:
        return 1

df_leads['City Tier'] = df_leads['City'].apply(assign_tier)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_leads['City Tier'] = df_leads['City'].apply(assign_tier)


## Source Mapping

In [267]:
source_merge = {
'inhouse website': ['gethomescaped', 'website', 'ampluswebsite', 'contactuspage', 'homescapecontactform', 'atriuminviteform'],
'offline': ['karkardoomakarkardooma', 'suncitysolarmelasuncitysolarmela', 'soalrisesafdargunjsoalrisesafdargunj', 'safdargunjsafdargunj', 'broadsolarhyd', 'broadsolardelhi', 'delhigolfclub', 'keralaexpo', 'acconevent', 'karnalfair', 'tataddl', 'malaysianembassy', 'chdexpo', 'patioclub', 'database'],
'astral': ['astral'],
'ivr': ['ivr', 'freshchat+ivr', 'ivrfranchisepartner'],
'freshchat': ['freshchat'],
'mail': ['mail', 'utmsource=gmaildisplay,utmmedium=gmail,utmcampaign=leadsgmail28052021', 'utmsource=email,utmmedium=sitevisit,utmcampaign=edm1'],
'fb or google': ['lookalikeleadhssep2022', 'lookalikecampaign', 'campaign', 'landingpagegoa', 'landingpagegoogle', 'landingpagefacebook', 'landingpage1', 'landingpage2','landingpage3', 'newlandingpage', 'landingpagefb'],
'fb': ['facebooknewyearoffer2021', 'fbhomescapebyamplusleadgeneration', 'fbad3homescapebyamplusleadgeneration', 'fbad1chdleadgenerationrealproductad', 'facebook', 'facebookmessenger', 'facebookleadgenform1', 'facebookremarketing', 'fbad6ncrleadgen', 'fbad6chdleadgen', 'fbad6leadgen'],
'app': ['app'],
'customerapp': ['customerapp'],
'partnerapp': ['partnerapp'],
'whatsapp': ['whatsapp'],
'franchise': ['franchise'],
'reference': ['reference'],
'': [''],
'referral': ['referral'],
'channelpartner': ['channelpartner'],
'salespartner': ['salespartner'],
'housing': ['housing', 'housing2'],
'bses': ['bses', 'bseslead'],
'acetech': ['acetech'],
'linkedin': ['linkedin'],
'mygate': ['mygatemay2022'],
'third party': ['btl', 'ziffy', 'smscold1'],
  'default': ['default'],
  'justdial': ['justdial'],
'source=solarpanels2': ['source=solarpanels2'],
  'google': ['googlebusiness', 'utmsource=bingsearch,utmmedium=bing,utmcampaign=searchdelhihary14062021'],
'displayresponsive': ['displayresponsive', 'utmsource=displayresponsive,utmmedium=exclusivedesign,utmcampaign='],
'magicbricks': ['magicbricks'],
'instagram': ['igad5whatami', 'ighomescapebyamplusleadgeneration', 'igad6leadgen', 'igad6chdleadgen', 'igad6ncrleadgen', 'igad1chdleadgenerationrealproductad', 'igad3homescapebyamplusleadgeneration', ],
'hubspot': ['hubspot'],
'gujratleadupload': ['gujratleadupload'],
}

In [268]:
# Convert all keys and values in the source_merge dictionary to lowercase and remove spaces
source_merge = {k.lower().replace(" ", ""): [x.lower().replace(" ", "") for x in v] for k, v in source_merge.items()}

# Convert the values in the 'Source' column to lowercase and remove spaces
df_leads['Source'] = df_leads['Source'].str.lower().str.replace(' ', '')

# Function to replace values based on the source_merge dictionary
def replace_with_mapping(value):
    for key, values_list in source_merge.items():
        if value in values_list:
            return key
    return value

# Apply the replacement function to the 'Source' column
df_leads['Source'] = df_leads['Source'].apply(replace_with_mapping)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_leads['Source'] = df_leads['Source'].str.lower().str.replace(' ', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_leads['Source'] = df_leads['Source'].apply(replace_with_mapping)


# Form Name 

In [269]:
pair_frequency = df_leads.groupby(['Form Name', 'Source']).size().reset_index(name='Count')
pair_frequency = pair_frequency.sort_values(by='Count', ascending=False)
unique_sources = pair_frequency['Source'].unique()
null_form_name_pairs = df_leads[df_leads['Form Name'].isnull()].groupby('Source').size().reset_index(name='Count')
unique_null_sources = null_form_name_pairs['Source'].unique()
common_sources = list(set(unique_null_sources) & set(unique_sources))
no_form_sources = list(set(unique_null_sources) - set(common_sources)) #Sources which don't require forms
all_sources = list(df_leads['Source'].unique())
on_form_sum = no_form_sources + common_sources
only_form_sources = list(set(all_sources) - set(on_form_sum))

In [270]:
# Identify the rows where 'Form Name' is null and 'Source' matches any value in no_form_sources
condition = (df_leads['Form Name'].isnull()) & (df_leads['Source'].isin(no_form_sources))

# Replace null values in 'Form Name' for those rows with 'Other'
df_leads.loc[condition, 'Form Name'] = 'Other'


## Cleaning Clusters

In [271]:
# Using a mask to replace the rest of india category
mask = (df_leads['Cluster'] == "Rest_of_India") & (df_leads['State.1'].notnull())
df_leads.loc[mask, 'Cluster'] = df_leads.loc[mask, 'State.1']

## Dealing with null values

In [272]:

df_leads = df_leads.dropna(subset=['Target'])

mode_value = df_leads['Source'].mode()[0]
mode_city = df_leads['City Tier'].mode()[0]
df_leads['Source'].fillna(mode_value, inplace=True)
df_leads['Form Name'].fillna("No_Form", inplace=True)
df_leads['City Tier'].fillna(mode_city, inplace=True)

In [273]:
df_leads['lead to comm time'] = df_leads['lead to comm time'].fillna(df_leads['lead to comm time'].mean())

## Selecting the dataframe

In [274]:
# Selecting specified columns
selected_columns = ['Source', 'Cluster', 'Lead Category', 'Form Name', 'City Tier', 'Target', 'lead to comm time']

# Creating the new DataFrame df_clean with the selected columns
df_clean = df_leads[selected_columns].copy()


# MODEL BUILDING

In [275]:
df_encoded = pd.get_dummies(df_clean, columns=['Source', 'Cluster', 'Lead Category', 'Form Name'])

## Train Test Split

In [276]:
features = df_encoded.copy()
features = features.drop(['Target'], axis=1)
target = df_encoded[['Target']]
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42, stratify = target)

## Validation Split and SMOTE

In [277]:
# Creating a validation set from the training data
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

# Apply SMOTE to the training data
smote = SMOTE(sampling_strategy='minority')
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [278]:
# Training the model with SMOTE-enhanced data
gb_model_smote = GradientBoostingClassifier(random_state=42)
gb_model_smote.fit(X_train_smote, y_train_smote)

# Save the model to a file
# joblib.dump(gb_model_smote, 'gb_model_smote.pkl')


  y = column_or_1d(y, warn=True)
