In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score


In [24]:
df=pd.read_csv("Clean_data.csv")

In [25]:
df.columns

Index(['preferred_study_level', 'phd_supervisor', 'supervisor_approval',
       'intended_study_area', 'preferred_dest_1', 'preferred_dest_2',
       'preferred_dest_3', 'study_plan', 'uni_contacted',
       'application_process', 'current_status', 'work_experience',
       'birth_year', 'passport', 'academic_backlogs', 'number_of_backlogs',
       'budget', 'funding_source', 'parents_occupation', 'english_test_taken',
       'english_test_name', 'english_test_score', 'family_abroad',
       'other_consultants', 'lead_converted_flag'],
      dtype='object')

In [26]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Assuming 'df' is your DataFrame

# Step 1: Preprocessing

# Handle missing values (example: filling with mode for categorical and median for numerical)
for column in df.columns:
    if df[column].dtype == 'object':
        df[column] = df[column].fillna(df[column].mode()[0])  # Fill categorical columns with mode
    else:
        df[column] = df[column].fillna(df[column].median())   # Fill numerical columns with median

# Encoding categorical variables using Label Encoding
label_encoders = {}
for column in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

# Step 2: Splitting the Data

# Define input features (X) and target variable (y)
X = df.drop('lead_converted_flag', axis=1)
y = df['lead_converted_flag']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Model Training

# Initialize RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_classifier.fit(X_train, y_train)

# Step 4: Prediction and Evaluation

# Predict on the test set
y_pred = rf_classifier.predict(X_test)

# Evaluate the model
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nAccuracy Score:", accuracy_score(y_test, y_pred))


Confusion Matrix:
 [[377   1]
 [ 15   0]]

Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       378
           1       0.00      0.00      0.00        15

    accuracy                           0.96       393
   macro avg       0.48      0.50      0.49       393
weighted avg       0.93      0.96      0.94       393


Accuracy Score: 0.9592875318066157


In [27]:
X

Unnamed: 0,preferred_study_level,phd_supervisor,supervisor_approval,intended_study_area,preferred_dest_1,preferred_dest_2,preferred_dest_3,study_plan,uni_contacted,application_process,...,academic_backlogs,number_of_backlogs,budget,funding_source,parents_occupation,english_test_taken,english_test_name,english_test_score,family_abroad,other_consultants
0,3,0,0,179,3,3,3,1,0,0,...,0,0,0,4,2,1,2,5.5,1,39
1,3,0,0,179,3,3,3,1,0,0,...,0,0,0,4,2,1,2,5.5,1,39
2,0,1,1,179,4,5,5,1,1,0,...,0,0,1,0,2,1,2,8.0,0,50
3,3,0,0,81,0,1,3,9,0,0,...,0,0,2,3,2,0,3,0.0,1,13
4,4,0,0,179,3,2,1,1,0,0,...,0,0,0,0,2,1,2,6.5,1,39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1957,2,0,0,37,4,5,5,8,0,0,...,0,0,4,4,1,0,3,0.0,0,39
1958,3,0,0,52,4,6,0,2,0,0,...,0,0,4,4,0,0,3,0.0,0,39
1959,2,0,0,37,4,0,1,1,0,0,...,0,0,4,4,2,1,2,7.0,1,39
1960,2,0,0,37,1,1,1,1,0,0,...,0,0,2,5,2,0,3,0.0,1,39


In [28]:
df

Unnamed: 0,preferred_study_level,phd_supervisor,supervisor_approval,intended_study_area,preferred_dest_1,preferred_dest_2,preferred_dest_3,study_plan,uni_contacted,application_process,...,number_of_backlogs,budget,funding_source,parents_occupation,english_test_taken,english_test_name,english_test_score,family_abroad,other_consultants,lead_converted_flag
0,3,0,0,179,3,3,3,1,0,0,...,0,0,4,2,1,2,5.5,1,39,0
1,3,0,0,179,3,3,3,1,0,0,...,0,0,4,2,1,2,5.5,1,39,0
2,0,1,1,179,4,5,5,1,1,0,...,0,1,0,2,1,2,8.0,0,50,1
3,3,0,0,81,0,1,3,9,0,0,...,0,2,3,2,0,3,0.0,1,13,0
4,4,0,0,179,3,2,1,1,0,0,...,0,0,0,2,1,2,6.5,1,39,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1957,2,0,0,37,4,5,5,8,0,0,...,0,4,4,1,0,3,0.0,0,39,0
1958,3,0,0,52,4,6,0,2,0,0,...,0,4,4,0,0,3,0.0,0,39,0
1959,2,0,0,37,4,0,1,1,0,0,...,0,4,4,2,1,2,7.0,1,39,0
1960,2,0,0,37,1,1,1,1,0,0,...,0,2,5,2,0,3,0.0,1,39,0


In [29]:
import pickle 
# Save the model to disk
with open('random_forest_model.pkl', 'wb') as file:
    pickle.dump(rf_classifier, file)

print("Model saved to 'random_forest_model.pkl'.")

# Step 5: Load the Model and Predict New Data

# Load the model from disk
with open('random_forest_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

Model saved to 'random_forest_model.pkl'.


In [30]:
new_data = ['Undergraduate', 'No', 'No', 'Other', 'New Zealand', 'New Zealand', 'New Zealand', '01-2021', 'No', 'No', 'Student', '0', 1997, 'Yes', 'No', '0', '10-20', 'Parents asset', 'Own business', 'Yes', 'IELTS', 5.5, 'Yes', 'No']

# Step 6: Encode and Prepare Input Data for Prediction
input_data = pd.DataFrame([new_data], columns=X.columns)

# Encode the input data using the saved label encoders
for column in input_data.select_dtypes(include=['object']).columns:
    if column in label_encoders:
        input_data[column] = label_encoders[column].transform(input_data[column])

# Step 7: Predict the Outcome
prediction = loaded_model.predict(input_data)

print("Predicted lead_converted_flag:", prediction[0])

Predicted lead_converted_flag: 0
