In [57]:
import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from xgboost import XGBClassifier
from sklearn.metrics import f1_score,accuracy_score,confusion_matrix
import tensorflow as tf


In [58]:
dataset1 = pd.read_csv('Leads-data.csv').drop(columns=['Agent_id']).iloc[:,[0,1,3,4,12]]
dataset1 = dataset1.dropna(subset=["budget","duration"])
indx = dataset1[dataset1["status"] == 'OPPORTUNITY'].index.to_list()
indx += dataset1[dataset1["status"] == 'CONTACTED'].index.to_list()
indx += dataset1[dataset1["status"] == 'PROCESSING'].index.to_list()
indx += dataset1[dataset1["status"] == 'IMPORTANT'].index.to_list()
dataset1 = dataset1.drop(indx)

In [59]:
#duration preprocess
duration_list = dataset1['duration'].to_numpy()
dl = []
for z in range(len(duration_list)):
    i = duration_list[z]
    i = i.lower()
    y = re.findall(r'\d+',i)
    try:
        last = y.pop()
    except:
        last = 1
    if(bool(re.search("week",i))):
        dl.append(int(last))
    elif(bool(re.search("month",i))):
        dl.append(int(last*4))
    elif(bool(re.search("year",i))):
        dl.append(int(last*52))
    elif(bool(re.search("sem",i))):
        dl.append(int(26))
    else:
        try:
            i = int(i)
            if(i == 0):
                dl.append(np.nan)
                continue
            dl.append(i)
            
        except:
            #drop these rows
            dl.append(np.nan)
    

In [60]:
#budget pre process
budget_list = dataset1["budget"].values
bl = []
for i in budget_list:
    if(pd.isna(i)):
        bl.append(i)
        continue
    y = re.findall(r'\d+',i)
    try:
        last = y.pop()
        if(int(last) == 0):
            bl.append(np.nan)
            continue
    except:
        #drop row
        pass
    bl.append(int(last))

In [61]:
room_type_list = dataset1["room_type"].to_numpy()
rt = []
keys = {'Ensuite':1, np.nan:0, 'Entire Place':3, 'Studio':2, 'Twin-Studio':5,'Non-Ensuite':4}
for i in range(0 , len(room_type_list)):
    z = room_type_list[i]
    rt.append(keys[z])

In [62]:
status_list = dataset1["status"].to_numpy()
st = []
keys = {"LOST":0,"WON":1}
for i in range(0 , len(status_list)):
    z = status_list[i]
    st.append(keys[z])

In [63]:
data = {"status":st,"budget":bl,"duration":dl,"room_type":rt}
data_frame = pd.DataFrame(data)
data_frame.dropna(inplace = True)


In [64]:
x_data = data_frame.iloc[:,1:].values
y_data = data_frame.iloc[:,0].values.reshape(-1,1)

In [65]:
x_train,x_test,y_train,y_test = train_test_split(x_data,y_data,test_size = 0.1,random_state=42)

In [66]:
sc = StandardScaler()
x_train[:,0:2] = sc.fit_transform(x_train[:,0:2])
x_test[:,0:2] = sc.transform(x_test[:,0:2])

mms = MinMaxScaler()
x_train[:,2] = mms.fit_transform(x_train[:,2].reshape(-1,1)).reshape(1,-1)
x_test[:,2] = mms.transform(x_test[:,2].reshape(-1,1)).reshape(1,-1)

In [67]:
# from sklearn.utils import resample
# train = np.concatenate([x_train,y_train],axis=1)
# train = pd.DataFrame(train)
# zeros = train[train[3] == 0]
# ones = train[train[3] == 1]
# ones_up = resample(ones, replace=True, n_samples=len(zeros))
# ones_up
# Upsampling block for class imbalances

In [68]:
# train = np.concatenate([x_train,y_train],axis=1)
# train = pd.DataFrame(train)
# zeros = train[train[3] == 0]
# ones = train[train[3] == 1]
# zeros_down = resample(zeros, replace=True, n_samples=len(ones))
# zeros_down
# Downsampling block for class imbalances

In [69]:
# train_down = np.concatenate([zeros_down,ones])
# np.random.shuffle(train_down)
# new2_xtrain = train_down[:,:-1]
# new2_ytrain = train_down[:,-1]
# Combining the downsampled data and shuffling it

In [79]:
ann  = tf.keras.models.Sequential([tf.keras.layers.Dense(128,activation = "relu"),
                                   tf.keras.layers.Dense(64,activation = "relu"),
                                   tf.keras.layers.Dense(64,activation = "relu"),
                                   tf.keras.layers.Dense(32,activation = "relu"),
                                   tf.keras.layers.Dense(1,activation = "sigmoid")])


In [80]:
ann.compile(loss = "mse",optimizer = "adam",metrics = ["accuracy"])

In [81]:
class_weights = {0:(28171*1.115)/25824,1:(28171)/2347}

history = ann.fit(np.asarray(x_train[:,:]).astype(np.float32),np.asarray(y_train).astype(np.float32),epochs = 10,class_weight = class_weights)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [82]:
y_pred = ann.predict(np.asarray(x_test).astype(np.float32))
y_pred = np.squeeze(np.round(y_pred).astype(int).reshape(1,-1))
confusion_matrix(np.squeeze(y_test.reshape(1,-1)),y_pred)

array([[2360,  508],
       [ 170,   93]], dtype=int64)

In [77]:
ann2 = tf.keras.models.load_model("best.h5")
y_pred = ann2.predict(np.asarray(x_test).astype(np.float32))
y_pred = np.squeeze(np.round(y_pred).astype(int).reshape(1,-1))
accuracy_score(np.squeeze(y_test.reshape(1,-1)),y_pred),f1_score(np.squeeze(y_test.reshape(1,-1)),y_pred)

(0.7719578409453849, 0.2205240174672489)

In [None]:
from joblib import dump, load
dump(sc, 'std_scaler.bin', compress=True)
dump(mms, 'mms_scaler.bin', compress=True)

['mms_scaler.bin']

In [None]:
sc=load('std_scaler.bin')