In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
#from sklearn import datasets, neighbors
from sklearn.linear_model import LogisticRegression
from mlxtend.plotting import plot_decision_regions # used to plot the decision boundary of ml algorithms
from sklearn.model_selection import cross_val_score # import all the functions reqd for cross validation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
import seaborn as sns

# Data 

In [2]:
data = pd.read_csv('train.csv')

In [3]:
data.head(5)

Unnamed: 0,age,job,marital,education_qual,call_type,day,mon,dur,num_calls,prev_outcome,y
0,58,management,married,tertiary,unknown,5,may,261,1,unknown,no
1,44,technician,single,secondary,unknown,5,may,151,1,unknown,no
2,33,entrepreneur,married,secondary,unknown,5,may,76,1,unknown,no
3,47,blue-collar,married,unknown,unknown,5,may,92,1,unknown,no
4,33,unknown,single,unknown,unknown,5,may,198,1,unknown,no


In [4]:
data.shape

(45211, 11)

In [5]:
data.columns

Index(['age', 'job', 'marital', 'education_qual', 'call_type', 'day', 'mon',
       'dur', 'num_calls', 'prev_outcome', 'y'],
      dtype='object')

#### y encoded

In [6]:
data.y = data.y.map({'yes':1,'no':0})

### All datatypes are checked

In [7]:
data.dtypes

age                int64
job               object
marital           object
education_qual    object
call_type         object
day                int64
mon               object
dur                int64
num_calls          int64
prev_outcome      object
y                  int64
dtype: object

#### Duplicates deleted

In [8]:
data.shape

(45211, 11)

In [9]:
data = data.drop_duplicates()

In [10]:
data.shape

(45205, 11)

#### Checking for nan values

In [11]:
data.isnull().sum()

age               0
job               0
marital           0
education_qual    0
call_type         0
day               0
mon               0
dur               0
num_calls         0
prev_outcome      0
y                 0
dtype: int64

### Data Cleaning

### NaN values and duplicates were already handled above, so no need to repeat those checks

# 1)age

In [13]:
data

Unnamed: 0,age,job,marital,education_qual,call_type,day,mon,dur,num_calls,prev_outcome,y
0,58,management,married,tertiary,unknown,5,may,261,1,unknown,0
1,44,technician,single,secondary,unknown,5,may,151,1,unknown,0
2,33,entrepreneur,married,secondary,unknown,5,may,76,1,unknown,0
3,47,blue-collar,married,unknown,unknown,5,may,92,1,unknown,0
4,33,unknown,single,unknown,unknown,5,may,198,1,unknown,0
...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,cellular,17,nov,977,3,unknown,1
45207,71,retired,divorced,primary,cellular,17,nov,456,2,unknown,1
45208,72,retired,married,secondary,cellular,17,nov,1127,5,success,1
45209,57,blue-collar,married,secondary,telephone,17,nov,508,4,unknown,0


In [14]:
data.age.dtype

dtype('int64')

In [15]:
data.age.describe()

count    45205.000000
mean        40.937087
std         10.619130
min         18.000000
25%         33.000000
50%         39.000000
75%         48.000000
max         95.000000
Name: age, dtype: float64

### No need to clip the outliers

In [None]:
#iqr = data['age'].quantile(0.75) - data['age'].quantile(0.25)
#upper_threshold = data['age'].quantile(0.75) + (1.5 * iqr)
#lower_threshold = data['age'].quantile(0.25) - (1.5 * iqr)
#print('UT  -',round(upper_threshold,3),'LT  -', round(lower_threshold,3))
#print('Max -',round(data['age'].max(),3),'Min -',round(data['age'].min(),3))

In [None]:
#data.age = data.age.clip(min(data.age), upper_threshold)
#data.age.describe()

# 2)job

In [16]:
data.head(5)

Unnamed: 0,age,job,marital,education_qual,call_type,day,mon,dur,num_calls,prev_outcome,y
0,58,management,married,tertiary,unknown,5,may,261,1,unknown,0
1,44,technician,single,secondary,unknown,5,may,151,1,unknown,0
2,33,entrepreneur,married,secondary,unknown,5,may,76,1,unknown,0
3,47,blue-collar,married,unknown,unknown,5,may,92,1,unknown,0
4,33,unknown,single,unknown,unknown,5,may,198,1,unknown,0


In [17]:
job = list(data.job.unique())

In [21]:
len(data.loc[data.job=='unknown'])/len(data)

0.0063709766618736865

In [22]:
# Share of rows (in %) with y == 1 for every job category, keyed in first-seen order.
job_suc = {
    name: (len(data.loc[(data.job == name) & (data.y == 1)]) / len(data.loc[data.job == name])) * 100
    for name in job
}

In [23]:
job_suc

{'management': 13.757005392830706,
 'technician': 11.058451816745656,
 'entrepreneur': 8.271687962340282,
 'blue-collar': 7.276464542651594,
 'unknown': 11.805555555555555,
 'retired': 22.791519434628977,
 'admin.': 12.205029013539653,
 'services': 8.885143269925354,
 'self-employed': 11.842938568714375,
 'unemployed': 15.502686108979278,
 'housemaid': 8.790322580645162,
 'student': 28.678038379530918}

In [24]:
job_suc = pd.Series(job_suc)

In [25]:
job_suc = job_suc.sort_values()

In [26]:
job_suc

blue-collar       7.276465
entrepreneur      8.271688
housemaid         8.790323
services          8.885143
technician       11.058452
unknown          11.805556
self-employed    11.842939
admin.           12.205029
management       13.757005
unemployed       15.502686
retired          22.791519
student          28.678038
dtype: float64

In [27]:
rankjob = {x: i for i, x in enumerate(job_suc.index)}

In [28]:
rankjob

{'blue-collar': 0,
 'entrepreneur': 1,
 'housemaid': 2,
 'services': 3,
 'technician': 4,
 'unknown': 5,
 'self-employed': 6,
 'admin.': 7,
 'management': 8,
 'unemployed': 9,
 'retired': 10,
 'student': 11}

In [29]:
data['job'] = data['job'].map(rankjob)

In [30]:
data

Unnamed: 0,age,job,marital,education_qual,call_type,day,mon,dur,num_calls,prev_outcome,y
0,58,8,married,tertiary,unknown,5,may,261,1,unknown,0
1,44,4,single,secondary,unknown,5,may,151,1,unknown,0
2,33,1,married,secondary,unknown,5,may,76,1,unknown,0
3,47,0,married,unknown,unknown,5,may,92,1,unknown,0
4,33,5,single,unknown,unknown,5,may,198,1,unknown,0
...,...,...,...,...,...,...,...,...,...,...,...
45206,51,4,married,tertiary,cellular,17,nov,977,3,unknown,1
45207,71,10,divorced,primary,cellular,17,nov,456,2,unknown,1
45208,72,10,married,secondary,cellular,17,nov,1127,5,success,1
45209,57,0,married,secondary,telephone,17,nov,508,4,unknown,0


### 3)marital

In [32]:
data

Unnamed: 0,age,job,marital,education_qual,call_type,day,mon,dur,num_calls,prev_outcome,y
0,58,8,married,tertiary,unknown,5,may,261,1,unknown,0
1,44,4,single,secondary,unknown,5,may,151,1,unknown,0
2,33,1,married,secondary,unknown,5,may,76,1,unknown,0
3,47,0,married,unknown,unknown,5,may,92,1,unknown,0
4,33,5,single,unknown,unknown,5,may,198,1,unknown,0
...,...,...,...,...,...,...,...,...,...,...,...
45206,51,4,married,tertiary,cellular,17,nov,977,3,unknown,1
45207,71,10,divorced,primary,cellular,17,nov,456,2,unknown,1
45208,72,10,married,secondary,cellular,17,nov,1127,5,success,1
45209,57,0,married,secondary,telephone,17,nov,508,4,unknown,0


In [33]:
mar = list(data.marital.unique())

In [34]:
mar

['married', 'single', 'divorced']

In [35]:
# Share of rows (in %) with y == 1 for every marital status, keyed in first-seen order.
mar_suc = {
    status: (len(data.loc[(data.marital == status) & (data.y == 1)]) / len(data.loc[data.marital == status])) * 100
    for status in mar
}

In [36]:
mar_suc

{'married': 10.124954061006983,
 'single': 14.95151704723178,
 'divorced': 11.945458037257538}

In [37]:
mar_suc = pd.Series(mar_suc)

In [38]:
mar_suc = mar_suc.sort_values()

In [39]:
mar_suc

married     10.124954
divorced    11.945458
single      14.951517
dtype: float64

In [40]:
rankmar = {x: i for i, x in enumerate(mar_suc.index)}

In [41]:
rankmar

{'married': 0, 'divorced': 1, 'single': 2}

In [42]:
data['marital'] = data['marital'].map(rankmar)

In [43]:
data

Unnamed: 0,age,job,marital,education_qual,call_type,day,mon,dur,num_calls,prev_outcome,y
0,58,8,0,tertiary,unknown,5,may,261,1,unknown,0
1,44,4,2,secondary,unknown,5,may,151,1,unknown,0
2,33,1,0,secondary,unknown,5,may,76,1,unknown,0
3,47,0,0,unknown,unknown,5,may,92,1,unknown,0
4,33,5,2,unknown,unknown,5,may,198,1,unknown,0
...,...,...,...,...,...,...,...,...,...,...,...
45206,51,4,0,tertiary,cellular,17,nov,977,3,unknown,1
45207,71,10,1,primary,cellular,17,nov,456,2,unknown,1
45208,72,10,0,secondary,cellular,17,nov,1127,5,success,1
45209,57,0,0,secondary,telephone,17,nov,508,4,unknown,0


### 4)education_qual

In [None]:
data.head(5)

In [None]:
edu = list(data.education_qual.unique())

In [None]:
# Share of rows (in %) with y == 1 for every education level, keyed in first-seen order.
edu_suc = {
    level: (len(data.loc[(data.education_qual == level) & (data.y == 1)]) / len(data.loc[data.education_qual == level])) * 100
    for level in edu
}

In [None]:
edu_suc

In [None]:
edu_suc = pd.Series(edu_suc)

In [None]:
edu_suc

In [None]:
edu_suc = edu_suc.sort_values()

In [None]:
edu_suc

In [None]:
rankedu = {x: i for i, x in enumerate(edu_suc.index)}

In [None]:
rankedu

In [None]:
data.education_qual =data.education_qual.map(rankedu)

In [None]:
data

### 5)call_type

In [None]:
data.head(5)

In [None]:
ct = list(data.call_type.unique())

In [None]:
ct

In [None]:
# Share of rows (in %) with y == 1 for every call type, keyed in first-seen order.
ct_suc = {
    kind: (len(data.loc[(data.call_type == kind) & (data.y == 1)]) / len(data.loc[data.call_type == kind])) * 100
    for kind in ct
}

In [None]:
ct_suc

In [None]:
ct_suc = pd.Series(ct_suc)

In [None]:
ct_suc = ct_suc.sort_values()

In [None]:
ct_suc

In [None]:
ctrank = {x: i for i, x in enumerate(ct_suc.index)}

In [None]:
ctrank

In [None]:
data.call_type =data.call_type.map(ctrank)

### 6) day

In [31]:
data.day.unique()

array([ 5,  6,  7,  8,  9, 12, 13, 14, 15, 16, 19, 20, 21, 23, 26, 27, 28,
       29, 30,  2,  3,  4, 11, 17, 18, 24, 25,  1, 10, 22, 31],
      dtype=int64)

In [45]:
da = list(data.day.unique())

In [None]:
da

In [47]:
# Share of rows (in %) with y == 1 for every day value, keyed in first-seen order.
da_suc = {
    day_value: (len(data.loc[(data.day == day_value) & (data.y == 1)]) / len(data.loc[data.day == day_value])) * 100
    for day_value in da
}

In [48]:
da_suc

{5: 11.2565445026178,
 6: 9.368530020703934,
 7: 8.640616400660429,
 8: 10.92391304347826,
 9: 11.474358974358974,
 12: 15.221459762944479,
 13: 15.205047318611985,
 14: 11.363636363636363,
 15: 13.975337639459775,
 16: 13.568904593639575,
 19: 6.947608200455581,
 20: 6.976744186046512,
 21: 9.921026653504441,
 23: 13.418530351437699,
 26: 11.207729468599034,
 27: 13.380909901873327,
 28: 7.818480043739748,
 29: 7.392550143266476,
 30: 17.305236270753515,
 2: 14.086687306501547,
 3: 16.4967562557924,
 4: 15.916955017301039,
 11: 12.237998647734956,
 17: 9.076843733883445,
 18: 9.878682842287695,
 24: 13.870246085011187,
 25: 15.833333333333332,
 1: 27.95031055900621,
 10: 23.091603053435115,
 22: 17.016574585635357,
 31: 7.153965785381026}

In [49]:
da_suc = pd.Series(da_suc)

In [50]:
da_suc = da_suc.sort_values()

In [51]:
da_suc

19     6.947608
20     6.976744
31     7.153966
29     7.392550
28     7.818480
7      8.640616
17     9.076844
6      9.368530
18     9.878683
21     9.921027
8     10.923913
26    11.207729
5     11.256545
14    11.363636
9     11.474359
11    12.237999
27    13.380910
23    13.418530
16    13.568905
24    13.870246
15    13.975338
2     14.086687
13    15.205047
12    15.221460
25    15.833333
4     15.916955
3     16.496756
22    17.016575
30    17.305236
10    23.091603
1     27.950311
dtype: float64

In [52]:
darank = {x: i for i, x in enumerate(da_suc.index)}

In [53]:
darank

{19: 0,
 20: 1,
 31: 2,
 29: 3,
 28: 4,
 7: 5,
 17: 6,
 6: 7,
 18: 8,
 21: 9,
 8: 10,
 26: 11,
 5: 12,
 14: 13,
 9: 14,
 11: 15,
 27: 16,
 23: 17,
 16: 18,
 24: 19,
 15: 20,
 2: 21,
 13: 22,
 12: 23,
 25: 24,
 4: 25,
 3: 26,
 22: 27,
 30: 28,
 10: 29,
 1: 30}

In [54]:
data.day =data.day.map(darank)

In [55]:
data.head()

Unnamed: 0,age,job,marital,education_qual,call_type,day,mon,dur,num_calls,prev_outcome,y
0,58,8,0,tertiary,unknown,12,may,261,1,unknown,0
1,44,4,2,secondary,unknown,12,may,151,1,unknown,0
2,33,1,0,secondary,unknown,12,may,76,1,unknown,0
3,47,0,0,unknown,unknown,12,may,92,1,unknown,0
4,33,5,2,unknown,unknown,12,may,198,1,unknown,0


### 7)mon 

In [None]:
data.head(5)

In [None]:
mas = list(data.mon.unique())

In [None]:
mas

In [None]:
# Share of rows (in %) with y == 1 for every month, keyed in first-seen order.
m_suc = {
    month: (len(data.loc[(data.mon == month) & (data.y == 1)]) / len(data.loc[data.mon == month])) * 100
    for month in mas
}

In [None]:
m_suc

In [None]:
m_suc = pd.Series(m_suc)

In [None]:
m_suc = m_suc.sort_values()

In [None]:
m_suc

In [None]:
mrank = {x: i for i, x in enumerate(m_suc.index)}

In [None]:
mrank

In [None]:
data.mon =data.mon.map(mrank)

In [None]:
data

### 8)dur

### Clipping the outliers 

In [None]:
# Tukey fences for `dur`: quartiles pushed outwards by 1.5 * IQR.
iqr = data.dur.quantile(0.75) - data.dur.quantile(0.25)
lower_threshold = data.dur.quantile(0.25) - (iqr * 1.5)
upper_threshold = data.dur.quantile(0.75) + (iqr * 1.5)
# Show the fences next to the observed range to see which side has outliers.
print(f"UT  - {round(upper_threshold, 3)} LT  - {round(lower_threshold, 3)}")
print(f"Max - {round(data.dur.max(), 3)} Min - {round(data.dur.min(), 3)}")

In [None]:
data.dur.describe()

In [None]:
# Cap `dur` at the upper fence only; the lower bound is the observed minimum, so the low end is untouched.
data['dur'] = data['dur'].clip(lower=data['dur'].min(), upper=upper_threshold)
data['dur'].describe()

### 9)num_calls

In [None]:
data.num_calls.unique()

In [None]:
data.num_calls.isnull().sum()

### Clipping the outliers

In [None]:
# Tukey fences for `num_calls`: quartiles pushed outwards by 1.5 * IQR.
iqr = data.num_calls.quantile(0.75) - data.num_calls.quantile(0.25)
lower_threshold = data.num_calls.quantile(0.25) - (iqr * 1.5)
upper_threshold = data.num_calls.quantile(0.75) + (iqr * 1.5)
# Show the fences next to the observed range to see which side has outliers.
print(f"UT  - {round(upper_threshold, 3)} LT  - {round(lower_threshold, 3)}")
print(f"Max - {round(data.num_calls.max(), 3)} Min - {round(data.num_calls.min(), 3)}")

In [None]:
data.num_calls.describe()

In [None]:
# Cap `num_calls` at the upper fence only; the lower bound is the observed minimum, so the low end is untouched.
data['num_calls'] = data['num_calls'].clip(lower=data['num_calls'].min(), upper=upper_threshold)
data['num_calls'].describe()

### 10)prev_outcome

In [None]:
pre = list(data.prev_outcome.unique())

In [None]:
pre

In [None]:
# Share of rows (in %) with y == 1 for every previous-outcome value, keyed in first-seen order.
p_suc = {
    outcome: (len(data.loc[(data.prev_outcome == outcome) & (data.y == 1)]) / len(data.loc[data.prev_outcome == outcome])) * 100
    for outcome in pre
}

In [None]:
p_suc

In [None]:
p_suc = pd.Series(p_suc)

In [None]:
p_suc = p_suc.sort_values()

In [None]:
p_suc

In [None]:
prank = {x: i for i, x in enumerate(p_suc.index)}

In [None]:
prank

In [None]:
data.prev_outcome =data.prev_outcome.map(prank)

In [None]:
data

# Preprocessing

### y-target

In [None]:
y = list(data.y.unique())

In [None]:
y

In [None]:
# Percentage of rows in each target class of the cleaned data (y == 0 -> fail, y == 1 -> sux).
fail, sux = (len(data.loc[data.y == cls]) * 100 / len(data) for cls in (0, 1))

In [None]:
fail+sux

In [None]:
imb = [fail,sux]

In [None]:
imb

In [None]:
imb = pd.DataFrame(imb, columns=['% Split'], index = ['Fail','Success'])

In [None]:
# `imb` holds the percentage split per class, not counts, so the title says so.
imb.plot(kind='bar', title='Target class split (%)');

In [None]:
imb

In [None]:
X = data.iloc[:,:10]

In [None]:
len(X.columns)

In [None]:
X.columns

In [None]:
y = data.loc[:,'y']

In [None]:
y

In [None]:
len(y[y==0])*100/len(y),len(y[y==1])*100/len(y)

In [None]:
len(y[y==0])+len(y[y==1])-len(y)

In [None]:
import imblearn

In [None]:
from imblearn.under_sampling import ClusterCentroids

In [None]:
# Under-sample class 0 down to 10,000 cluster centroids.
# random_state is fixed (same seed as the train/test split) so a re-run yields the same centroids.
cc = ClusterCentroids(sampling_strategy={0: 10000}, random_state=7)

In [None]:
X_cc, y_cc = cc.fit_resample(X, y)

In [None]:
len(X_cc), len(y_cc)

In [None]:
len(y_cc.loc[y_cc==0]),len(y_cc.loc[y_cc==1])

In [None]:
len(y_cc.loc[y_cc==0])+len(y_cc.loc[y_cc==1]), len(y_cc)

In [None]:
len(y_cc.loc[y_cc==0])*100/len(y_cc),len(y_cc.loc[y_cc==1])*100/len(y_cc)

In [None]:
X_cc = pd.DataFrame(X_cc)

In [None]:
y_cc = pd.DataFrame(y_cc)

In [None]:
un = pd.concat([X_cc,y_cc],axis=1)

In [None]:
len(un)

In [None]:
un.head()

## Performing - SMOTEENN

In [None]:
from imblearn.combine import SMOTEENN

# SMOTE over-sampling followed by Edited-Nearest-Neighbours cleaning, applied to all classes.
# random_state is fixed (same seed as the train/test split) so the synthetic rows are reproducible.
smt = SMOTEENN(sampling_strategy='all', random_state=7)
X_smt, y_smt = smt.fit_resample(X, y)



In [None]:
len(X_smt),len(y_smt)

In [None]:
y_smt.shape,X_smt.shape

In [None]:
df = pd.concat([X_smt,y_smt],axis=1)

In [None]:
df

In [None]:
l0=df[['age','dur']].values
k0=df['y'].values

In [None]:
l1=data[['age','dur']].values
k1=data['y'].values

In [None]:
def plot_2d_space(X, y, label='Classes', xlabel='age', ylabel='dur'):
    """Scatter the first two columns of ``X``, one colour per class found in ``y``.

    Parameters
    ----------
    X : numpy.ndarray of shape (n_samples, >= 2)
        Feature matrix; only columns 0 and 1 are drawn.
    y : numpy.ndarray of shape (n_samples,)
        Class label of every row in ``X``.
    label : str
        Figure title.
    xlabel, ylabel : str
        Axis labels; the defaults match the ``['age', 'dur']`` slices used in this notebook.
    """
    fig, ax = plt.subplots()
    for cls in np.unique(y):
        in_class = y == cls
        # No explicit colour: matplotlib's colour cycle gives every class its own colour.
        ax.scatter(X[in_class, 0], X[in_class, 1], label=str(cls), alpha=0.5, s=10)
    ax.set(title=label, xlabel=xlabel, ylabel=ylabel)
    ax.legend(loc='upper right', title='y')
    plt.show()


# Compare the class scatter (age vs dur) before and after SMOTEENN resampling.
plot_2d_space(l1,k1, 'Original Plot')
plot_2d_space(l0,k0, 'SMOTE & ENN')

In [None]:
# Percentage of rows in each target class of the SMOTEENN-resampled frame `df`.
fail, sux = (len(df.loc[df.y == cls]) * 100 / len(df) for cls in (0, 1))

In [None]:
fail+sux,fail,sux

In [None]:
# Two-row frame with the percentage split per class, built in one step so `imb`
# is never a plain list first and a DataFrame afterwards.
imb = pd.DataFrame({'% Split': [fail, sux]}, index=['Fail', 'Success'])
# The bars are percentages, not counts, so title and axis say so.
ax = imb.plot(kind='bar', title='Target class split (%)', legend=False)
ax.set_ylabel('% of rows');

# Will use SMOTE to balance it more precisely

In [None]:
from imblearn.over_sampling import SMOTE

# Over-sample whichever class is the minority in the SMOTEENN output.
# random_state is fixed (same seed as the train/test split) so the synthetic rows are reproducible.
smote = SMOTE(sampling_strategy='minority', random_state=7)
X_sm, y_sm = smote.fit_resample(X_smt, y_smt)


In [None]:
len(X_sm),len(y_sm)

In [None]:
dc = pd.concat([X_sm,y_sm], axis = 1)

In [None]:
len(dc.loc[dc.y==0]),len(dc.loc[dc.y==1])

In [None]:
l2=dc[['age','dur']].values
k2=dc['y'].values

In [None]:
# Compare the class scatter of the cleaned data (l1, k1) with the SMOTE-resampled data (l2, k2).
plot_2d_space(l1,k1, 'Original data')
plot_2d_space(l2,k2, 'SMOTE Over-Sampling')


In [None]:
# Percentage of rows in each target class of the SMOTE-resampled frame `dc`.
fail, sux = (len(dc.loc[dc.y == cls]) * 100 / len(dc) for cls in (0, 1))

In [None]:
# Two-row frame with the percentage split per class, built in one step so `imb`
# is never a plain list first and a DataFrame afterwards.
imb = pd.DataFrame({'% Split': [fail, sux]}, index=['Fail', 'Success'])
# The bars are percentages, not counts, so title and axis say so.
ax = imb.plot(kind='bar', title='Target class split (%)', legend=False)
ax.set_ylabel('% of rows');

### Splitting Data

In [None]:
X = dc.iloc[:,:10] #array of features
y = dc.iloc[:,-1] #array of targets

In [None]:
len(X),len(y)

In [None]:
# Split the ORIGINAL (cleaned, encoded) rows first and balance the training part only.
# Splitting the already-resampled `dc` frame puts SMOTE-synthesised rows, which are
# interpolated from rows that end up in the training set, into the test set and
# inflates the reported score.
# NOTE(review): the category -> rank encodings above were still computed on all rows,
# including the ones that land in the test set.
X_raw, y_raw = data.iloc[:, :10], data['y']
X_train, X_test, y_train, y_test = train_test_split(
    X_raw, y_raw, test_size=0.2, random_state=7, stratify=y_raw
)

# Same two-step balancing as above (SMOTEENN, then SMOTE), fitted on the training rows only;
# the test rows keep their real class distribution.
X_train, y_train = SMOTEENN(sampling_strategy='all', random_state=7).fit_resample(X_train, y_train)
X_train, y_train = SMOTE(sampling_strategy='minority', random_state=7).fit_resample(X_train, y_train)

In [None]:
len(X_train),len(X_test)

### Scaling 

In [None]:
from sklearn.preprocessing import StandardScaler
# Learn mean and variance from the training rows only, then apply the same scaling to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression classifier with scikit-learn's default settings,
# fitted on the scaled training features and their labels.
log = LogisticRegression()
log.fit(X_train,y_train)

### Test Data - Predictions

In [None]:
# np.asarray instead of `.values`: gives the same array for a Series and is a no-op on an
# ndarray, so re-running this cell does not raise AttributeError.
y_train = np.asarray(y_train)

In [None]:
pred_1=log.predict(X_test)

In [None]:
# np.asarray instead of `.values`: gives the same array for a Series and is a no-op on an
# ndarray, so re-running this cell does not raise AttributeError.
y_test = np.asarray(y_test)

In [None]:
pred_test = pd.DataFrame({'target': y_test, 'prediction' : pred_1})

In [None]:
pred_test

## Evaluating the Model

F1 Score

In [None]:
from sklearn.metrics import f1_score

# F1 of the test-set predictions: pred_test.target holds the true labels, pred_test.prediction the model's output
f1 = f1_score(pred_test.target, pred_test.prediction)

print("F1 score:", f1)
