In [68]:
import pandas as pd
import numpy as np

In [69]:
clients = pd.read_excel('client.xlsx')
clients.head()

Unnamed: 0,CLIENT CODE,GENDER,OWNS CAR,OCCUPATION,AVERAGE PURCHASES,NUMBER OF REFERRALS,FREQUENT CUSTOMER
0,C001,MALE,YES,BUSINESSPERSON,1984,1.0,NO
1,C002,FEMALE,NO,ACCOUNTANT,2247,,NO
2,C003,MALE,YES,HIGHSCHOOL TEACHER,2407,,NO
3,C004,FEMALE,YES,SECURITY PERSONNEL,3366,0.0,YES
4,C005,FEMALE,YES,,1583,9.0,YES


In [70]:
clients.describe()

Unnamed: 0,AVERAGE PURCHASES,NUMBER OF REFERRALS
count,118.0,116.0
mean,2297.118644,4.75
std,719.509094,3.134312
min,1027.0,0.0
25%,1739.5,2.0
50%,2281.0,4.0
75%,2872.0,8.0
max,3479.0,10.0


Extracting the independent variables    

In [71]:
X = clients[['GENDER', 'OWNS CAR', 'OCCUPATION', 'AVERAGE PURCHASES', 'NUMBER OF REFERRALS']].values
X[:, 0:3]

array([['MALE', 'YES', 'BUSINESSPERSON'],
       ['FEMALE', 'NO', 'ACCOUNTANT'],
       ['MALE', 'YES', 'HIGHSCHOOL TEACHER'],
       ['FEMALE', 'YES', 'SECURITY PERSONNEL'],
       ['FEMALE', 'YES', nan],
       ['MALE', 'YES', nan],
       ['MALE', 'NO', 'HIGHSCHOOL TEACHER'],
       ['FEMALE', 'YES', 'BUSINESSPERSON'],
       ['MALE', 'NO', 'SOCIAL WORKER'],
       ['FEMALE', 'YES', 'GRADUATE INTERN'],
       ['FEMALE', 'YES', 'FINANCIAL EXPERT'],
       ['MALE', 'YES', 'DAY TRADER'],
       ['MALE', 'YES', 'DAY TRADER'],
       ['FEMALE', 'NO', 'LINGERIE MODEL'],
       ['MALE', 'YES', 'BUSINESSPERSON'],
       ['FEMALE', 'NO', 'ACCOUNTANT'],
       ['FEMALE', 'YES', 'HIGHSCHOOL TEACHER'],
       ['MALE', 'YES', 'SECURITY PERSONNEL'],
       ['MALE', 'YES', 'ACCOUNTANT'],
       ['FEMALE', 'YES', 'HIGHSCHOOL TEACHER'],
       ['MALE', 'NO', 'HIGHSCHOOL TEACHER'],
       ['FEMALE', 'YES', 'BUSINESSPERSON'],
       ['FEMALE', 'NO', 'SOCIAL WORKER'],
       ['MALE', 'YES', 'GRADUATE I

Extracting dependent variable

In [72]:
y = clients.iloc[:, 6]
y.shape

(118,)

In [73]:
clients.isnull().sum()

CLIENT CODE            0
GENDER                 0
OWNS CAR               0
OCCUPATION             2
AVERAGE PURCHASES      0
NUMBER OF REFERRALS    2
FREQUENT CUSTOMER      0
dtype: int64

Dealing with null values/missing values

In [74]:
from sklearn.impute import SimpleImputer

referrals_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
occupation_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

referrals = referrals_imputer.fit(X[:, 4:])
X[:, 4:] = referrals.transform(X[:, 4:])

occupations = occupation_imputer.fit(X[:, 2:3])
X[:, 2:3] = occupations.transform(X[:, 2:3])



Dealing with categorical data/encoding categorical data

In [75]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

encoder = LabelEncoder()

X[:, 0] = encoder.fit_transform(X[:, 0])
X[:, 2] = encoder.fit_transform(X[:, 2])
X[:, 1] = encoder.fit_transform(X[:, 1])
y = encoder.fit_transform(y)

# using the normal encoder the model might assume that the labels have some mathematical meaning, encode accountant to 3 and teacher to 4 might mean that teacher > accountant
# onehotencoder is used to correct for this problem


In [76]:
# using the onehotencoder
from sklearn.preprocessing import OneHotEncoder

onehotencoder = OneHotEncoder()

onehotencoder.fit_transform(clients['OCCUPATION'].values.reshape(-1, 1)).toarray()


array([[0., 1., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

Feature scaling

In [77]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()


Splitting the datasets into training and testing datasets

In [78]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train[0]

array([0, 0, 6, 1207, 3.0], dtype=object)

In [79]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [80]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

model.fit(X_train, y_train)


In [81]:
# testing results

from sklearn import metrics

predictions = model.predict(X_test)

confusion_matrix_results = metrics.confusion_matrix(y_test, predictions)
model_score = model.score(X_test, y_test)

model_score

0.8333333333333334