In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

social_N_data = pd.read_csv('Social_Network_Ads.csv')
pd.concat([social_N_data.head(), social_N_data.tail()])

#CHECK FOR NULL VALUES
social_N_data.isnull().any()

# CLEAN THE DATA
social_N_data.drop('User ID', axis=1, inplace=True)

# CHANGE CATEGORICAL VARIABLE TO DUMMIES
social_N_data.info()
gender = pd.get_dummies(social_N_data['Gender'], drop_first=True)
social_N_data.drop('Gender',axis=1,inplace=True)
social_N_data = pd.concat([social_N_data,gender], axis=1)

# SPLIT DATA TO INDEPENDENT AND DEPENDENT VARIABLES
X = social_N_data.iloc[:,[0,1,3]] # Age, EstimatedSalary and Male
y = social_N_data.iloc[:, 2] # Purchased


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Gender           400 non-null    object
 1   Age              400 non-null    int64 
 2   EstimatedSalary  400 non-null    int64 
 3   Purchased        400 non-null    int64 
dtypes: int64(3), object(1)
memory usage: 12.6+ KB


In [26]:
X

Unnamed: 0,Age,EstimatedSalary,Male
0,19,19000,True
1,35,20000,True
2,26,43000,False
3,27,57000,False
4,19,76000,True
...,...,...,...
395,46,41000,False
396,51,23000,True
397,50,20000,False
398,36,33000,True


In [27]:
y

0      0
1      0
2      0
3      0
4      0
      ..
395    1
396    1
397    1
398    0
399    1
Name: Purchased, Length: 400, dtype: int64

In [28]:
sc = StandardScaler()
X = sc.fit_transform(X)

# SPLIT DATA TO TRAIN AND TEST SET
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=10)


In [29]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(360, 3)
(40, 3)
(360,)
(40,)


In [30]:
X_train

array([[ 0.31949142, -0.72655996,  1.02020406],
       [ 0.70154394, -0.72655996, -0.98019606],
       [-1.30423178, -1.49004624,  1.02020406],
       ...,
       [-0.82666613,  0.30121002,  1.02020406],
       [ 0.12846516, -0.25672226, -0.98019606],
       [ 0.41500455,  1.123426  , -0.98019606]])

In [31]:
from sklearn.linear_model import LogisticRegression  
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

In [35]:
# (Hệ số tự do của biến độc lập):
print(classifier.intercept_)

# (Hệ số góc của biến độc lập):
print(classifier.coef_)

[-1.03526656]
[[2.15200421 1.08592164 0.15798615]]


In [36]:
y_pred = classifier.predict(X_test)
y_pred

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,

In [37]:
print(classifier.predict_proba(X_train))

[[0.72622012 0.27377988]
 [0.61523834 0.38476166]
 [0.99502763 0.00497237]
 [0.95726116 0.04273884]
 [0.85135962 0.14864038]
 [0.95548178 0.04451822]
 [0.04809558 0.95190442]
 [0.814917   0.185083  ]
 [0.73059929 0.26940071]
 [0.98173906 0.01826094]
 [0.7613964  0.2386036 ]
 [0.43447035 0.56552965]
 [0.4747357  0.5252643 ]
 [0.78850453 0.21149547]
 [0.69654861 0.30345139]
 [0.68509651 0.31490349]
 [0.13030771 0.86969229]
 [0.94837446 0.05162554]
 [0.32164738 0.67835262]
 [0.00287366 0.99712634]
 [0.94906612 0.05093388]
 [0.14510353 0.85489647]
 [0.80231638 0.19768362]
 [0.20437068 0.79562932]
 [0.01151502 0.98848498]
 [0.28275441 0.71724559]
 [0.53300828 0.46699172]
 [0.00729113 0.99270887]
 [0.99818443 0.00181557]
 [0.68828648 0.31171352]
 [0.7184393  0.2815607 ]
 [0.44036853 0.55963147]
 [0.9854964  0.0145036 ]
 [0.96087859 0.03912141]
 [0.92732851 0.07267149]
 [0.73473683 0.26526317]
 [0.95536065 0.04463935]
 [0.99244554 0.00755446]
 [0.02608456 0.97391544]
 [0.63102471 0.36897529]


In [40]:
y_train

151    0
392    1
66     0
37     0
295    0
      ..
369    1
320    1
15     0
125    0
265    1
Name: Purchased, Length: 360, dtype: int64