#### Logistic Regression

##### Step 1: Data Preprocessing

In [47]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [15]:
# Read the Social Network Ads CSV (path is relative to the working directory)
# and preview the first rows to confirm the columns parsed as expected.
ads_csv = "Datasets/Social_Network_Ads.csv"
dataset = pd.read_csv(ads_csv)
dataset.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [16]:
# (rows, columns) of the loaded frame.
dataset.shape

(400, 5)

In [17]:
# Per-column dtype and non-null count, to spot missing values and non-numeric columns.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


In [18]:
# Summary statistics of the numeric columns (Gender is an object column and is not included).
dataset.describe()

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased
count,400.0,400.0,400.0,400.0
mean,15691540.0,37.655,69742.5,0.3575
std,71658.32,10.482877,34096.960282,0.479864
min,15566690.0,18.0,15000.0,0.0
25%,15626760.0,29.75,43000.0,0.0
50%,15694340.0,37.0,70000.0,0.0
75%,15750360.0,46.0,88000.0,1.0
max,15815240.0,60.0,150000.0,1.0


##### Step 2: Encoding categorical variables

In [19]:
# Distinct Gender values and how often each occurs, checked before encoding the column.
dataset["Gender"].value_counts()

Female    204
Male      196
Name: Gender, dtype: int64

In [20]:
# One-hot encode Gender, then drop Gender_Male so a single indicator remains
# (Gender_Female is 1 for female rows); keeping both columns would be redundant.
# dtype=int keeps the indicator as 0/1 on pandas versions whose default dummy
# dtype is bool, and chaining .drop() avoids mutating the frame in place.
dataset_new = (
    pd.get_dummies(dataset, columns=["Gender"], dtype=int)
    .drop(columns="Gender_Male")
)
dataset_new.head()

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased,Gender_Female
0,15624510,19,19000,0,0
1,15810944,35,20000,0,0
2,15668575,26,43000,0,1
3,15603246,27,57000,0,1
4,15804002,19,76000,0,0


##### Step 3: Train-Test Split

In [28]:
# Features: Age, EstimatedSalary and the Gender_Female indicator; User ID is
# an identifier and is left out. Selecting by name instead of by position keeps
# the selection correct if the column order ever changes.
feature_columns = ["Age", "EstimatedSalary", "Gender_Female"]
X = dataset_new[feature_columns].values
# Target as a 1-D vector: an (n, 1) column makes scikit-learn emit a
# DataConversionWarning when the model is fitted.
Y = dataset_new["Purchased"].values

In [29]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

##### Step 4: Standardizing the features (zero mean, unit variance)

In [31]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits so the test data does not influence it.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

##### Step 5: Fitting the model

In [43]:
# n_jobs is dropped: scikit-learn documents it as parallelising across classes
# only, so it does nothing for this two-class target.
model = LogisticRegression()
# ravel() hands fit() a 1-D label vector; an (n, 1) column triggers
# DataConversionWarning ("y = column_or_1d(y, warn=True)").
model.fit(X_train, Y_train.ravel())

  y = column_or_1d(y, warn=True)


0.8875

##### Step 6: Checking Accuracy of the model

In [35]:
# Mean accuracy of the fitted classifier on the held-out test split.
model.score(X_test, Y_test)

0.8875

##### Step 7: Predicting the result

In [45]:
# Predicted class labels for the held-out test features; the bare name on the
# last line displays the array.
Y_pred = model.predict(X_test)
Y_pred

array([0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0], dtype=int64)

In [49]:
# Confusion matrix of true vs. predicted test labels
# (scikit-learn convention: rows are true classes, columns are predictions).
cm = confusion_matrix(y_true=Y_test, y_pred=Y_pred)
cm

array([[50,  2],
       [ 7, 21]], dtype=int64)