In [2]:
# Logistic Regression with Organics.csv and statsmodels library 

In [5]:
# Organics.csv dataset is in the folder C:\Users\jason\OneDrive\Documents\MBAD 6211 LAST SEMESTER

In [11]:
#Import the os library to use file and folder methods
import os

In [13]:
# view the current working directory
# (shown as the cell's output; the value is not stored in a variable)
os.getcwd()

'C:\\Users\\jason'

In [17]:
# Change into the "Documents" sub-folder of the current working directory.
# The path is relative, so running os.chdir('Documents') again from inside
# Documents would look for Documents\Documents and raise FileNotFoundError.
# Skip the chdir when the working directory is already named "Documents"
# so the cell can be re-run safely.
if os.path.basename(os.getcwd()) != 'Documents':
    os.chdir('Documents')


In [19]:
# confirm the working directory after the chdir above
os.getcwd()

'C:\\Users\\jason\\Documents'

In [80]:
#import necessary libraries for problem solving
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [82]:
# read data from Organics.csv into a dataframe
# The data folder defaults to the original local path but can be overridden
# with the ORGANICS_DATA_DIR environment variable, so the notebook can run on
# another machine without editing this cell.
DATA_DIR = os.environ.get(
    "ORGANICS_DATA_DIR",
    r'C:\Users\jason\OneDrive\Documents\MBAD 6211 LAST SEMESTER',
)
file_path = os.path.join(DATA_DIR, 'organics.csv')
data = pd.read_csv(file_path)

In [84]:
# examine the dataframe's characteristics:
# column names, non-null counts, dtypes and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22223 entries, 0 to 22222
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   ID         22223 non-null  int64  
 1   DemAffl    21138 non-null  float64
 2   DemAge     20715 non-null  float64
 3   DemGender  19711 non-null  object 
 4   PromClass  22223 non-null  object 
 5   PromSpend  22223 non-null  float64
 6   PromTime   21942 non-null  float64
 7   TargetBuy  22223 non-null  int64  
dtypes: float64(4), int64(2), object(2)
memory usage: 1.4+ MB


In [86]:
# count the missing values in each column
data.isna().sum()

ID              0
DemAffl      1085
DemAge       1508
DemGender    2512
PromClass       0
PromSpend       0
PromTime      281
TargetBuy       0
dtype: int64

In [88]:
# remove every row that has a missing value in any column,
# then summarise the reduced dataframe
data_nonull = data.dropna(axis=0, how="any")
data_nonull.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17272 entries, 0 to 22221
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   ID         17272 non-null  int64  
 1   DemAffl    17272 non-null  float64
 2   DemAge     17272 non-null  float64
 3   DemGender  17272 non-null  object 
 4   PromClass  17272 non-null  object 
 5   PromSpend  17272 non-null  float64
 6   PromTime   17272 non-null  float64
 7   TargetBuy  17272 non-null  int64  
dtypes: float64(4), int64(2), object(2)
memory usage: 1.2+ MB


In [90]:
# drop the ID column so it is not used as a model input
# (presumably just a record identifier — confirm)
# errors="ignore" keeps this cell re-runnable: without it a second execution
# would raise KeyError because "ID" has already been removed.
data_nonull = data_nonull.drop(columns=["ID"], errors="ignore")
data_nonull.head()

Unnamed: 0,DemAffl,DemAge,DemGender,PromClass,PromSpend,PromTime,TargetBuy
0,10.0,76.0,U,Gold,16000.0,4.0,0
1,4.0,49.0,U,Gold,6000.0,5.0,0
2,5.0,70.0,F,Silver,0.02,8.0,1
3,10.0,65.0,M,Tin,0.01,7.0,1
4,11.0,68.0,F,Tin,0.01,8.0,0


In [92]:
# drop the PromTime column so it is not used as a model input
# NOTE(review): the reason for excluding PromTime is not stated — confirm
# errors="ignore" keeps this cell re-runnable: without it a second execution
# would raise KeyError because "PromTime" has already been removed.
data_nonull = data_nonull.drop(columns=["PromTime"], errors="ignore")
data_nonull.head()

Unnamed: 0,DemAffl,DemAge,DemGender,PromClass,PromSpend,TargetBuy
0,10.0,76.0,U,Gold,16000.0,0
1,4.0,49.0,U,Gold,6000.0,0
2,5.0,70.0,F,Silver,0.02,1
3,10.0,65.0,M,Tin,0.01,1
4,11.0,68.0,F,Tin,0.01,0


In [94]:
# use the pd.get_dummies() method to create the dummy (one-hot) columns
# for the object-typed columns (DemGender and PromClass)
# make sure to set parameter dtype = int so the dummy columns hold 0/1 integers
# (recent pandas versions default to bool).
data_nonull = pd.get_dummies(data_nonull, dtype = int)
data_nonull

Unnamed: 0,DemAffl,DemAge,PromSpend,TargetBuy,DemGender_F,DemGender_M,DemGender_U,PromClass_Gold,PromClass_Platinum,PromClass_Silver,PromClass_Tin
0,10.0,76.0,16000.00,0,0,0,1,1,0,0,0
1,4.0,49.0,6000.00,0,0,0,1,1,0,0,0
2,5.0,70.0,0.02,1,1,0,0,0,0,1,0
3,10.0,65.0,0.01,1,0,1,0,0,0,0,1
4,11.0,68.0,0.01,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
22216,13.0,49.0,500.00,0,0,1,0,0,0,1,0
22218,13.0,65.0,1500.00,0,1,0,0,0,0,1,0
22219,15.0,73.0,6053.06,0,0,0,1,1,0,0,0
22220,9.0,70.0,6000.00,0,1,0,0,1,0,0,0


In [96]:
# Dependent and independent variables for the logistic regression.
# y: the TargetBuy column as a NumPy array
y = np.array(data_nonull["TargetBuy"])


# x: dataframe of the predictors. DemGender_U and PromClass_Tin are left out,
# so they act as the reference categories for the dummy variables.
feature_cols = [
    "DemAffl", "DemAge", "PromSpend",
    "DemGender_F", "DemGender_M",
    "PromClass_Gold", "PromClass_Platinum", "PromClass_Silver",
]
x = data_nonull[feature_cols]


In [100]:
# hold out 30% of the rows for testing
# the split is random, so random_state is fixed to make it reproducible
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=44,
)

In [102]:
# ready to run Logistic regression model
# add a constant column to the independent data frames so the model can fit
# an intercept term (it appears as "const" in the summary and in X_test)
X_train = sm.add_constant(X_train)
X_test = sm.add_constant(X_test)

In [104]:
# Fit the logistic regression on the training data (cast to float)
# and print the summary table of the fitted model
endog_train = y_train.astype(float)
exog_train = X_train.astype(float)
model = sm.Logit(endog_train, exog_train)
result = model.fit()

print(result.summary())

Optimization terminated successfully.
         Current function value: 0.449580
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                12090
Model:                          Logit   Df Residuals:                    12081
Method:                           MLE   Df Model:                            8
Date:                Thu, 08 Feb 2024   Pseudo R-squ.:                  0.2229
Time:                        21:54:42   Log-Likelihood:                -5435.4
converged:                       True   LL-Null:                       -6994.9
Covariance Type:            nonrobust   LLR p-value:                     0.000
                         coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                 -2.0926      0.175    -11.940      0.000      -2.436      -1.749
DemAf

In [106]:
# Add predicted probabilities for the test data (used for the confusion matrix).
# Predict from the training columns only: once "Predicted_Prob" has been added,
# X_test has one more column than the fitted model expects, so passing the
# whole frame on a re-run of this cell would fail with a shape mismatch.
X_test["Predicted_Prob"] = result.predict(X_test[X_train.columns])
X_test.head()

Unnamed: 0,const,DemAffl,DemAge,PromSpend,DemGender_F,DemGender_M,PromClass_Gold,PromClass_Platinum,PromClass_Silver,Predicted_Prob
10588,1.0,6.0,34.0,6700.0,1,0,1,0,0,0.378802
15281,1.0,6.0,59.0,1000.0,0,1,0,0,1,0.055534
8921,1.0,11.0,55.0,6000.0,1,0,1,0,0,0.402603
20175,1.0,10.0,76.0,9000.0,1,0,1,0,0,0.146069
14592,1.0,10.0,40.0,0.01,0,0,0,0,0,0.143424


In [108]:
# classify as 1 when the predicted probability is at least 0.5, otherwise 0
predictions = X_test["Predicted_Prob"].ge(0.5).astype(int)

In [110]:
# display the 0/1 class predictions for the test rows
predictions

10588    0
15281    0
8921     0
20175    0
14592    0
        ..
20050    0
21925    0
10631    0
16430    0
9579     0
Name: Predicted_Prob, Length: 5182, dtype: int32

In [112]:
# import metrics from sklearn
from sklearn import metrics

# compute confusion matrix from the true test labels and the 0/1 predictions
# (sklearn convention: rows are actual classes, columns are predicted classes)
conf_matrix = metrics.confusion_matrix(y_test, predictions)

# display the confusion matrix (last expression of the cell)
conf_matrix

array([[3601,  216],
       [ 796,  569]], dtype=int64)

In [115]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Build the confusion matrix from the test labels and predictions instead of
# typing in numbers copied from an earlier output, so the metrics below cannot
# go stale when the data, the split seed or the 0.5 threshold changes.
# labels=[0, 1] guarantees a 2x2 matrix even if one class is never predicted.
cm = confusion_matrix(y_test, predictions, labels=[0, 1])

# For binary labels ordered [0, 1] the flattened matrix is TN, FP, FN, TP
TN, FP, FN, TP = cm.ravel()

# Use sklearn's scorers rather than hand-written ratios: they work from the
# same labels/predictions and return 0 with a warning, instead of raising
# ZeroDivisionError, when a denominator is zero.
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.8047086067155539
Precision: 0.7248407643312101
Recall: 0.41684981684981687
F1 Score: 0.5293023255813953
