In [1]:
import pandas as pd
import matplotlib.pylab as plt
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm

In [2]:
# Load the data
auctions_df = pd.read_csv('eBayAuctions.csv')

# Drop missing values
auctions_df.dropna(inplace=True)

# Convert categorical variables to dummy variables
cat_predictors = ['Category', 'currency', 'Duration', 'endDay']
X_cat = pd.get_dummies(auctions_df[cat_predictors], dtype=int)

# Combine categorical and continuous predictors
X = pd.concat([X_cat, auctions_df[['sellerRating', 'ClosePrice', 'OpenPrice']]], axis=1)
y = auctions_df['Competitive?']

# Split the data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.40, random_state=1)

In [3]:
# Logistic regression with all predictors
model = LogisticRegression(max_iter=100000)
model.fit(X_train, y_train)
y_pred = model.predict(X_valid)
accuracy = accuracy_score(y_valid, y_pred)
print("Accuracy of logistic model with all predictors/features:", accuracy)

# Regularized logistic regression
model_regularized = LogisticRegression(penalty='l1', solver='liblinear')
model_regularized.fit(X_train, y_train)
y_pred_regularized = model_regularized.predict(X_valid)
accuracy_regularized = accuracy_score(y_valid, y_pred_regularized)
print("Accuracy of regularized logistic model:", accuracy_regularized)

Accuracy of logistic model with all predictors/features: 0.761723700887199
Accuracy of regularized logistic model: 0.7566539923954373


In [4]:
model_eval = sm.Logit(y_train, sm.add_constant(X_train))
result = model_eval.fit()
print(result.summary())

         Current function value: 0.500518
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:           Competitive?   No. Observations:                 1183
Model:                          Logit   Df Residuals:                     1153
Method:                           MLE   Df Model:                           29
Date:                Wed, 28 Feb 2024   Pseudo R-squ.:                  0.2757
Time:                        04:21:37   Log-Likelihood:                -592.11
converged:                      False   LL-Null:                       -817.49
Covariance Type:            nonrobust   LLR p-value:                 3.544e-77
                                    coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
const                             0.5292        nan        nan        nan         nan         nan
Duration

