# 此代码实现基于Data数据的逻辑回归

In [15]:
import pandas as pd
data = pd.read_excel("data.xlsx")

In [95]:
from statsmodels.stats.outliers_influence import variance_inflation_factor


In [17]:
df = data.copy()
# Converting all columns except 'bid', 'pid', 'solverid', and 'description' to numeric
columns_to_convert = df.columns.difference(['bid', 'pid', 'solverid', 'description'])

# Applying conversion to numeric, with errors='coerce' to handle non-numeric values
df[columns_to_convert] = df[columns_to_convert].apply(pd.to_numeric, errors='coerce')

# Checking for any non-numeric values that were converted to NaN
null_values = df[columns_to_convert].isnull().sum()


In [23]:
# Specify the columns to drop
columns_to_drop = ['bid', 'pid', 'solverid', 'description', 'price', 'seekerid', 'time_diff']

# Drop the specified columns
df = df.drop(columns=columns_to_drop)



In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62129 entries, 0 to 62128
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   DDelivery                   62129 non-null  int64  
 1   competition                 62129 non-null  int64  
 2   std_award_status            62129 non-null  int64  
 3   std_price                   62129 non-null  float64
 4   word                        62129 non-null  int64  
 5   confidence_prediction       62129 non-null  int64  
 6   skill_prediction            62129 non-null  int64  
 7   standardization_prediction  62129 non-null  int64  
 8   experience_prediction       62129 non-null  int64  
 9   politeness_prediction       62129 non-null  int64  
 10  customization_prediction    62129 non-null  int64  
 11  time_diff_dummy             62129 non-null  int64  
dtypes: float64(1), int64(11)
memory usage: 5.7 MB


In [29]:
df.sample(4)

Unnamed: 0,DDelivery,competition,std_award_status,std_price,word,confidence_prediction,skill_prediction,standardization_prediction,experience_prediction,politeness_prediction,customization_prediction,time_diff_dummy
57639,7,87,0,15000.0,120,1,1,1,1,1,1,0
43426,40,67,0,1500.0,24,1,0,1,0,0,0,0
31306,7,20,0,1125.0,92,0,1,1,1,0,0,1
33541,7,42,0,15.44175,163,1,1,1,1,1,0,1


In [39]:
df.to_excel('data_cleaned.xlsx', index=False)


# 主模型

In [36]:
import statsmodels.api as sm# 定义自变量和因变量
X = df[['confidence_prediction', 'skill_prediction', 'experience_prediction', 'politeness_prediction', 
        'customization_prediction', 'standardization_prediction', 'std_price', 'DDelivery', 'word', 'time_diff_dummy']]
y = df['std_award_status']

# 添加常数项
X = sm.add_constant(X)

# 逻辑回归模型
model = sm.Logit(y, X)
result = model.fit()

# 展示模型信息
result.summary()

Optimization terminated successfully.
         Current function value: 0.051533
         Iterations 10


0,1,2,3
Dep. Variable:,std_award_status,No. Observations:,62129.0
Model:,Logit,Df Residuals:,62118.0
Method:,MLE,Df Model:,10.0
Date:,"Tue, 24 Sep 2024",Pseudo R-squ.:,0.03043
Time:,13:39:24,Log-Likelihood:,-3201.7
converged:,True,LL-Null:,-3302.2
Covariance Type:,nonrobust,LLR p-value:,1.009e-37

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-3.0215,0.231,-13.085,0.000,-3.474,-2.569
confidence_prediction,-0.0946,0.094,-1.006,0.315,-0.279,0.090
skill_prediction,-0.0539,0.099,-0.545,0.586,-0.248,0.140
experience_prediction,-0.0765,0.130,-0.586,0.558,-0.332,0.179
politeness_prediction,-0.0540,0.092,-0.583,0.560,-0.235,0.127
customization_prediction,-0.3030,0.112,-2.716,0.007,-0.522,-0.084
standardization_prediction,-0.4439,0.255,-1.739,0.082,-0.944,0.056
std_price,1.128e-07,5.31e-07,0.212,0.832,-9.27e-07,1.15e-06
DDelivery,-0.0324,0.005,-6.961,0.000,-0.042,-0.023


In [71]:
import statsmodels.api as sm# 定义自变量和因变量
X = df[[ 
        'customization_prediction', 'standardization_prediction', 'std_price']]
y = df['std_award_status']

# 添加常数项
X = sm.add_constant(X)

# 逻辑回归模型
model = sm.Logit(y, X)
result = model.fit()

# 展示模型信息
result.summary()

Optimization terminated successfully.
         Current function value: 0.052699
         Iterations 9


0,1,2,3
Dep. Variable:,std_award_status,No. Observations:,62129.0
Model:,Logit,Df Residuals:,62125.0
Method:,MLE,Df Model:,3.0
Date:,"Tue, 24 Sep 2024",Pseudo R-squ.:,0.008487
Time:,15:15:20,Log-Likelihood:,-3274.1
converged:,True,LL-Null:,-3302.2
Covariance Type:,nonrobust,LLR p-value:,4.099e-12

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-3.4408,0.227,-15.140,0.000,-3.886,-2.995
customization_prediction,-0.6155,0.107,-5.768,0.000,-0.825,-0.406
standardization_prediction,-1.0907,0.231,-4.719,0.000,-1.544,-0.638
std_price,-2.59e-07,6.82e-07,-0.380,0.704,-1.6e-06,1.08e-06


In [77]:
import statsmodels.api as sm# 定义自变量和因变量
X = df[[ 
        'confidence_prediction', 'skill_prediction', 'experience_prediction', 'politeness_prediction']]
y = df['std_award_status']

# 添加常数项
X = sm.add_constant(X)

# 逻辑回归模型
model = sm.Logit(y, X)
result = model.fit()

# 展示模型信息
result.summary()


Optimization terminated successfully.
         Current function value: 0.052779
         Iterations 9


0,1,2,3
Dep. Variable:,std_award_status,No. Observations:,62129.0
Model:,Logit,Df Residuals:,62124.0
Method:,MLE,Df Model:,4.0
Date:,"Tue, 24 Sep 2024",Pseudo R-squ.:,0.006979
Time:,15:19:18,Log-Likelihood:,-3279.1
converged:,True,LL-Null:,-3302.2
Covariance Type:,nonrobust,LLR p-value:,2.358e-09

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-3.9595,0.115,-34.508,0.000,-4.184,-3.735
confidence_prediction,-0.3206,0.087,-3.665,0.000,-0.492,-0.149
skill_prediction,-0.1502,0.097,-1.554,0.120,-0.340,0.039
experience_prediction,-0.3700,0.122,-3.029,0.002,-0.609,-0.131
politeness_prediction,-0.1652,0.089,-1.860,0.063,-0.339,0.009


# 交互作用与非线性关系分析

In [41]:
df_full = df.copy()

In [97]:
# Adding interaction terms for customization with confidence and DDelivery in the full dataset
df_full['customization_confidence'] = df_full['customization_prediction'] * df_full['confidence_prediction']
df_full['customization_delivery'] = df_full['customization_prediction'] * df_full['DDelivery']

# Define the independent variables including interaction terms
X_full = df_full[['confidence_prediction', 'skill_prediction', 'experience_prediction', 'politeness_prediction', 
                  'customization_prediction', 'standardization_prediction', 'std_price', 'DDelivery', 
                  'word', 'time_diff_dummy', 'customization_confidence', 'customization_delivery']]
y_full = df_full['std_award_status']

# 计算 VIF
vif_data = pd.DataFrame()
vif_data['Variable'] = X_full.columns
vif_data['VIF'] = [variance_inflation_factor(X_full.values, i) for i in range(X_full.shape[1])]
# 显示 VIF
print(vif_data)

# Add a constant to the model
X_full = sm.add_constant(X_full)

# Fit the logistic regression model with interaction terms
model_full = sm.Logit(y_full, X_full)
result_full = model_full.fit()

# Display the summary of the interaction model
result_full.summary()


                      Variable        VIF
0        confidence_prediction   4.514880
1             skill_prediction   3.338292
2        experience_prediction  11.188935
3        politeness_prediction   3.446916
4     customization_prediction   5.538293
5   standardization_prediction  13.929405
6                    std_price   1.013236
7                    DDelivery   1.530431
8                         word   6.431459
9              time_diff_dummy   1.283947
10    customization_confidence   4.369765
11      customization_delivery   2.286825
Optimization terminated successfully.
         Current function value: 0.051447
         Iterations 10


0,1,2,3
Dep. Variable:,std_award_status,No. Observations:,62129.0
Model:,Logit,Df Residuals:,62116.0
Method:,MLE,Df Model:,12.0
Date:,"Tue, 24 Sep 2024",Pseudo R-squ.:,0.03203
Time:,15:40:54,Log-Likelihood:,-3196.4
converged:,True,LL-Null:,-3302.2
Covariance Type:,nonrobust,LLR p-value:,1.329e-38

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-3.0103,0.233,-12.941,0.000,-3.466,-2.554
confidence_prediction,0.0182,0.103,0.177,0.860,-0.183,0.220
skill_prediction,-0.0453,0.099,-0.457,0.648,-0.240,0.149
experience_prediction,-0.0869,0.130,-0.666,0.505,-0.343,0.169
politeness_prediction,-0.0543,0.093,-0.587,0.557,-0.236,0.127
customization_prediction,-0.2045,0.193,-1.059,0.290,-0.583,0.174
standardization_prediction,-0.4534,0.255,-1.777,0.076,-0.953,0.047
std_price,9.977e-08,5.4e-07,0.185,0.854,-9.59e-07,1.16e-06
DDelivery,-0.0387,0.006,-6.616,0.000,-0.050,-0.027


In [105]:
# Adding interaction terms for customization with confidence and DDelivery in the full dataset
df_full['customization_confidence'] = df_full['customization_prediction'] * df_full['confidence_prediction']
df_full['customization_delivery'] = df_full['customization_prediction'] * df_full['DDelivery']

# Define the independent variables including interaction terms
X_full = df_full[[ 'confidence_prediction',
                  'customization_prediction', 'standardization_prediction',  'customization_confidence']]
y_full = df_full['std_award_status']
# 计算 VIF
vif_data = pd.DataFrame()
vif_data['Variable'] = X_full.columns
vif_data['VIF'] = [variance_inflation_factor(X_full.values, i) for i in range(X_full.shape[1])]
# 显示 VIF
print(vif_data)
# Add a constant to the model
X_full = sm.add_constant(X_full)

# Fit the logistic regression model with interaction terms
model_full = sm.Logit(y_full, X_full)
result_full = model_full.fit()

# Display the summary of the interaction model
result_full.summary()


                     Variable       VIF
0       confidence_prediction  3.905587
1    customization_prediction  4.221570
2  standardization_prediction  3.788274
3    customization_confidence  4.338877
Optimization terminated successfully.
         Current function value: 0.052523
         Iterations 9


0,1,2,3
Dep. Variable:,std_award_status,No. Observations:,62129.0
Model:,Logit,Df Residuals:,62124.0
Method:,MLE,Df Model:,4.0
Date:,"Tue, 24 Sep 2024",Pseudo R-squ.:,0.01179
Time:,15:44:51,Log-Likelihood:,-3263.2
converged:,True,LL-Null:,-3302.2
Covariance Type:,nonrobust,LLR p-value:,4.912e-16

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-3.4688,0.228,-15.230,0.000,-3.915,-3.022
confidence_prediction,-0.2615,0.095,-2.766,0.006,-0.447,-0.076
customization_prediction,-0.3322,0.152,-2.186,0.029,-0.630,-0.034
standardization_prediction,-0.9078,0.236,-3.851,0.000,-1.370,-0.446
customization_confidence,-0.4838,0.214,-2.265,0.024,-0.902,-0.065


## BETTER RESULT

In [259]:
# Adding interaction terms for customization with confidence and DDelivery in the full dataset
df_full['customization_confidence'] = df_full['customization_prediction'] * df_full['confidence_prediction']
df_full['customization_delivery'] = df_full['customization_prediction'] * df_full['DDelivery']

# Define the independent variables including interaction terms
X_full = df_full[[ 'confidence_prediction',
                  'customization_prediction', 'standardization_prediction', 'customization_confidence' ,'DDelivery', 
                  'word', 'time_diff_dummy']]
y_full = df_full['std_award_status']
# 计算 VIF
vif_data = pd.DataFrame()
vif_data['Variable'] = X_full.columns
vif_data['VIF'] = [variance_inflation_factor(X_full.values, i) for i in range(X_full.shape[1])]
# 显示 VIF
print(vif_data)
# Add a constant to the model
X_full = sm.add_constant(X_full)

# Fit the logistic regression model with interaction terms
model_full = sm.Logit(y_full, X_full)
result_full = model_full.fit()

# Display the summary of the interaction model
result_full.summary()


                     Variable       VIF
0       confidence_prediction  4.436141
1    customization_prediction  4.391785
2  standardization_prediction  6.018752
3    customization_confidence  4.348176
4                   DDelivery  1.315623
5                        word  5.933216
6             time_diff_dummy  1.280998
Optimization terminated successfully.
         Current function value: 0.051488
         Iterations 10


0,1,2,3
Dep. Variable:,std_award_status,No. Observations:,62129.0
Model:,Logit,Df Residuals:,62121.0
Method:,MLE,Df Model:,7.0
Date:,"Tue, 24 Sep 2024",Pseudo R-squ.:,0.03127
Time:,18:11:16,Log-Likelihood:,-3198.9
converged:,True,LL-Null:,-3302.2
Covariance Type:,nonrobust,LLR p-value:,4.787999999999999e-41

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-3.0593,0.231,-13.227,0.000,-3.513,-2.606
confidence_prediction,-0.0006,0.102,-0.005,0.996,-0.200,0.199
customization_prediction,0.0066,0.155,0.042,0.966,-0.298,0.311
standardization_prediction,-0.5565,0.239,-2.331,0.020,-1.025,-0.089
customization_confidence,-0.5589,0.214,-2.613,0.009,-0.978,-0.140
DDelivery,-0.0327,0.005,-7.025,0.000,-0.042,-0.024
word,-0.0062,0.001,-5.777,0.000,-0.008,-0.004
time_diff_dummy,-0.1997,0.110,-1.820,0.069,-0.415,0.015


In [281]:
# Adding interaction terms for customization with confidence and DDelivery in the full dataset
df_full['scaled_price'] = df_full['std_price'] / (df_full['competition'] + 1)  # adding 1 to avoid division by zero

df_full['customization_confidence'] = df_full['customization_prediction'] * df_full['confidence_prediction']
df_full['customization_politeness'] = df_full['customization_prediction'] * df_full['politeness_prediction']
df_full['customization_skill'] = df_full['customization_prediction'] * df_full['skill_prediction']

df_full['customization_delivery'] = df_full['customization_prediction'] * df_full['DDelivery']
df_full['customization_price'] = df_full['customization_prediction'] * df_full['scaled_price']

#强烈多重共线df_full['standardization_price'] = df_full['standardization_prediction'] * df_full['scaled_price']

# Define the independent variables including interaction terms
X_full = df_full[['scaled_price', 'confidence_prediction','politeness_prediction',
                  'customization_prediction','skill_prediction', 
                  'customization_confidence' ,'customization_politeness','customization_skill',
                 
                  'DDelivery', 
                  'word', 'time_diff_dummy']]
y_full = df_full['std_award_status']
# 计算 VIF
vif_data = pd.DataFrame()
vif_data['Variable'] = X_full.columns
vif_data['VIF'] = [variance_inflation_factor(X_full.values, i) for i in range(X_full.shape[1])]
# 显示 VIF
print(vif_data)
# Add a constant to the model
X_full = sm.add_constant(X_full)

# Fit the logistic regression model with interaction terms
model_full = sm.Logit(y_full, X_full)
result_full = model_full.fit()

# Display the summary of the interaction model
result_full.summary()


                    Variable       VIF
0               scaled_price  1.003679
1      confidence_prediction  4.263962
2      politeness_prediction  3.229858
3   customization_prediction  8.780771
4           skill_prediction  3.556644
5   customization_confidence  4.444963
6   customization_politeness  5.126652
7        customization_skill  3.617934
8                  DDelivery  1.298834
9                       word  5.758540
10           time_diff_dummy  1.257750
Optimization terminated successfully.
         Current function value: 0.051421
         Iterations 10


0,1,2,3
Dep. Variable:,std_award_status,No. Observations:,62129.0
Model:,Logit,Df Residuals:,62117.0
Method:,MLE,Df Model:,11.0
Date:,"Tue, 24 Sep 2024",Pseudo R-squ.:,0.03253
Time:,18:30:27,Log-Likelihood:,-3194.7
converged:,True,LL-Null:,-3302.2
Covariance Type:,nonrobust,LLR p-value:,6.116e-40

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-3.4790,0.115,-30.177,0.000,-3.705,-3.253
scaled_price,4.894e-06,1.79e-06,2.733,0.006,1.38e-06,8.4e-06
confidence_prediction,0.0066,0.101,0.065,0.948,-0.192,0.205
politeness_prediction,-0.0501,0.099,-0.508,0.612,-0.244,0.143
customization_prediction,-0.0386,0.258,-0.150,0.881,-0.544,0.467
skill_prediction,-0.1957,0.100,-1.959,0.050,-0.391,0.000
customization_confidence,-0.6663,0.220,-3.033,0.002,-1.097,-0.236
customization_politeness,-0.2704,0.232,-1.166,0.243,-0.725,0.184
customization_skill,0.5124,0.226,2.263,0.024,0.069,0.956


# 拆分策略

In [221]:
# Adding interaction terms for customization with confidence and DDelivery in the full dataset
#df_full['skill_confidence'] = df_full['skill_prediction'] * df_full['confidence_prediction']
##df_full['experience_confidence'] = df_full['experience_prediction'] * df_full['confidence_prediction']
df_full['politeness_confidence'] = df_full['politeness_prediction'] * df_full['confidence_prediction']
##df_full['skill_experience'] = df_full['skill_prediction'] * df_full['experience_prediction']
df_full['skill_politeness'] = df_full['skill_prediction'] * df_full['politeness_prediction']
##df_full['experience_politeness'] = df_full['experience_prediction'] * df_full['politeness_prediction']


# Define the independent variables including interaction terms
X_full = df_full[[ 'skill_prediction', 'politeness_prediction',
                
                  'skill_politeness',
                   'std_price', 'DDelivery', 
                  'word', 'time_diff_dummy']]
y_full = df_full['std_award_status']
y_full = df_full['std_award_status']
# 计算 VIF
vif_data = pd.DataFrame()
vif_data['Variable'] = X_full.columns
vif_data['VIF'] = [variance_inflation_factor(X_full.values, i) for i in range(X_full.shape[1])]
# 显示 VIF
print(vif_data)
# Add a constant to the model
X_full = sm.add_constant(X_full)

# Fit the logistic regression model with interaction terms
model_full = sm.Logit(y_full, X_full)
result_full = model_full.fit()

# Display the summary of the interaction model
result_full.summary()


                Variable       VIF
0       skill_prediction  5.099850
1  politeness_prediction  4.222310
2       skill_politeness  4.972968
3              std_price  1.012572
4              DDelivery  1.304536
5                   word  4.421753
6        time_diff_dummy  1.262006
Optimization terminated successfully.
         Current function value: 0.051621
         Iterations 10


0,1,2,3
Dep. Variable:,std_award_status,No. Observations:,62129.0
Model:,Logit,Df Residuals:,62121.0
Method:,MLE,Df Model:,7.0
Date:,"Tue, 24 Sep 2024",Pseudo R-squ.:,0.02878
Time:,17:44:47,Log-Likelihood:,-3207.1
converged:,True,LL-Null:,-3302.2
Covariance Type:,nonrobust,LLR p-value:,1.471e-37

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-3.3611,0.125,-26.851,0.000,-3.606,-3.116
skill_prediction,-0.2464,0.148,-1.659,0.097,-0.537,0.045
politeness_prediction,-0.2376,0.133,-1.783,0.075,-0.499,0.024
skill_politeness,0.2307,0.177,1.300,0.193,-0.117,0.578
std_price,9.708e-08,5.39e-07,0.180,0.857,-9.59e-07,1.15e-06
DDelivery,-0.0333,0.005,-7.124,0.000,-0.042,-0.024
word,-0.0073,0.001,-7.170,0.000,-0.009,-0.005
time_diff_dummy,-0.2186,0.110,-1.992,0.046,-0.434,-0.004


# experience

In [135]:
# Adding interaction terms for experience and confidence, politeness and customization
df_full['experience_confidence'] = df_full['experience_prediction'] * df_full['confidence_prediction']
df_full['politeness_customization'] = df_full['politeness_prediction'] * df_full['customization_prediction']


# Adding non-linear terms for std_price and DDelivery
df_full['std_price_squared'] = df_full['std_price'] ** 2

# Scaled price: std_price / competition (normalizing by competition level)
df_full['scaled_price'] = df_full['std_price'] / (df_full['competition'] + 1)  # adding 1 to avoid division by zero

# Adding the competition variable and interaction with std_price
df_full['competition_price'] = df_full['competition'] * df_full['std_price_squared']
# Define the independent variables including these new terms
X_extended = df_full[['confidence_prediction', 'skill_prediction', 'experience_prediction', 'politeness_prediction', 
                      'customization_prediction', 'standardization_prediction', 'DDelivery', 'word', 
                      'time_diff_dummy', 'customization_confidence', 'customization_delivery', 
                      'experience_confidence', 'politeness_customization', 'competition_price', 
                      'std_price_squared',  'scaled_price']]
y_extended = df_full['std_award_status']
# 计算 VIF
vif_data = pd.DataFrame()
vif_data['Variable'] = X_extended.columns
vif_data['VIF'] = [variance_inflation_factor(X_extended.values, i) for i in range(X_extended.shape[1])]
# 显示 VIF
print(vif_data)

# Add a constant to the model
X_extended = sm.add_constant(X_extended)

# Fit the logistic regression model with extended terms
model_extended = sm.Logit(y_extended, X_extended)
result_extended = model_extended.fit()

# Display the summary of the extended model
result_extended.summary()


                      Variable        VIF
0        confidence_prediction   9.417810
1             skill_prediction   1.296864
2        experience_prediction   2.082813
3        politeness_prediction   1.441224
4     customization_prediction   6.721579
5   standardization_prediction   0.214846
6                    DDelivery   1.178858
7                         word   1.414143
8              time_diff_dummy   1.003785
9     customization_confidence   3.492793
10      customization_delivery   1.959605
11       experience_confidence  10.599707
12    politeness_customization   4.219874
13           competition_price   4.547068
14           std_price_squared   6.310091
15                scaled_price   2.394488
Optimization terminated successfully.
         Current function value: 0.051334
         Iterations 16


  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))


0,1,2,3
Dep. Variable:,std_award_status,No. Observations:,62129.0
Model:,Logit,Df Residuals:,62112.0
Method:,MLE,Df Model:,16.0
Date:,"Tue, 24 Sep 2024",Pseudo R-squ.:,0.03417
Time:,16:27:41,Log-Likelihood:,-3189.3
converged:,True,LL-Null:,-3302.2
Covariance Type:,nonrobust,LLR p-value:,4.8699999999999995e-39

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-3.0344,0.234,-12.963,0.000,-3.493,-2.576
confidence_prediction,-0.1628,0.242,-0.672,0.502,-0.638,0.312
skill_prediction,-0.0502,0.099,-0.506,0.613,-0.245,0.144
experience_prediction,-0.1554,0.152,-1.023,0.306,-0.453,0.142
politeness_prediction,-0.0033,0.101,-0.032,0.974,-0.202,0.195
customization_prediction,0.0570,0.251,0.227,0.820,-0.434,0.549
standardization_prediction,-0.4137,0.260,-1.593,0.111,-0.923,0.095
DDelivery,-0.0380,0.006,-6.488,0.000,-0.050,-0.027
word,-0.0058,0.001,-5.129,0.000,-0.008,-0.004


In [157]:
# Adding interaction terms for experience and confidence, politeness and customization
df_full['experience_confidence'] = df_full['experience_prediction'] * df_full['confidence_prediction']
df_full['politeness_customization'] = df_full['politeness_prediction'] * df_full['customization_prediction']


# Adding non-linear terms for std_price and DDelivery
df_full['std_price_squared'] = df_full['std_price'] ** 2

# Scaled price: std_price / competition (normalizing by competition level)
df_full['scaled_price'] = df_full['std_price'] / (df_full['competition'] + 1)  # adding 1 to avoid division by zero

# Adding the competition variable and interaction with std_price
df_full['competition_price'] = df_full['competition'] * df_full['scaled_price']
# Define the independent variables including these new terms
X_extended = df_full[['confidence_prediction',  'experience_prediction', 'politeness_prediction', 'customization_prediction',
                      'DDelivery', 'word', 
                      'competition',
                      'experience_confidence', 'politeness_customization', 'competition_price', 
                      'std_price_squared',  'scaled_price']]
y_extended = df_full['std_award_status']
# 计算 VIF
vif_data = pd.DataFrame()
vif_data['Variable'] = X_extended.columns
vif_data['VIF'] = [variance_inflation_factor(X_extended.values, i) for i in range(X_extended.shape[1])]
# 显示 VIF
print(vif_data)

# Add a constant to the model
X_extended = sm.add_constant(X_extended)

# Fit the logistic regression model with extended terms
model_extended = sm.Logit(y_extended, X_extended)
result_extended = model_extended.fit()

# Display the summary of the extended model
result_extended.summary()


                    Variable       VIF
0      confidence_prediction  6.960971
1      experience_prediction  0.887755
2      politeness_prediction  1.190528
3   customization_prediction  3.747774
4                  DDelivery  1.038131
5                       word  1.361179
6                competition  0.990055
7      experience_confidence  8.024843
8   politeness_customization  4.074952
9          competition_price  3.308285
10         std_price_squared  2.797323
11              scaled_price  3.302745
Optimization terminated successfully.
         Current function value: 0.049163
         Iterations 11


0,1,2,3
Dep. Variable:,std_award_status,No. Observations:,62129.0
Model:,Logit,Df Residuals:,62116.0
Method:,MLE,Df Model:,12.0
Date:,"Tue, 24 Sep 2024",Pseudo R-squ.:,0.07501
Time:,17:14:24,Log-Likelihood:,-3054.5
converged:,True,LL-Null:,-3302.2
Covariance Type:,nonrobust,LLR p-value:,2.126e-98

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-2.7160,0.137,-19.815,0.000,-2.985,-2.447
confidence_prediction,-0.2489,0.236,-1.053,0.292,-0.712,0.214
experience_prediction,-0.1864,0.136,-1.367,0.172,-0.454,0.081
politeness_prediction,0.0405,0.098,0.413,0.679,-0.152,0.233
customization_prediction,0.0948,0.194,0.488,0.626,-0.286,0.475
DDelivery,-0.0155,0.004,-3.489,0.000,-0.024,-0.007
word,-0.0045,0.001,-4.137,0.000,-0.007,-0.002
competition,-0.0347,0.002,-14.072,0.000,-0.040,-0.030
experience_confidence,0.2224,0.252,0.883,0.377,-0.271,0.716


In [50]:
# Create bins for std_price and competition
df_full['price_group'] = pd.qcut(df_full['std_price'], 3, labels=['low_price', 'medium_price', 'high_price'])
df_full['competition_group'] = pd.qcut(df_full['competition'], 2, labels=['low_competition', 'high_competition'])

# Function to run logistic regression for a given subset of data
def run_logit_for_subset(subset_df):
    # Define the independent variables
    X_subset = subset_df[['confidence_prediction', 'skill_prediction', 'experience_prediction', 'politeness_prediction', 
                          'customization_prediction', 'standardization_prediction', 'DDelivery', 'word', 
                          'time_diff_dummy', 'customization_confidence', 'customization_delivery']]
    y_subset = subset_df['std_award_status']
    
    # Add a constant to the model
    X_subset = sm.add_constant(X_subset)
    
    # Fit the logistic regression model
    model = sm.Logit(y_subset, X_subset)
    result = model.fit(disp=False)
    return result

# Split data by price_group and competition_group, then run the model for each subset
results = {}

for price_group in df_full['price_group'].unique():
    for competition_group in df_full['competition_group'].unique():
        subset_df = df_full[(df_full['price_group'] == price_group) & (df_full['competition_group'] == competition_group)]
        if not subset_df.empty:
            result = run_logit_for_subset(subset_df)
            results[(price_group, competition_group)] = result.summary()

# Display the summary for each subset analysis
results




{('medium_price',
  'low_competition'): <class 'statsmodels.iolib.summary.Summary'>
 """
                            Logit Regression Results                           
 Dep. Variable:       std_award_status   No. Observations:                12672
 Model:                          Logit   Df Residuals:                    12659
 Method:                           MLE   Df Model:                           12
 Date:                Tue, 24 Sep 2024   Pseudo R-squ.:                 0.02255
 Time:                        14:06:50   Log-Likelihood:                -826.33
 converged:                       True   LL-Null:                       -845.40
 Covariance Type:            nonrobust   LLR p-value:                 0.0001464
                                  coef    std err          z      P>|z|      [0.025      0.975]
 ----------------------------------------------------------------------------------------------
 const                         -2.8787      0.405     -7.115      0.000      -3

# focus on specific strategies in high-competition settings

In [52]:
# Filter data for high-competition settings
high_competition_df = df_full[df_full['competition_group'] == 'high_competition']

# Define the independent variables for high-competition settings
X_high_competition = high_competition_df[['confidence_prediction', 'skill_prediction', 'experience_prediction', 
                                          'politeness_prediction', 'customization_prediction', 
                                          'standardization_prediction', 'std_price', 'DDelivery', 'word', 
                                          'time_diff_dummy', 'customization_confidence', 'customization_delivery']]

# Define the dependent variable
y_high_competition = high_competition_df['std_award_status']

# Add a constant to the model
X_high_competition = sm.add_constant(X_high_competition)

# Fit the logistic regression model for high-competition data
model_high_competition = sm.Logit(y_high_competition, X_high_competition)
result_high_competition = model_high_competition.fit()

# Display the summary of the high-competition model
result_high_competition.summary()


Optimization terminated successfully.
         Current function value: 0.023670
         Iterations 12


0,1,2,3
Dep. Variable:,std_award_status,No. Observations:,30626.0
Model:,Logit,Df Residuals:,30613.0
Method:,MLE,Df Model:,12.0
Date:,"Tue, 24 Sep 2024",Pseudo R-squ.:,0.01322
Time:,14:09:23,Log-Likelihood:,-724.92
converged:,True,LL-Null:,-734.63
Covariance Type:,nonrobust,LLR p-value:,0.07881

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-4.3514,0.723,-6.018,0.000,-5.769,-2.934
confidence_prediction,0.0642,0.244,0.264,0.792,-0.413,0.542
skill_prediction,-0.0641,0.221,-0.289,0.772,-0.498,0.370
experience_prediction,0.0017,0.323,0.005,0.996,-0.631,0.634
politeness_prediction,0.0723,0.219,0.330,0.742,-0.358,0.502
customization_prediction,-0.2486,0.400,-0.622,0.534,-1.033,0.535
standardization_prediction,-0.5925,0.773,-0.766,0.444,-2.108,0.923
std_price,-2.371e-05,2.27e-05,-1.044,0.297,-6.82e-05,2.08e-05
DDelivery,-0.0052,0.007,-0.697,0.486,-0.020,0.009


# examine only customization impact

In [54]:
# Define the independent variables focusing on customization and control variables
X_customization_only = high_competition_df[['customization_prediction', 'customization_confidence', 
                                            'customization_delivery', 'std_price', 'DDelivery', 'word']]

# Define the dependent variable
y_customization_only = high_competition_df['std_award_status']

# Add a constant to the model
X_customization_only = sm.add_constant(X_customization_only)

# Fit the logistic regression model for high-competition data, focusing on customization
model_customization_only = sm.Logit(y_customization_only, X_customization_only)
result_customization_only = model_customization_only.fit()

# Display the summary of the customization-only model
result_customization_only.summary()


Optimization terminated successfully.
         Current function value: 0.023683
         Iterations 12


0,1,2,3
Dep. Variable:,std_award_status,No. Observations:,30626.0
Model:,Logit,Df Residuals:,30619.0
Method:,MLE,Df Model:,6.0
Date:,"Tue, 24 Sep 2024",Pseudo R-squ.:,0.01268
Time:,14:09:29,Log-Likelihood:,-725.31
converged:,True,LL-Null:,-734.63
Covariance Type:,nonrobust,LLR p-value:,0.004827

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-4.8614,0.231,-21.063,0.000,-5.314,-4.409
customization_prediction,-0.2694,0.367,-0.734,0.463,-0.989,0.450
customization_confidence,-0.3704,0.397,-0.932,0.351,-1.149,0.408
customization_delivery,0.0084,0.010,0.874,0.382,-0.010,0.027
std_price,-2.365e-05,2.26e-05,-1.048,0.294,-6.79e-05,2.06e-05
DDelivery,-0.0055,0.007,-0.739,0.460,-0.020,0.009
word,-0.0053,0.002,-2.459,0.014,-0.010,-0.001


# explore interactions with price

In [57]:
# Create interaction terms between customization-related variables and price
high_competition_df['customization_price'] = high_competition_df['customization_prediction'] * high_competition_df['std_price']
high_competition_df['customization_confidence_price'] = high_competition_df['customization_confidence'] * high_competition_df['std_price']

# Define the independent variables focusing on customization, price, and their interactions
X_customization_price = high_competition_df[['customization_prediction', 'customization_confidence', 
                                             'customization_delivery', 'std_price', 'DDelivery', 'word',
                                             'customization_price', 'customization_confidence_price']]

# Define the dependent variable
y_customization_price = high_competition_df['std_award_status']

# Add a constant to the model
X_customization_price = sm.add_constant(X_customization_price)

# Fit the logistic regression model with interactions between customization and price
model_customization_price = sm.Logit(y_customization_price, X_customization_price)
result_customization_price = model_customization_price.fit()

# Display the summary of the customization-price interaction model
result_customization_price.summary()


Optimization terminated successfully.
         Current function value: 0.023466
         Iterations 15


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  high_competition_df['customization_price'] = high_competition_df['customization_prediction'] * high_competition_df['std_price']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  high_competition_df['customization_confidence_price'] = high_competition_df['customization_confidence'] * high_competition_df['std_price']
  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))


0,1,2,3
Dep. Variable:,std_award_status,No. Observations:,30626.0
Model:,Logit,Df Residuals:,30617.0
Method:,MLE,Df Model:,8.0
Date:,"Tue, 24 Sep 2024",Pseudo R-squ.:,0.02172
Time:,14:11:04,Log-Likelihood:,-718.67
converged:,True,LL-Null:,-734.63
Covariance Type:,nonrobust,LLR p-value:,9.683e-05

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-4.8517,0.231,-21.032,0.000,-5.304,-4.400
customization_prediction,-0.4080,0.362,-1.128,0.259,-1.117,0.301
customization_confidence,0.3067,0.434,0.706,0.480,-0.545,1.158
customization_delivery,0.0062,0.009,0.698,0.485,-0.011,0.024
std_price,-0.0001,6.6e-05,-1.529,0.126,-0.000,2.84e-05
DDelivery,-0.0020,0.008,-0.262,0.793,-0.017,0.013
word,-0.0051,0.002,-2.354,0.019,-0.009,-0.001
customization_price,0.0001,6.63e-05,1.527,0.127,-2.87e-05,0.000
customization_confidence_price,-0.0009,0.000,-1.956,0.050,-0.002,1.83e-06


# How does price impact low competition

In [61]:
# Filter data for low-competition settings
low_competition_df = df_full[df_full['competition_group'] == 'low_competition']

# Define the independent variables for low-competition settings
X_low_competition = low_competition_df[['customization_prediction', 'customization_confidence', 
                                        'customization_delivery', 'std_price', 'DDelivery', 'word']]

# Define the dependent variable
y_low_competition = low_competition_df['std_award_status']

# Add a constant to the model
X_low_competition = sm.add_constant(X_low_competition)

# Fit the logistic regression model for low-competition data
model_low_competition = sm.Logit(y_low_competition, X_low_competition)
result_low_competition = model_low_competition.fit()

# Display the summary of the low-competition model
result_low_competition.summary()


Optimization terminated successfully.
         Current function value: 0.075866
         Iterations 10


0,1,2,3
Dep. Variable:,std_award_status,No. Observations:,31503.0
Model:,Logit,Df Residuals:,31496.0
Method:,MLE,Df Model:,6.0
Date:,"Tue, 24 Sep 2024",Pseudo R-squ.:,0.02498
Time:,14:15:02,Log-Likelihood:,-2390.0
converged:,True,LL-Null:,-2451.3
Covariance Type:,nonrobust,LLR p-value:,4.955e-24

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-3.2741,0.105,-31.113,0.000,-3.480,-3.068
customization_prediction,-0.0342,0.211,-0.163,0.871,-0.447,0.379
customization_confidence,-0.6323,0.226,-2.798,0.005,-1.075,-0.189
customization_delivery,0.0174,0.014,1.253,0.210,-0.010,0.045
std_price,-3.17e-08,5.53e-07,-0.057,0.954,-1.12e-06,1.05e-06
DDelivery,-0.0454,0.008,-5.530,0.000,-0.062,-0.029
word,-0.0052,0.001,-4.758,0.000,-0.007,-0.003


# 重新定义高竞争和低竞争

In [318]:

# Recalculate high and low competition based on the 75th percentile of the 'competition' variable
value_75_percentile = df_full['competition'].quantile(0.75)

# Reassign high and low competition groups based on the 75th percentile
df_full['competition_group_new'] = df_full['competition'].apply(lambda x: 'high_competition' if x > value_75_percentile else 'low_competition')

# Display the distribution of the new competition group classification
df_full['competition_group_new'].value_counts()


competition_group_new
low_competition     46643
high_competition    15486
Name: count, dtype: int64

In [328]:
# Filter data for the new high-competition and low-competition settings
high_competition_new_df = df_full[df_full['competition_group_new'] == 'high_competition']
low_competition_new_df = df_full[df_full['competition_group_new'] == 'low_competition']

# Define the independent variables focusing on customization and control variables for both groups
X_high_customization = high_competition_new_df[['scaled_price', 'confidence_prediction','politeness_prediction',
                  'customization_prediction','skill_prediction', 
                  'customization_confidence' ,'customization_politeness','customization_skill',
                 
                  'DDelivery', 
                  'word']]
# 计算 VIF
vif_data = pd.DataFrame()
vif_data['Variable'] = X_high_customization.columns
vif_data['VIF'] = [variance_inflation_factor(X_high_customization.values, i) for i in range(X_high_customization.shape[1])]
# 显示 VIF
print(vif_data)

X_low_customization = low_competition_new_df[['scaled_price', 'confidence_prediction','politeness_prediction',
                  'customization_prediction','skill_prediction', 
                  'customization_confidence' ,'customization_politeness','customization_skill',
                 
                  'DDelivery', 
                  'word']]
# 计算 VIF
vif_data = pd.DataFrame()
vif_data['Variable'] = X_low_customization.columns
vif_data['VIF'] = [variance_inflation_factor(X_low_customization.values, i) for i in range(X_low_customization.shape[1])]
# 显示 VIF
print(vif_data)
# Define the dependent variable
y_high_customization = high_competition_new_df['std_award_status']
y_low_customization = low_competition_new_df['std_award_status']

# Add a constant to the models
X_high_customization = sm.add_constant(X_high_customization)
X_low_customization = sm.add_constant(X_low_customization)

# Fit logistic regression models for high and low competition data
model_high_customization = sm.Logit(y_high_customization, X_high_customization)
result_high_customization = model_high_customization.fit()

model_low_customization = sm.Logit(y_low_customization, X_low_customization)
result_low_customization = model_low_customization.fit()

# Display the summaries of both models
high_competition_summary = result_high_customization.summary()
low_competition_summary = result_low_customization.summary()

high_competition_summary, low_competition_summary


                   Variable       VIF
0              scaled_price  1.057927
1     confidence_prediction  5.026022
2     politeness_prediction  3.631852
3  customization_prediction  8.898296
4          skill_prediction  4.150018
5  customization_confidence  5.002559
6  customization_politeness  5.312424
7       customization_skill  4.053584
8                 DDelivery  1.223460
9                      word  6.297591
                   Variable       VIF
0              scaled_price  1.006479
1     confidence_prediction  4.052643
2     politeness_prediction  3.087660
3  customization_prediction  8.835631
4          skill_prediction  3.407412
5  customization_confidence  4.288558
6  customization_politeness  5.111637
7       customization_skill  3.480577
8                 DDelivery  1.636293
9                      word  5.622995
Optimization terminated successfully.
         Current function value: 0.016926
         Iterations 11
Optimization terminated successfully.
         Current functi

(<class 'statsmodels.iolib.summary.Summary'>
 """
                            Logit Regression Results                           
 Dep. Variable:       std_award_status   No. Observations:                15486
 Model:                          Logit   Df Residuals:                    15475
 Method:                           MLE   Df Model:                           10
 Date:                Tue, 24 Sep 2024   Pseudo R-squ.:                 0.01582
 Time:                        20:13:34   Log-Likelihood:                -262.12
 converged:                       True   LL-Null:                       -266.34
 Covariance Type:            nonrobust   LLR p-value:                    0.5870
                                coef    std err          z      P>|z|      [0.025      0.975]
 --------------------------------------------------------------------------------------------
 const                       -5.9939      0.618     -9.702      0.000      -7.205      -4.783
 scaled_price             -4

# 之前的主模型

## 放入控制变量

In [None]:
import statsmodels.api as sm

# Selecting the variables for the regression analysis
dependent_variable = df['std_award_status']
independent_variables = df[['std_price', 'competition', 'confidence_prediction', 'skill_prediction',
                            'standardization_prediction', 'experience_prediction', 
                            'politeness_prediction', 'customization_prediction', 
                            'word', 'time_diff', 'DDelivery']]

# Adding a constant to the independent variables (for the intercept in the regression model)
independent_variables = sm.add_constant(independent_variables)

# Fitting the regression model
model = sm.OLS(dependent_variable, independent_variables).fit()

# Displaying the summary of the regression analysis
regression_summary = model.summary()
regression_summary


## 不放入控制变量

In [None]:
# Selecting the variables for the regression analysis
dependent_variable = df['std_award_status']
independent_variables = df[['std_price', 'competition', 'confidence_prediction', 'skill_prediction',
                            'standardization_prediction', 'experience_prediction', 
                            'politeness_prediction', 'customization_prediction']]

# Adding a constant to the independent variables (for the intercept in the regression model)
independent_variables = sm.add_constant(independent_variables)

# Fitting the regression model
model = sm.OLS(dependent_variable, independent_variables).fit()

# Displaying the summary of the regression analysis
regression_summary = model.summary()
regression_summary

## 创建高竞争、低竞争

In [None]:
# Defining competition_median as the median of the competition column, which represents the middle value of the distribution
competition_median = df['competition'].mean()
value_75_percentile = df['competition'].quantile(0.75)
value_25_percentile = df['competition'].quantile(0.25)

# Re-create the high_competition variable using this definition
df['high_competition'] = df['competition'].apply(lambda x: 1 if x > value_75_percentile else 0)
df['low_competition'] = df['competition'].apply(lambda x: 1 if x < value_25_percentile else 0)





In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Re-run the regression model with the updated high_competition variable
independent_variables = df[['std_price', 'confidence_prediction', 'skill_prediction',
                            'experience_prediction', 
                            'politeness_prediction', 'customization_prediction', 
                            'word', 'time_diff', 'DDelivery', 'high_competition']]
# 计算 VIF
vif_data = pd.DataFrame()
vif_data['Variable'] = independent_variables.columns
vif_data['VIF'] = [variance_inflation_factor(independent_variables.values, i) for i in range(independent_variables.shape[1])]

# 显示 VIF
print(vif_data)
# Adding a constant for the intercept
independent_variables = sm.add_constant(independent_variables)

# Fitting the new regression model
model_with_updated_competition = sm.OLS(dependent_variable, independent_variables).fit()

# Displaying the summary of the regression analysis
model_with_updated_competition_summary = model_with_updated_competition.summary()
model_with_updated_competition_summary



In [None]:
# Filtering the data for high competition projects only (high_competition = 1)
high_competition_data = df[df['high_competition'] == 1]

# Selecting the independent variables for this subset
independent_variables_high_competition = high_competition_data[['std_price', 'skill_prediction',
                                                               
                                                                'customization_prediction', 
                                                                ]]
# 计算 VIF
vif_data = pd.DataFrame()
vif_data['Variable'] = independent_variables_high_competition.columns
vif_data['VIF'] = [variance_inflation_factor(independent_variables_high_competition.values, i) for i in range(independent_variables_high_competition.shape[1])]

# 显示 VIF
print(vif_data)

# Adding a constant for the intercept
independent_variables_high_competition = sm.add_constant(independent_variables_high_competition)

# Defining the dependent variable for this subset
dependent_variable_high_competition = high_competition_data['std_award_status']

# Fitting the regression model for high competition data only
model_high_competition_subset = sm.OLS(dependent_variable_high_competition, independent_variables_high_competition).fit()

# Displaying the summary of the regression analysis
model_high_competition_subset_summary = model_high_competition_subset.summary()
model_high_competition_subset_summary


In [None]:
# Filtering the data for high competition projects only (high_competition = 1)
high_competition_data = df[df['high_competition'] == 1]

# Selecting the independent variables for this subset
independent_variables_high_competition = high_competition_data[['std_price',
                                                                'standardization_prediction',  'customization_prediction', 
                                                                'word', 'time_diff', 'DDelivery']]

# Adding a constant for the intercept
independent_variables_high_competition = sm.add_constant(independent_variables_high_competition)

# Defining the dependent variable for this subset
dependent_variable_high_competition = high_competition_data['std_award_status']

# Fitting the regression model for high competition data only
model_high_competition_subset = sm.OLS(dependent_variable_high_competition, independent_variables_high_competition).fit()

# Displaying the summary of the regression analysis
model_high_competition_subset_summary = model_high_competition_subset.summary()
model_high_competition_subset_summary


In [None]:
# Filtering the data for high competition projects only (high_competition = 1)
high_competition_data = df[df['low_competition'] == 1]

# Selecting the independent variables for this subset
independent_variables_high_competition = high_competition_data[['std_price',
                                                                'standardization_prediction',  'customization_prediction', 
                                                                'word', 'time_diff', 'DDelivery']]

# Adding a constant for the intercept
independent_variables_high_competition = sm.add_constant(independent_variables_high_competition)

# Defining the dependent variable for this subset
dependent_variable_high_competition = high_competition_data['std_award_status']

# Fitting the regression model for high competition data only
model_high_competition_subset = sm.OLS(dependent_variable_high_competition, independent_variables_high_competition).fit()

# Displaying the summary of the regression analysis
model_high_competition_subset_summary = model_high_competition_subset.summary()
model_high_competition_subset_summary
