In [3]:
import pandas as pd

# 1) Point this at your own copy of the file (for example "./DMEFExtractSummaryV01.CSV").
path = "DMEFExtractSummaryV01.CSV"

# 2) Load the CSV. The file has many columns, so low_memory=False is passed to
#    switch off pandas' chunked parsing and the mixed-dtype warning that comes with it.
df = pd.read_csv(filepath_or_buffer=path, low_memory=False)

# 3) Quick sanity check that the load worked: overall shape plus the first three rows.
print("Rows, Cols =", df.shape)
print(df.head(n=3))

Rows, Cols = (100051, 170)
   Cust_ID SCF_Code  RetF07Dollars  RetF07Trips  RetF07Lines  RetS07Dollars  \
0    22120      346              0            0            0              0   
1    24436                       0            0            0              0   
2    29278       85              0            0            0              0   

   RetS07Trips  RetS07Lines  RetF06Dollars  RetF06Trips  ...  IncCode  \
0            0            0             14            1  ...        8   
1            0            0              0            0  ...            
2            0            0             63            2  ...        9   

   HomeCode  Child0_2  Child3_5  Child6_11  Child12_16  Child17_18  Dwelling  \
0         2         N         N          N           N           N         1   
1                   N         N          N           N           N             
2         2         N         N          N           N           N         2   

   LengthRes  HomeValue  
0          8    

In [4]:
# Check that the key channel columns exist in the loaded frame.
cols_to_check = ['RetF07Dollars', 'IntF07Orders', 'CatF07Orders']
print([col for col in cols_to_check if col in df.columns])

# Stop here with an explicit message if any of them is absent: the cells that
# follow index these columns directly, so a missing one would otherwise only
# surface later as a bare KeyError far from its cause.
missing_cols = [col for col in cols_to_check if col not in df.columns]
if missing_cols:
    raise KeyError(f"Missing expected channel columns: {missing_cols}")

['RetF07Dollars', 'IntF07Orders', 'CatF07Orders']


In [5]:
# A customer is treated as a user of a channel when that channel's source
# column is greater than 0.
channel_sources = {
    'Used_Retail': 'RetF07Dollars',
    'Used_Internet': 'IntF07Orders',
    'Used_Catalog': 'CatF07Orders',
}
for flag_col, source_col in channel_sources.items():
    df[flag_col] = df[source_col].gt(0)

# How many customers are flagged for each channel (True counts as 1 in the sum).
print("Retail users:", df['Used_Retail'].sum())
print("Internet users:", df['Used_Internet'].sum())
print("Catalog users:", df['Used_Catalog'].sum())

Retail users: 8841
Internet users: 6700
Catalog users: 6150


In [6]:
# Count missing values in each of the three channel source columns.
df.loc[:, ['RetF07Dollars', 'IntF07Orders', 'CatF07Orders']].isna().sum()

RetF07Dollars    0
IntF07Orders     0
CatF07Orders     0
dtype: int64

In [7]:
# Channel_Count = number of channel flags that are True for each customer.
# Each boolean flag is cast to int and the three columns are added element-wise.
usage_flags = ['Used_Retail', 'Used_Internet', 'Used_Catalog']
df['Channel_Count'] = sum(df[flag].astype(int) for flag in usage_flags)

# Distribution of customers by number of channels used.
df['Channel_Count'].value_counts()

Channel_Count
0    78791
1    20833
2      423
3        4
Name: count, dtype: int64

In [9]:
# "Multi-channel" means the customer used two or more channels.
df['Multi_Channel'] = df['Channel_Count'].ge(2)

# Number of multi-channel vs. other customers.
df['Multi_Channel'].value_counts()

Multi_Channel
False    99624
True       427
Name: count, dtype: int64

In [10]:
# Do multi-channel customers spend more? Mean RetF07Dollars per Channel_Count.
# NOTE(review): Channel_Count includes the (RetF07Dollars > 0) flag and only
# retail dollars are averaged here, so part of this pattern is by construction
# (e.g. the 0-channel group must average 0) — confirm this is the intended
# spend measure.
df['RetF07Dollars'].groupby(df['Channel_Count']).mean()

Channel_Count
0     0.000000
1    30.965727
2    78.271868
3    94.000000
Name: RetF07Dollars, dtype: float64

In [13]:
# Columns intended for the regression; show the dtype pandas inferred for each.
cols = [
    'Channel_Count',
    'IncCode',
    'AgeCode',
    'HomeValue',
]
print(df.dtypes.loc[cols])

Channel_Count     int64
IncCode          object
AgeCode          object
HomeValue        object
dtype: object


In [14]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

cols = [
    'Channel_Count',
    'IncCode',
    'AgeCode',
    'HomeValue',
]

# Convert each column to a numeric dtype; any value that cannot be parsed
# becomes NaN instead of raising.
# NOTE(review): errors='coerce' would also turn any non-numeric category codes
# into NaN, not just blanks — verify the raw value sets before relying on the
# missing counts printed below.
for name in cols:
    df[name] = pd.to_numeric(df[name], errors='coerce')

# Resulting dtypes, then the number of NaN values per column.
print(df.dtypes.loc[cols])
print(df[cols].isna().sum())

Channel_Count      int64
IncCode          float64
AgeCode          float64
HomeValue        float64
dtype: object
Channel_Count        0
IncCode          34710
AgeCode          21977
HomeValue        16961
dtype: int64


In [16]:
# Modelling frame: the outcome plus every predictor in `cols`, keeping only
# rows where all of them are present (complete-case analysis).
model_columns = ['RetF07Dollars', *cols]
reg_df = df[model_columns].dropna()

# Outcome vector and design matrix (predictors plus a constant/intercept column).
y = reg_df['RetF07Dollars']
X = sm.add_constant(reg_df[cols])

# NOTE(review): Channel_Count includes the (RetF07Dollars > 0) flag, so the
# outcome partly determines this predictor — confirm that is intended.
# NOTE(review): IncCode / AgeCode look like category codes entered as linear
# terms — TODO confirm that treating them as numeric is appropriate.
model = sm.OLS(endog=y, exog=X).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:          RetF07Dollars   R-squared:                       0.122
Model:                            OLS   Adj. R-squared:                  0.122
Method:                 Least Squares   F-statistic:                     2265.
Date:                Fri, 20 Feb 2026   Prob (F-statistic):               0.00
Time:                        15:29:58   Log-Likelihood:            -3.1114e+05
No. Observations:               65341   AIC:                         6.223e+05
Df Residuals:                   65336   BIC:                         6.223e+05
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const            -0.2661      0.582     -0.457