In [8]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt

# Load the reviews and keep only the rows with review_emotion == 0
# (the subset this notebook calls "negative reviews").
df = pd.read_csv('dfc.csv')
negative_reviews = df.loc[df['review_emotion'].eq(0)].copy()

# Target: 1 when the (stripped, lower-cased) theme text mentions the tariff theme.
negative_reviews['review_theme'] = negative_reviews['review_theme'].astype(str).str.strip()
theme_lower = negative_reviews['review_theme'].str.lower()
negative_reviews['target_is_tariff'] = theme_lower.str.contains('тарифы и условия').astype(int)

# Collapse every business line / source outside the short lists into 'Other'.
top_lines = ['депозитные продукты', 'кредитные карты', 'инвестиции']
top_sources = ['banki.ru', 'пульс']
negative_reviews['business_line_grouped'] = negative_reviews['business_line'].where(
    negative_reviews['business_line'].isin(top_lines), 'Other'
)
negative_reviews['source_grouped'] = negative_reviews['review_source'].where(
    negative_reviews['review_source'].isin(top_sources), 'Other'
)

# Fill missing values; missing gender counts as "not male".
negative_reviews['gender_cd'] = negative_reviews['gender_cd'].fillna('Unknown')
negative_reviews['is_male'] = negative_reviews['gender_cd'].eq('M').astype(int)
negative_reviews['is_profitable'] = negative_reviews['is_profitable'].fillna(False).astype(int)
negative_reviews['solution_flg'] = negative_reviews['solution_flg'].fillna('Unknown')
for flag_col in ('influencer_flg', 'new_flg'):
    # Missing flags are treated as 0.
    negative_reviews[flag_col] = negative_reviews[flag_col].fillna(0).astype(int)

# Feature lists consumed by the modelling step.
cat_features = ['business_line_grouped', 'source_grouped', 'solution_flg']
num_features = ['is_male', 'influencer_flg', 'new_flg']

# Build the design matrix: target + features, categoricals one-hot encoded
# with the first level of each dropped as the reference category.
selected_columns = ['target_is_tariff', *cat_features, *num_features]
model_df = pd.get_dummies(
    negative_reviews[selected_columns].copy(),
    columns=cat_features,
    drop_first=True,
)

y = model_df['target_is_tariff']
X = sm.add_constant(model_df.drop(columns='target_is_tariff'))

# statsmodels needs a purely numeric matrix; cast once and reuse it.
design_matrix = X.astype(float)
logit_model = sm.Logit(y, design_matrix).fit(method='newton', maxiter=100, disp=0)

print(logit_model.summary())

# AUC is computed on the same rows the model was fitted on (in-sample).
y_pred_probs = logit_model.predict(design_matrix)
auc_score = roc_auc_score(y, y_pred_probs)

separator = "=" * 30
print("\n" + separator)
print(f"ROC AUC Score: {auc_score:.4f}")
print(separator)

                           Logit Regression Results                           
Dep. Variable:       target_is_tariff   No. Observations:                18952
Model:                          Logit   Df Residuals:                    18942
Method:                           MLE   Df Model:                            9
Date:                Sun, 14 Dec 2025   Pseudo R-squ.:                 0.08142
Time:                        20:33:45   Log-Likelihood:                -6660.4
converged:                       True   LL-Null:                       -7250.7
Covariance Type:            nonrobust   LLR p-value:                1.811e-248
                                                coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------------
const                                        -0.9723      0.060    -16.122      0.000      -1.090      -0.854
is_male                               

In [4]:

# Single-feature baseline: does influencer_flg alone predict a tariff complaint?
df = pd.read_csv('dfc.csv')
neg_df = df.loc[df['review_emotion'].eq(0)].copy()

# Target: 1 when the lower-cased theme text mentions the tariff theme.
theme_text = neg_df['review_theme'].astype(str).str.lower()
neg_df['target_is_tariff'] = theme_text.str.contains('тарифы и условия').astype(int)

# Missing influencer flags are treated as 0.
neg_df['influencer_flg'] = neg_df['influencer_flg'].fillna(0).astype(int)

y = neg_df['target_is_tariff']
X = sm.add_constant(neg_df[['influencer_flg']])

# Cast once and reuse for both fitting and prediction.
exog = X.astype(float)
logit_model = sm.Logit(y, exog).fit(disp=0)
print(logit_model.summary())

# In-sample AUC of the fitted model.
auc = roc_auc_score(y, logit_model.predict(exog))
print(f"ROC AUC: {auc:.4f}")

                           Logit Regression Results                           
Dep. Variable:       target_is_tariff   No. Observations:                18952
Model:                          Logit   Df Residuals:                    18950
Method:                           MLE   Df Model:                            1
Date:                Sun, 14 Dec 2025   Pseudo R-squ.:                0.005477
Time:                        20:24:21   Log-Likelihood:                -7211.0
converged:                       True   LL-Null:                       -7250.7
Covariance Type:            nonrobust   LLR p-value:                 5.025e-19
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
const             -2.0494      0.027    -75.917      0.000      -2.102      -1.996
influencer_flg     0.4138      0.046      9.050      0.000       0.324       0.503
ROC AUC: 0.5446


In [10]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.metrics import roc_auc_score

# 1. Load and prepare the data (same preparation as the base model cell)
df = pd.read_csv('dfc.csv')
negative_reviews = df.loc[df['review_emotion'].eq(0)].copy()
negative_reviews['review_theme'] = negative_reviews['review_theme'].astype(str).str.strip()
theme_lower = negative_reviews['review_theme'].str.lower()
negative_reviews['target_is_tariff'] = theme_lower.str.contains('тарифы и условия').astype(int)

# Groupings: anything outside the short lists becomes 'Other'
top_lines = ['депозитные продукты', 'кредитные карты', 'инвестиции']
top_sources = ['banki.ru', 'пульс']
negative_reviews['business_line_grouped'] = negative_reviews['business_line'].where(
    negative_reviews['business_line'].isin(top_lines), 'Other'
)
negative_reviews['source_grouped'] = negative_reviews['review_source'].where(
    negative_reviews['review_source'].isin(top_sources), 'Other'
)

# Fill missing values
negative_reviews['gender_cd'] = negative_reviews['gender_cd'].fillna('Unknown')
negative_reviews['is_male'] = negative_reviews['gender_cd'].eq('M').astype(int)
negative_reviews['is_profitable'] = negative_reviews['is_profitable'].fillna(False).astype(int)
negative_reviews['solution_flg'] = negative_reviews['solution_flg'].fillna('Unknown')
for flag_col in ('influencer_flg', 'new_flg'):
    # Missing flags are treated as 0.
    negative_reviews[flag_col] = negative_reviews[flag_col].fillna(0).astype(int)

# Extra: prepare the fields used for slicing, so rows with missing values
# still land in a named segment instead of being lost when iterating.
for segment_col in ('age_segment', 'segment_name'):
    negative_reviews[segment_col] = negative_reviews[segment_col].fillna('Unknown')
negative_reviews['citizenship_group'] = (
    negative_reviews['citizenship_country'].eq('РФ').map({True: 'RF', False: 'Other'})
)

# 2. Функция для обучения вашей модели на подвыборке
def run_model_on_subset(subset_df, subset_name):
    """Fit the tariff-complaint logit model on one slice and summarise it.

    The model regresses ``target_is_tariff`` on the one-hot encoded columns
    ``business_line_grouped``, ``source_grouped`` and ``solution_flg`` plus the
    numeric columns ``is_male``, ``influencer_flg`` and ``new_flg``. Features
    that are constant inside the slice are dropped before fitting.

    Parameters
    ----------
    subset_df : pandas.DataFrame
        Rows to fit on; must contain the target and the six feature columns.
    subset_name : str
        Label stored under the ``'Segment'`` key of the result.

    Returns
    -------
    dict
        Always contains ``'Segment'``, ``'N_rows'``, ``'AUC'``,
        ``'Influencer_OddsRatio'`` and ``'P-value'``. When the slice cannot be
        fitted, the three metrics are NaN and an extra ``'Error'`` key holds
        the reason, so successful and failed rows share one schema.
    """
    cat_features = ['business_line_grouped', 'source_grouped', 'solution_flg']
    num_features = ['is_male', 'influencer_flg', 'new_flg']

    # Start from a "failed" record; metrics are filled in only after a successful fit.
    result = {
        'Segment': subset_name,
        'N_rows': len(subset_df),
        'AUC': np.nan,
        'Influencer_OddsRatio': np.nan,  # odds multiplier for influencer_flg = 1 vs 0
        'P-value': np.nan,
    }

    # Guard: with no rows there is no first row to compare against below.
    if len(subset_df) == 0:
        result['Error'] = 'empty subset'
        return result

    # Guard: a logit (and ROC AUC) needs both classes of the target.
    if subset_df['target_is_tariff'].nunique() < 2:
        result['Error'] = 'target has a single class in this subset'
        return result

    # A numeric feature that became constant in the slice (e.g. all women) is dropped.
    valid_num_features = [f for f in num_features if subset_df[f].nunique() > 1]

    # Categories absent from the slice simply produce fewer dummy columns.
    model_cols = ['target_is_tariff'] + cat_features + valid_num_features
    model_df = pd.get_dummies(subset_df[model_cols], columns=cat_features, drop_first=True)

    y = model_df['target_is_tariff']
    X = model_df.drop('target_is_tariff', axis=1)

    # Drop constant dummy columns (protection against a singular matrix).
    X = X.loc[:, (X != X.iloc[0]).any()]
    # Cast once; the same matrix is used for fitting and for prediction.
    X = sm.add_constant(X).astype(float)

    try:
        model = sm.Logit(y, X).fit(method='newton', maxiter=100, disp=0)
        auc = roc_auc_score(y, model.predict(X))
    except Exception as e:
        # Deliberately best-effort: one failing segment must not abort the
        # whole sweep, but the reason is kept in the record.
        result['Error'] = str(e)
        return result

    result['AUC'] = auc
    # Report the influencer effect only if the feature survived the filters.
    if 'influencer_flg' in model.params:
        result['Influencer_OddsRatio'] = np.exp(model.params['influencer_flg'])
        result['P-value'] = model.pvalues['influencer_flg']
    return result

# 3. Run the check across different slices
# Segments with this many rows or fewer are skipped; the same threshold is
# applied to every breakdown so the slices are treated consistently.
MIN_SEGMENT_ROWS = 100
REPORT_COLUMNS = ['N_rows', 'AUC', 'Influencer_OddsRatio', 'P-value']

results = []

# Global model (baseline)
results.append(run_model_on_subset(negative_reviews, 'GLOBAL (All)'))

# By age segment, then by citizenship group
for column, label in (('age_segment', 'Age'), ('citizenship_group', 'Citizen')):
    for seg in negative_reviews[column].unique():
        subset = negative_reviews[negative_reviews[column] == seg]
        if len(subset) > MIN_SEGMENT_ROWS:
            results.append(run_model_on_subset(subset, f'{label}: {seg}'))

res_df = pd.DataFrame(results).set_index('Segment')
# reindex (rather than [[...]]) keeps the report printable even when every
# fit failed and a metric column is therefore missing.
print(res_df.reindex(columns=REPORT_COLUMNS).round(4))

# Surface failed fits instead of silently leaving them as NaN rows.
if 'Error' in res_df.columns:
    failed = res_df['Error'].dropna()
    if not failed.empty:
        print("\nSegments where the model could not be fitted:")
        print(failed.to_string())



                   N_rows     AUC  Influencer_OddsRatio  P-value
Segment                                                         
GLOBAL (All)        18952  0.7121                1.7869   0.0000
Age: Молодежь        4199  0.7826                1.7818   0.0000
Age: Взрослые       10183  0.6822                1.7554   0.0000
Age: Не определен     871  0.7088                1.4087   0.1576
Age: Сеньоры         2303  0.6903                2.0159   0.0000
Age: Подростки       1123  0.8478                3.1945   0.0013
Age: Unknown          262  0.7661                2.3561   0.2564
Citizen: RF         18509  0.7075                1.7486   0.0000
Citizen: Other        443  0.8931                5.1721   0.0001




In [14]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.metrics import roc_auc_score
# Load the reviews and keep only the rows with review_emotion == 0
# (the subset this notebook calls "negative reviews").
df = pd.read_csv('dfc.csv')
negative_reviews = df[df['review_emotion'] == 0].copy()

# Target: 1 when the (stripped, lower-cased) theme text mentions the tariff theme.
negative_reviews['review_theme'] = negative_reviews['review_theme'].astype(str).str.strip()
negative_reviews['target_is_tariff'] = negative_reviews['review_theme'].str.lower().str.contains('тарифы и условия').astype(int)

# Collapse every business line / source outside the short lists into 'Other'.
top_lines = ['депозитные продукты', 'кредитные карты', 'инвестиции']
negative_reviews['business_line_grouped'] = negative_reviews['business_line'].apply(
    lambda x: x if x in top_lines else 'Other'
)
top_sources = ['banki.ru', 'пульс', 'otzovik', 'sravni.ru']
negative_reviews['source_grouped'] = negative_reviews['review_source'].apply(
    lambda x: x if x in top_sources else 'Other'
)

# Missing gender counts as "not male"; missing flags are treated as 0.
negative_reviews['is_male'] = (negative_reviews['gender_cd'].fillna('Unknown') == 'M').astype(int)
negative_reviews['influencer_flg'] = negative_reviews['influencer_flg'].fillna(0).astype(int)
negative_reviews['new_flg'] = negative_reviews['new_flg'].fillna(0).astype(int)
negative_reviews['solution_flg'] = negative_reviews['solution_flg'].fillna('Unknown')

# Compare the mark numerically: read_csv may parse review_mark as int, float
# or str, and a string comparison against '2' would silently miss 2.0 ('2.0').
# Unparseable or missing marks become NaN and therefore count as 0.
negative_reviews['is_mark_2'] = (
    pd.to_numeric(negative_reviews['review_mark'], errors='coerce').eq(2).astype(int)
)

# Feature lists consumed by the modelling step.
cat_features = ['business_line_grouped', 'source_grouped', 'solution_flg']
num_features = ['is_male', 'influencer_flg', 'new_flg', 'is_mark_2']

# Build the design matrix: target + features, categoricals one-hot encoded
# with the first level of each dropped as the reference category.
selected_columns = ['target_is_tariff', *cat_features, *num_features]
model_df = pd.get_dummies(
    negative_reviews[selected_columns].copy(),
    columns=cat_features,
    drop_first=True,
)

y = model_df['target_is_tariff']
X = sm.add_constant(model_df.drop(columns='target_is_tariff'))

# statsmodels needs a purely numeric matrix; cast once and reuse it.
design_matrix = X.astype(float)
logit_model = sm.Logit(y, design_matrix).fit(method='newton', maxiter=100, disp=0)

print(logit_model.summary())

# In-sample AUC of the fitted model.
auc = roc_auc_score(y, logit_model.predict(design_matrix))
print(f"\nROC AUC Score: {auc:.4f}")

                           Logit Regression Results                           
Dep. Variable:       target_is_tariff   No. Observations:                18952
Model:                          Logit   Df Residuals:                    18939
Method:                           MLE   Df Model:                           12
Date:                Sun, 14 Dec 2025   Pseudo R-squ.:                 0.09660
Time:                        21:07:06   Log-Likelihood:                -6550.2
converged:                       True   LL-Null:                       -7250.7
Covariance Type:            nonrobust   LLR p-value:                8.927e-293
                                                coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------------
const                                        -0.5816      0.065     -8.921      0.000      -0.709      -0.454
is_male                               