In [4]:

# Load the reviews and keep an independent copy of the negative ones
# (review_emotion == 0).
df = pd.read_csv('dfc.csv')
neg_df = df.loc[df['review_emotion'].eq(0)].copy()

# Binary target: 1 when the lower-cased review theme contains
# 'тарифы и условия' ("tariffs and conditions"), 0 otherwise.
theme_lower = neg_df['review_theme'].astype(str).str.lower()
neg_df['target_is_tariff'] = theme_lower.str.contains('тарифы и условия').astype(int)

# Missing influencer flags are treated as 0.
neg_df['influencer_flg'] = neg_df['influencer_flg'].fillna(0).astype(int)

# Design matrix: the single predictor plus an intercept column.
y = neg_df['target_is_tariff']
X = sm.add_constant(neg_df[['influencer_flg']])
exog = X.astype(float)

logit_model = sm.Logit(y, exog).fit(disp=0)
print(logit_model.summary())

# ROC AUC is computed on the same rows the model was fitted on (in-sample).
auc = roc_auc_score(y, logit_model.predict(exog))
print(f"ROC AUC: {auc:.4f}")

                           Logit Regression Results                           
Dep. Variable:       target_is_tariff   No. Observations:                18952
Model:                          Logit   Df Residuals:                    18950
Method:                           MLE   Df Model:                            1
Date:                Sun, 14 Dec 2025   Pseudo R-squ.:                0.005477
Time:                        20:24:21   Log-Likelihood:                -7211.0
converged:                       True   LL-Null:                       -7250.7
Covariance Type:            nonrobust   LLR p-value:                 5.025e-19
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
const             -2.0494      0.027    -75.917      0.000      -2.102      -1.996
influencer_flg     0.4138      0.046      9.050      0.000       0.324       0.503
ROC AUC: 0.5446


In [28]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score


# Reload the review data and take an independent copy of the negative rows
# (review_emotion == 0), so that columns added later do not touch `df`.
df = pd.read_csv('dfc.csv')
negative_reviews = df.loc[df['review_emotion'].eq(0)].copy()

# Business lines that keep their own category; every other value is grouped
# into 'Other' further down in this cell.
top_lines_99 = [
    'депозитные продукты',
    'кредитные карты',
    'инвестиции',
    'прочие продукты',
    'автокредит',
    'mobile',
    'sme',
    'нефинансовые продукты',
    'премиальные сервисы',
    'ипотека',
]
# Review sources that keep their own category (same treatment as above).
top_sources_99 = [
    'banki.ru',
    'пульс',
    'нет',
    'otzovik',
    'sravni.ru',
    'пикабу',
    'вконтакте',
    'суперапп',
    'asn',
]

# Share of negative reviews whose value appears in each list.
cov_bl = negative_reviews['business_line'].isin(top_lines_99).mean()
cov_src = negative_reviews['review_source'].isin(top_sources_99).mean()

for label, share in (("Business Line", cov_bl), ("Review Source", cov_src)):
    print(f"Покрытие {label}: {share:.2%}")

# --- Target ---------------------------------------------------------------
# Normalise the theme text, then flag reviews whose theme contains
# 'тарифы и условия' ("tariffs and conditions"). regex=False makes the match
# a literal substring test rather than a regular expression.
negative_reviews['review_theme'] = negative_reviews['review_theme'].astype(str).str.strip()
negative_reviews['target_is_tariff'] = (
    negative_reviews['review_theme']
    .str.lower()
    .str.contains('тарифы и условия', regex=False)
    .astype(int)
)

# --- Categorical grouping ---------------------------------------------------
# Values outside the whitelists (including missing values) collapse into
# 'Other'. Vectorised isin/where replaces the row-wise apply(lambda ...).
negative_reviews['business_line_grouped'] = negative_reviews['business_line'].where(
    negative_reviews['business_line'].isin(top_lines_99), 'Other'
)
negative_reviews['source_grouped'] = negative_reviews['review_source'].where(
    negative_reviews['review_source'].isin(top_sources_99), 'Other'
)

# --- Binary / flag features -------------------------------------------------
# Missing gender counts as "not male"; missing flags count as 0.
negative_reviews['is_male'] = (negative_reviews['gender_cd'].fillna('Unknown') == 'M').astype(int)
negative_reviews['influencer_flg'] = negative_reviews['influencer_flg'].fillna(0).astype(int)
negative_reviews['new_flg'] = negative_reviews['new_flg'].fillna(0).astype(int)
negative_reviews['solution_flg'] = negative_reviews['solution_flg'].fillna('Unknown')

# Compare the mark numerically instead of via astype(str) == '2': if the CSV
# column is parsed as float (e.g. because it contains NaN), the string form
# is '2.0' and the string comparison would silently produce all zeros.
# NOTE(review): the dtype of review_mark is not visible here; for an integer
# or '2'-string column the result is unchanged.
negative_reviews['is_mark_2'] = (
    pd.to_numeric(negative_reviews['review_mark'], errors='coerce').eq(2).astype(int)
)

# Predictors: categorical columns are one-hot encoded, 0/1 flags pass through.
cat_features = ['business_line_grouped', 'source_grouped', 'solution_flg']
num_features = ['is_male', 'influencer_flg', 'new_flg', 'is_mark_2']

model_columns = ['target_is_tariff', *cat_features, *num_features]
model_df = pd.get_dummies(
    negative_reviews[model_columns].copy(),
    columns=cat_features,
    drop_first=True,  # one level per categorical is dropped and acts as the reference
)

# Response and design matrix (with an intercept column).
y = model_df['target_is_tariff']
X = sm.add_constant(model_df.drop(columns='target_is_tariff'))

# Cast once: the same float matrix is used for fitting and for prediction.
exog = X.astype(float)
logit_model = sm.Logit(y, exog).fit(method='bfgs', maxiter=1000, disp=0)
print(logit_model.summary())

# In-sample probabilities and hard labels at the 0.5 cut-off.
y_prob = logit_model.predict(exog)
y_pred = (y_prob > 0.5).astype(int)

for label, metric, scores in (
    ("ROC AUC:  ", roc_auc_score, y_prob),
    ("Accuracy: ", accuracy_score, y_pred),
    ("F1 Score: ", f1_score, y_pred),
):
    print(f"{label}{metric(y, scores):.4f}")

Покрытие Business Line: 91.97%
Покрытие Review Source: 97.85%
                           Logit Regression Results                           
Dep. Variable:       target_is_tariff   No. Observations:                18952
Model:                          Logit   Df Residuals:                    18928
Method:                           MLE   Df Model:                           23
Date:                Sun, 14 Dec 2025   Pseudo R-squ.:                  0.2724
Time:                        23:14:13   Log-Likelihood:                -5275.7
converged:                       True   LL-Null:                       -7250.7
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                                  coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------------------
const                                          -2.8505      0.241    -11.810      

In [29]:
from sklearn.metrics import classification_report, precision_recall_curve

# Predicted probabilities for the rows in X.
# NOTE(review): X and y appear to be the same data the model was fitted on,
# so the tuned threshold and the reports below are in-sample (optimistic).
y_prob = logit_model.predict(X.astype(float))

print("Отчет для порога 0.5")
print(classification_report(y, (y_prob > 0.5).astype(int)))

precision, recall, thresholds = precision_recall_curve(y, y_prob)

# F1 at every point of the curve. Where precision + recall == 0 the plain
# formula yields NaN, and np.argmax would then return that NaN position;
# define F1 as 0 there instead.
pr_sum = precision + recall
f1_scores = np.divide(
    2 * recall * precision,
    pr_sum,
    out=np.zeros_like(pr_sum, dtype=float),
    where=pr_sum > 0,
)

# precision/recall have one more element than thresholds: the final point
# (precision=1, recall=0) has no threshold. Search only the positions that
# map to a threshold so thresholds[best_idx] can never go out of range.
best_idx = np.argmax(f1_scores[:-1])
best_threshold = thresholds[best_idx]

print("Оптимизация")
print(f"Лучший порог (Best Threshold): {best_threshold:.4f}")
print(f"Максимальный F1 Score: {f1_scores[best_idx]:.4f}")

# precision_recall_curve scores each threshold with `score >= threshold`, so
# the same inclusive comparison is required for the report below to reproduce
# the F1 printed above (this matters when many probabilities are tied).
y_pred_new = (y_prob >= best_threshold).astype(int)
print("Отчет для лучшего порога")
print(classification_report(y, y_pred_new))

Отчет для порога 0.5
              precision    recall  f1-score   support

           0       0.91      0.99      0.95     16526
           1       0.83      0.33      0.47      2426

    accuracy                           0.91     18952
   macro avg       0.87      0.66      0.71     18952
weighted avg       0.90      0.91      0.89     18952

Оптимизация
Лучший порог (Best Threshold): 0.2507
Максимальный F1 Score: 0.5005
Отчет для лучшего порога
              precision    recall  f1-score   support

           0       0.92      0.95      0.94     16526
           1       0.58      0.44      0.50      2426

    accuracy                           0.89     18952
   macro avg       0.75      0.70      0.72     18952
weighted avg       0.88      0.89      0.88     18952

