In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt

df = pd.read_csv('dfc.csv')
negative_reviews = df[df['review_emotion'] == 0].copy()
negative_reviews['review_theme'] = negative_reviews['review_theme'].astype(str).str.strip()
negative_reviews['target_is_tariff'] = negative_reviews['review_theme'].str.lower().str.contains('тарифы и условия').astype(int)
top_lines = ['депозитные продукты', 'кредитные карты', 'инвестиции']
negative_reviews['business_line_grouped'] = negative_reviews['business_line'].apply(
    lambda x: x if x in top_lines else 'Other'
)
top_sources = ['banki.ru', 'пульс']
negative_reviews['source_grouped'] = negative_reviews['review_source'].apply(
    lambda x: x if x in top_sources else 'Other'
)
negative_reviews['gender_cd'] = negative_reviews['gender_cd'].fillna('Unknown')
negative_reviews['is_male'] = (negative_reviews['gender_cd'] == 'M').astype(int)
negative_reviews['is_profitable'] = negative_reviews['is_profitable'].fillna(False).astype(int)
negative_reviews['solution_flg'] = negative_reviews['solution_flg'].fillna('Unknown')
negative_reviews['influencer_flg'] = negative_reviews['influencer_flg'].fillna(0).astype(int)
negative_reviews['new_flg'] = negative_reviews['new_flg'].fillna(0).astype(int)
cat_features = ['business_line_grouped', 'source_grouped', 'solution_flg']
num_features = ['is_male', 'influencer_flg', 'new_flg']

model_df = negative_reviews[['target_is_tariff'] + cat_features + num_features].copy()
model_df = pd.get_dummies(model_df, columns=cat_features, drop_first=True)

y = model_df['target_is_tariff']
X = model_df.drop('target_is_tariff', axis=1)
X = sm.add_constant(X)
logit_model = sm.Logit(y, X.astype(float)).fit(disp=0, method='bfgs')

print(logit_model.summary())

y_pred_probs = logit_model.predict(X.astype(float))
auc_score = roc_auc_score(y, y_pred_probs)

print("\n" + "="*30)
print(f"ROC AUC Score: {auc_score:.4f}")
print("="*30)

                           Logit Regression Results                           
Dep. Variable:       target_is_tariff   No. Observations:                18952
Model:                          Logit   Df Residuals:                    18942
Method:                           MLE   Df Model:                            9
Date:                Sun, 14 Dec 2025   Pseudo R-squ.:                 0.08116
Time:                        20:14:38   Log-Likelihood:                -6662.2
converged:                      False   LL-Null:                       -7250.7
Covariance Type:            nonrobust   LLR p-value:                1.119e-247
                                                coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------------
const                                        -0.9369      0.060    -15.584      0.000      -1.055      -0.819
is_male                               

