In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.compose import make_column_transformer
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sn

## NOTES

1. Данные Киви только частично пересекаются с нашими (примерно две трети наших пользователей есть у Киви), т.е. когда мы будем строить прод модель - придется для части пользователей которых нет у Киви использовать только нашу модель.
2. Там где есть пересечение данных Киви с нашими желательно построить модель, которая использовала бы сильные стороны обеих, пока модель, объединяющая в себе две модели работает хуже чем отдельно модель Киви
3. __Фичи__
    - желательно доработать фичу модели телефона и добавить новые фичи которые мы можем использовать.
    - регион Делимобиль
    - адрес прописки
    - место выдачи паспорта
    - категория прав
    - фичи из AppsFlyer (откуда пришел пользователь итд)
    - брэнд банка держателя карты
    - тип карты (кредит/не кредит, MasterCard, VISA итд)
    - пробивка по базам неплательщиков
    - координаты на момент регистрации (Инна - полигоны): многие регистрируются не дома, у половины не собираются координаты
    - наличие промокода при регистрации (DDS.d_user_promo)
    - адрес регистрации
    - имя/фамилия, напр.: Солихжон и Иван
4. __Тип модели__
    - сейчас для модели используется логарифмическая регрессия, возможно, другие модели (или ансамбль моделей) покажет себя лучше.
5. __Целевая переменная__
    - на данный момент используется не самая оптимальная целевая переменная. Есть смысл собрать датасет с продовой целевой переменной, но там уже модель будет строиться на других принципах (временных окон).
    

## PREPARING DATA

In [2]:
# OPENING RAW DATAFRAME
df = pd.read_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/v3_user_default/raw_data.csv')

# CLEANING AGE AND EXP VALUES
df['age'] = np.where(((df.age < 18)|(df.age>65)),np.nan,df.age)
df['exp'] = np.where(((df.exp < 0)|(df.exp>47)),np.nan,df.exp)
df = df.dropna(axis = 'rows', subset = ['age','exp'])

### PREPARING AND ORGANIZING CATEGORICAL DATAFRAMES

In [3]:
# PREPARING DEVICES DATAFRAME
devices_lib = pd.read_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/data_lib/devices_lib.csv')

In [4]:
# PREPARING KBM DATAFRAME
df['kbm_grouped'] = np.where(df['kbm']<0.7,'0.5+',\
                              np.where(df['kbm']<0.8,'0.7+',\
                                       np.where(df['kbm']<0.9,'0.8+',\
                                                np.where(df['kbm']<1,'0.9+',\
                                                         np.where(df['kbm'] == 1, '1',\
                                                                  np.where(df['kbm']<2.3,'1.4+',\
                                                                           np.where(df['kbm']>=2.3,'2.3+','?')))))))
df = df.replace('?', np.NaN)

In [5]:
# PREPARING BIRTH PLACE REGION DATAFRAME
bp = pd.read_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/data_lib/birthplaces_lib.csv')
bp = bp[['PassportBirthPlace','country','region']]
bp = bp.replace('None', np.nan)
bp = bp.dropna(axis='rows')
bp = bp.drop_duplicates(subset = ['PassportBirthPlace'])

# BIRTH PLACES LIB
bp_clsfied = pd.read_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/data_lib/birthplaces_classified_lib.csv')

# COUNTRIES
countries = bp_clsfied.drop_duplicates(subset = 'bp_country')
countries = pd.concat([countries.iloc[1:2],countries.iloc[3:]], axis='rows')
countries = countries[['bp_country', 'bp_region_group_detailed']]

# MERGING
bp = pd.merge(bp, bp_clsfied, left_on = 'region', right_on = 'bp_region_group_detailed', how = 'left')
bp = pd.merge(bp, countries, left_on = 'country', right_on = 'bp_country', how = 'left')

bp['bp_region_group_detailed'] = np.where(pd.isnull(bp.bp_region_group_detailed_x) == True,bp.bp_region_group_detailed_y,\
                                            bp.bp_region_group_detailed_x)
bp = bp[['PassportBirthPlace', 'bp_region_group_detailed']]

In [6]:
# PREPARING MOBILE OPERATORS DATAFRAME
mob = pd.read_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/data_lib/mobile_codes_lib.csv')

In [7]:
# PREPARING LICENSE CATEGORY DATAFRAME
lcns = pd.read_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/data_lib/license_cat_lib.csv')

### ENRICHING ORIGINAL DATAFRAME WITH CLASSIFIED CATEGORICAL DATA

In [8]:
# DATA ENRICHMENT
df = pd.merge(df, devices_lib, left_on = 'device_type', right_on = 'device', how = 'left')
df = pd.merge(df, bp, left_on = 'birth_place', right_on = 'PassportBirthPlace', how = 'left')
df = pd.merge(df, mob, on = 'mobile_code', how = 'left')
df = pd.merge(df, lcns, on = 'license_category', how = 'left')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1527687 entries, 0 to 1527686
Data columns (total 30 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   for_exp                   1527687 non-null  object 
 1   id                        1527687 non-null  int64  
 2   phone                     1527687 non-null  int64  
 3   appl_date                 1527687 non-null  object 
 4   rn                        1527687 non-null  int64  
 5   qty                       1527687 non-null  int64  
 6   target_1_payment_rate     1527687 non-null  float64
 7   target_2_def_3_days       1527687 non-null  bool   
 8   target_3_def_30_days      1527687 non-null  bool   
 9   target_4_def_90_days      1527687 non-null  bool   
 10  group_name                1527687 non-null  object 
 11  device_type               1257996 non-null  object 
 12  kbm                       965415 non-null   float64
 13  mobile_code               1

In [9]:
# CALCULATING AGE OF DEVICE MODEL ON THE USER'S ACTIVATION DATE
df['appl_year'] = pd.DatetimeIndex(df.appl_date).year
df['device_age_at_appl_date']  = df.appl_year-df.device_release_year
df.device_age_at_appl_date = df.device_age_at_appl_date.astype('str')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1527687 entries, 0 to 1527686
Data columns (total 32 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   for_exp                   1527687 non-null  object 
 1   id                        1527687 non-null  int64  
 2   phone                     1527687 non-null  int64  
 3   appl_date                 1527687 non-null  object 
 4   rn                        1527687 non-null  int64  
 5   qty                       1527687 non-null  int64  
 6   target_1_payment_rate     1527687 non-null  float64
 7   target_2_def_3_days       1527687 non-null  bool   
 8   target_3_def_30_days      1527687 non-null  bool   
 9   target_4_def_90_days      1527687 non-null  bool   
 10  group_name                1527687 non-null  object 
 11  device_type               1257996 non-null  object 
 12  kbm                       965415 non-null   float64
 13  mobile_code               1

## MERGING DATASET WITH QIWI RESULTS

In [10]:
qiwi = pd.read_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/QIWI_test/deli_results_23112020.csv')
df = pd.merge(df, qiwi, left_on = 'id', right_on = 'ID', how = 'left')

## TESTING LOGISTIC REGRESSION ON REG DATA WITH QIWI SCORING

In [34]:
# LEAVING ONLY RECORDS THAT WERE USED FOR THE QIWI EXPERIMENT
df_exp = df[df.for_exp == 'for_experiment']
# df_exp = df.loc[df.ID.notnull()]

# LEAVING ONLY COLUMNS THAT'LL BE USED FOR THE MODEL
features = ['mobile_operator', 'sex', 'age', 'exp', 'bp_region_group_detailed', 'kbm_grouped', 'brand',\
            'device_age_at_appl_date', 'device_feature']
target = ['target_4_def_90_days']
group_name = ['group_name']
qiwi_cols = list(qiwi.columns)[1:]
df_exp = df_exp[features+target+group_name]

# REPLACING NA VALUES WITH 'NaN'
for feature in ['mobile_operator','sex','bp_region_group_detailed','kbm_grouped','brand','device_age_at_appl_date',\
                'device_feature']:
    df_exp[feature] = df_exp[feature].fillna('NaN')
    
# SPLITTING DATASET INTO X AND y
df_exp.target_4_def_90_days = df_exp.target_4_def_90_days*1
df_exp = df_exp.reset_index(drop=True)
X = df_exp.iloc[:,:-2]
y = df_exp.iloc[:,-2:]

# ONE-HOT ENCODING
enc = preprocessing.OneHotEncoder()
X_obj = X.loc[:, X.dtypes == object]
X_flt = X.loc[:, X.dtypes == float]
enc.fit(X_obj)
X = pd.DataFrame(enc.transform(X_obj).toarray())
X = X.join(X_flt).join(df_exp.group_name)

# SPLITTING X AND y TO TRAIN AND TEST SAMPLES
X_train = X.loc[X.group_name == 'test']
X_train = X_train.reset_index(drop = True)
X_train = X_train.iloc[:, :-1]

X_test = X.loc[X.group_name == 'control']
X_test = X_test.reset_index(drop = True)
X_test = X_test.iloc[:, :-1]

y_train = y.loc[y.group_name == 'test']
y_train = y_train.reset_index(drop = True)
y_train = y_train.iloc[:,:-1]

y_test = y.loc[y.group_name == 'control']
y_test = y_test.reset_index(drop = True)
y_test = y_test.iloc[:,:-1]

# CONVERTING X AND y DATAFRAMES TO ARRAYS
X_train = X_train.values
X_test = X_test.values
y_train = y_train.values
y_test = y_test.values

#LogisticRegression
logreg = LogisticRegression(solver='liblinear',class_weight='balanced',random_state=0,C=0.01).fit(X_train, y_train.ravel())
y_pred = logreg.predict(X_test)

y_pred_proba = logreg.predict_proba(X_test)
y_pred_proba1 = []
for i in y_pred_proba:
    y_pred_proba1.append(i[1])
    
print(classification_report(y_test,y_pred))
print(roc_auc_score(y_test, y_pred_proba1))

              precision    recall  f1-score   support

           0       0.99      0.63      0.77     87348
           1       0.05      0.72      0.10      2472

    accuracy                           0.64     89820
   macro avg       0.52      0.67      0.44     89820
weighted avg       0.96      0.64      0.75     89820

0.726376185823236


## TESTING COMBINED MODEL (OUR AND QIWI) PERFORMANCE

In [28]:
# LEAVING ONLY RECORDS THAT WERE USED FOR THE QIWI EXPERIMENT
# df_exp = df[df.for_exp == 'for_experiment']
df_exp = df.loc[df.ID.notnull()]

# LEAVING ONLY COLUMNS THAT'LL BE USED FOR THE MODEL
features = ['mobile_operator', 'sex', 'age', 'exp', 'bp_region_group_detailed', 'kbm_grouped', 'brand',\
            'device_age_at_appl_date', 'device_feature', 'pd_basis_60_v3_4']
target = ['target_4_def_90_days']
group_name = ['group_name']
qiwi_cols = list(qiwi.columns)[1:]
df_exp = df_exp[features+target+group_name]

# REPLACING NA VALUES WITH 'NaN'
for feature in ['mobile_operator','sex','bp_region_group_detailed','kbm_grouped','brand','device_age_at_appl_date',\
                'device_feature', 'pd_basis_60_v3_4']:
    df_exp[feature] = df_exp[feature].fillna('NaN')
    
# SPLITTING DATASET INTO X AND y
df_exp.target_4_def_90_days = df_exp.target_4_def_90_days*1
df_exp = df_exp.reset_index(drop=True)
X = df_exp.iloc[:,:-2]
y = df_exp.iloc[:,-2:]

# ONE-HOT ENCODING
enc = preprocessing.OneHotEncoder()
X_obj = X.loc[:, X.dtypes == object]
X_flt = X.loc[:, X.dtypes == float]
enc.fit(X_obj)
X = pd.DataFrame(enc.transform(X_obj).toarray())
X = X.join(X_flt).join(df_exp.group_name)

# SPLITTING X AND y TO TRAIN AND TEST SAMPLES
X_train = X.loc[X.group_name == 'test']
X_train = X_train.reset_index(drop = True)
X_train = X_train.iloc[:, :-1]

X_test = X.loc[X.group_name == 'control']
X_test = X_test.reset_index(drop = True)
X_test = X_test.iloc[:, :-1]

y_train = y.loc[y.group_name == 'test']
y_train = y_train.reset_index(drop = True)
y_train = y_train.iloc[:,:-1]

y_test = y.loc[y.group_name == 'control']
y_test = y_test.reset_index(drop = True)
y_test = y_test.iloc[:,:-1]

# CONVERTING X AND y DATAFRAMES TO ARRAYS
X_train = X_train.values
X_test = X_test.values
y_train = y_train.values
y_test = y_test.values

#LogisticRegression
logreg = LogisticRegression(solver='liblinear',class_weight='balanced',random_state=0,C=0.01).fit(X_train, y_train.ravel())
y_pred = logreg.predict(X_test)
# y_pred_proba = logreg.predict_proba(X_train)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      0.75      0.85     71166
           1       0.07      0.68      0.13      2061

    accuracy                           0.74     73227
   macro avg       0.53      0.71      0.49     73227
weighted avg       0.96      0.74      0.83     73227



## HYPER PARAMETERS TUNING

In [100]:
clsfr = GridSearchCV(LogisticRegression(solver = 'liblinear', class_weight = 'balanced'),{
    'C':[0.005, 0.01,],
#     'max_iter':[50,100,200],
#     'tol':[0.0001,0.00001,0.001],
#     'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}, scoring = 'roc_auc')
clsfr.fit(X_train, y_train.ravel())
clsfr.cv_results_

{'mean_fit_time': array([0.84308958, 0.94939451]),
 'std_fit_time': array([0.02270155, 0.03068124]),
 'mean_score_time': array([0.01751966, 0.01804528]),
 'std_score_time': array([0.00077105, 0.00148475]),
 'param_C': masked_array(data=[0.005, 0.01],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 0.005}, {'C': 0.01}],
 'split0_test_score': array([0.69264778, 0.6934123 ]),
 'split1_test_score': array([0.73201843, 0.73244397]),
 'split2_test_score': array([0.70175886, 0.70297515]),
 'split3_test_score': array([0.70532725, 0.70757677]),
 'split4_test_score': array([0.68102601, 0.68025865]),
 'mean_test_score': array([0.70255567, 0.70333337]),
 'std_test_score': array([0.01695915, 0.0172999 ]),
 'rank_test_score': array([2, 1])}

In [84]:
logreg.get_params()

{'C': 1.0,
 'class_weight': 'balanced',
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 0,
 'solver': 'liblinear',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

## OBSERVING FEATURE IMPORTANCE

In [36]:
coefs = pd.DataFrame(logreg.coef_)
coefs.columns = list(enc.get_feature_names())+['age','exp']

# BRINGING AGE AND EXP COEFS TO COMPARABLE FORMAT
coefs.age_std = np.std(X.age)
coefs.exp_std = np.std(X.exp)
coefs.age = coefs.age*coefs.age_std
coefs.exp = coefs.exp*coefs.exp_std

coefs = coefs.T
print(coefs.to_string())
coefs.to_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/v3_user_default/feature_importance.csv', encoding = 'utf-8-sig')

                                               0
x0_Билайн                               0.088816
x0_Др.                                  0.013519
x0_МТС                                 -0.087904
x0_Мегафон                             -0.132809
x0_Теле2                                0.214930
x1_Ж                                   -0.345773
x1_М                                    0.442324
x2_NaN                                  0.338059
x2_Азербайджан                          0.026188
x2_Алтайский край                      -0.464007
x2_Амурская область                     0.060873
x2_Армения                              0.247897
x2_Архангельская область               -0.107002
x2_Астраханская область                -0.099036
x2_Беларусь                             0.033511
x2_Белгородская область                 0.027387
x2_Брянская область                    -0.266835
x2_Владимирская область                -0.126496
x2_Волгоградская область               -0.020077
x2_Вологодская облас

## COUPLED RANDOM FOREST CLASSIFIER OF QIWI MODEL AND THE MODEL BASED ON OUR INNER DATA

In [80]:
rfc = RandomForestClassifier(n_estimators = 300, max_depth = 4, random_state=0, class_weight = 'balanced').fit(X_train, y_train)
y_pred = rfc.predict(X_test)
# y_pred_proba = logreg.predict_proba(X_train)
print(classification_report(y_test,y_pred))

  """Entry point for launching an IPython kernel.


              precision    recall  f1-score   support

           0       0.99      0.62      0.76     71166
           1       0.05      0.71      0.09      2061

    accuracy                           0.62     73227
   macro avg       0.52      0.66      0.43     73227
weighted avg       0.96      0.62      0.74     73227



In [83]:
#Grid
parameters = {'n_estimators':[100, 150, 200, 250, 300], 'max_depth':[2, 3, 4, 5], 'class_weight':['balanced']}
grid = GridSearchCV(rfc, parameters).fit(X_train, y_train)
y_pred = grid.predict(X_test)
print(classification_report(y_test,y_pred))

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)


              precision    recall  f1-score   support

           0       0.99      0.62      0.76     71166
           1       0.05      0.71      0.10      2061

    accuracy                           0.62     73227
   macro avg       0.52      0.67      0.43     73227
weighted avg       0.96      0.62      0.74     73227



## CHECKING QIWI MODEL PERFORMANCE

In [168]:
q = df.loc[df.ID.notnull()]

y_train_q_pred = q.loc[q.group_name == 'test']
y_train_q_pred = y_train_q_pred.pd_basis_60_v3_4
y_train_q_pred = y_train_q_pred.reset_index(drop=True)
y_train_q_pred = y_train_q_pred.values

y_test_q_pred = q.loc[q.group_name == 'control']
y_test_q_pred = y_test_q_pred.pd_basis_60_v3_4
y_test_q_pred = y_test_q_pred.reset_index(drop=True)
y_test_q_pred = y_test_q_pred.values

print(roc_auc_score(y_train, y_train_q_pred))
print(roc_auc_score(y_test, y_test_q_pred))

0.7844592576775913
0.7777924157694709


## CREATING SAMPLE FOR DATADEV

In [11]:
df_exp = df.loc[df.ID.notnull()]

exp_control = df_exp.loc[df_exp.group_name == 'control']
exp_control = exp_control[['target_4_def_90_days', 'ID']]
exp_test = df_exp.loc[df_exp.group_name == 'test']
exp_test = exp_test[['target_4_def_90_days', 'ID']]

control_gb = exp_control.groupby('target_4_def_90_days').count()
control_gb['%'] = control_gb['ID']/control_gb['ID'].sum()
control_gb['control_n'] = control_gb['%']*7000
control_gb['control_n_EGRN'] = control_gb['%']*700

test_gb = exp_test.groupby('target_4_def_90_days').count()
test_gb['%'] = test_gb['ID']/test_gb['ID'].sum()
test_gb['test_n'] = test_gb['%']*3000
test_gb['test_n_EGRN'] = test_gb['%']*300

n = control_gb.merge(test_gb, on = 'target_4_def_90_days', how = 'left')
n = n[['control_n', 'control_n_EGRN', 'test_n', 'test_n_EGRN']]
n

Unnamed: 0_level_0,control_n,control_n_EGRN,test_n,test_n_EGRN
target_4_def_90_days,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
False,6802.982506,680.298251,2916.626695,291.662669
True,197.017494,19.701749,83.373305,8.337331


In [None]:
sample = df_exp[['id', 'group_name', 'target_4_def_90_days']]
sample = sample.reset_index(drop = True)

control_0 = sample.loc[(sample.group_name == 'control')&(sample.target_4_def_90_days == False)]
control_1 = sample.loc[(sample.group_name == 'control')&(sample.target_4_def_90_days == True)]
test_0 = sample.loc[(sample.group_name == 'test')&(sample.target_4_def_90_days == False)]
test_1 = sample.loc[(sample.group_name == 'test')&(sample.target_4_def_90_days == True)]

sample_control_0 = control_0.sample(n=7143)
sample_control_1 = control_1.sample(n=207)
sample_test_0 = test_0.sample(n=3062)
sample_test_1 = test_1.sample(n=88)

datadev = ((sample_control_0.append(sample_control_1)).append(sample_test_0)).append(sample_test_1)
datadev = datadev.reset_index(drop = True)
datadev.to_csv('C:/Users/sgulbin/Work/Analysis/Платежеспособность/datadev/datadev_sample.csv',\
               index=False, encoding = 'utf-8-sig')