
In this notebook, the saved classifier will be applied to a new dataset that has been preprocessed in the same way as the training dataset.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats.mstats import winsorize
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pickle

In [2]:
# Read the two input files for this scoring run.
clients = pd.read_csv('test_20092023.csv')

# The second file keys its rows by `id_account`; rename it to `id` so that both
# frames share the join column.
st = pd.read_csv('test_20092023_stax.csv').rename(columns={"id_account": "id"})

# Default (inner) join: only ids present in both files are kept.
df = clients.merge(st, on="id")

df.head()

Unnamed: 0,uid,id,reg_platform,first_bet_date,used_promo,channel_final_name,revenue,first_bet_sum,m_win_cnt_bet,m_lose_cnt_bet,m_turnover,m_revenue,m_sum_dep,m_sum_out,m_cnt_dep,max
0,10008390000.0,213E5A35VSNXI,MobileSite,2024-04-13 19:35:28,0,direct,131777.27,2000.0,43.0,140.0,217124.41,131777.27,161959.0,30181.0,69.0,1
1,10008380000.0,216N59IXPYIC,MobileSite,2024-04-11 09:57:10,1,CPA,22545.0,3000.0,15.0,53.0,86176.5,22545.0,61305.0,38760.0,22.0,1
2,10008380000.0,217EWJG2K7T67,IOS,2024-04-10 13:53:14,1,CPM,2000.0,2000.0,0.0,1.0,2000.0,2000.0,2000.0,0.0,1.0,0
3,10008390000.0,21GLRSEQVZPPJ,MobileSite,2024-04-13 12:16:50,0,direct,4999.79,500.0,3.0,7.0,8500.0,4985.0,5000.0,0.0,1.0,1
4,10008380000.0,21IF7VL8YLIVG,IOS,2024-04-12 14:30:30,1,Отсутствует ICM,214100.0,100.0,0.0,1.0,100.0,100.0,100.0,0.0,1.0,0


In [3]:
# Rename the `max` column to `max_wins`.
df = df.rename({"max": "max_wins"}, axis="columns")

In [4]:
# Parse the first-bet timestamps into datetimes so the `.dt` accessor is available.
df['first_bet_date'] = df['first_bet_date'].pipe(pd.to_datetime)

In [5]:
# Day of the week of the first bet, encoded 0 (Monday) through 6 (Sunday).
df['first_day'] = df['first_bet_date'].dt.weekday

In [6]:
# Replace the literal substring 'Windows' with 'Site' in the registration platform.
# The pattern contains no regex metacharacters, so plain-string matching is used.
df['reg_platform'] = df['reg_platform'].str.replace('Windows', 'Site', regex=False)

In [7]:
# Collapse the raw `channel_final_name` values into a small set of channel groups.
# Each entry pairs a group label with the regex searched for anywhere in the raw
# name. Order matters: np.select assigns the FIRST group whose pattern matches.
channel_patterns = [
    ('Отсутствует ICM', "Отсутствует|нет в справочнике"),
    ('Тип ICM не указан', "не указан"),
    ('CPM', "CPM"),
    ('CPA', "CPA"),
    ('direct', "direct|Email|seo"),
    ('organic', "organic|other"),
    ('smm', "smm"),
]

# Group labels, in the same order as the conditions built below.
cfn = [label for label, _ in channel_patterns]

# na=False treats a missing channel name as "no match". Without it str.contains
# returns NaN for those rows, the masks become object-dtype, and np.select
# rejects them because it requires boolean condition arrays.
conditions = [
    df['channel_final_name'].str.contains(pattern, regex=True, na=False)
    for _, pattern in channel_patterns
]

# Rows that match none of the patterns (including missing names) become 'other'.
df['channel'] = np.select(conditions, cfn, default='other')

In [8]:
# Remove the raw channel name, the two id columns and the raw timestamp.
df = df.drop(['channel_final_name', 'uid', 'id', 'first_bet_date'], axis=1)

In [9]:
# Ratio of `m_turnover` to the first bet amount.
# NOTE(review): a `first_bet_sum` of 0 would produce inf/NaN here — confirm the
# data cannot contain zero first bets.
df['fbt_turnover'] = df['m_turnover'].div(df['first_bet_sum'])

In [20]:
# Keep the current row index as an explicit column so rows can be matched up again later.
df = df.assign(identifier=df.index)

In [11]:
# Bucket revenue into four ordered classes. pd.cut intervals are right-inclusive
# by default, so: 0 -> revenue <= 0, 1 -> (0, 10000], 2 -> (10000, 50000],
# 3 -> revenue > 50000.
revenue_bin_edges = [-np.inf, 0, 10000, 50000, np.inf]
revenue_class_labels = [0, 1, 2, 3]
df['revenue_class'] = pd.cut(df['revenue'], bins=revenue_bin_edges, labels=revenue_class_labels)


In [None]:
# Columns whose extreme values are clipped.
columns_to_winsorize = ['revenue']

# Winsorize each listed column: the lowest 0.7% and the highest 0.7% of values
# are replaced by the nearest value that is kept.
for col in columns_to_winsorize:
    clipped = winsorize(df[col], limits=(0.007, 0.007))
    df[col] = clipped

In [13]:
# Show the largest and smallest revenue values currently in the frame.
current_revenue = df['revenue']
print(current_revenue.max(), current_revenue.min())

947769.59 -164442.6


In [14]:
# Preview the first five rows of the engineered frame.
df.head(5)

Unnamed: 0,reg_platform,used_promo,revenue,first_bet_sum,m_win_cnt_bet,m_lose_cnt_bet,m_turnover,m_revenue,m_sum_dep,m_sum_out,m_cnt_dep,max_wins,first_day,channel,fbt_turnover,identifier,revenue_class
0,MobileSite,0,131777.27,2000.0,43.0,140.0,217124.41,131777.27,161959.0,30181.0,69.0,1,5,direct,108.562205,0,3
1,MobileSite,1,22545.0,3000.0,15.0,53.0,86176.5,22545.0,61305.0,38760.0,22.0,1,3,CPA,28.7255,1,2
2,IOS,1,2000.0,2000.0,0.0,1.0,2000.0,2000.0,2000.0,0.0,1.0,0,2,CPM,1.0,2,1
3,MobileSite,0,4999.79,500.0,3.0,7.0,8500.0,4985.0,5000.0,0.0,1.0,1,5,direct,17.0,3,1
4,IOS,1,214100.0,100.0,0.0,1.0,100.0,100.0,100.0,0.0,1.0,0,4,Отсутствует ICM,1.0,4,3


In [21]:
# Target: the revenue class label. Features: every other column except the raw
# revenue the label was derived from.
y = df['revenue_class']
X = df.drop(['revenue', 'revenue_class'], axis='columns')

In [22]:
# Set the row identifiers aside (they are used later to join predictions back
# onto the full frame) and remove them from the feature matrix in the same step.
X_identifiers = X.pop('identifier')

In [16]:
# Load the pickled preprocessor.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
# NOTE(review): `preprocessor` is not referenced again in the visible cells —
# confirm whether this load is still needed.
with open('preprocessor_class.pkl', 'rb') as preprocessor_file:
    preprocessor = pickle.load(preprocessor_file)

In [17]:
# Features to score.
X_new = X

# Load the pickled classifier pipeline.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('best_classifier_pipeline.pkl', 'rb') as model_file:
    best_model = pickle.load(model_file)

# Predict a class label for every row of the new data. The raw feature frame is
# passed directly; presumably the pipeline applies its own preprocessing.
y_pred_class = best_model.predict(X_new)

In [18]:
# Share of rows whose predicted class equals the actual revenue class.
accuracy = accuracy_score(y_true=y, y_pred=y_pred_class)
print("Classification Accuracy:", accuracy)

Classification Accuracy: 0.8832866479925303


In [23]:
# Build a per-row table of predicted and actual class labels, keyed by the
# identifier set aside earlier. Despite the column names, both columns hold
# revenue CLASS labels, not revenue amounts.
prediction_columns = {
    'identifier': X_identifiers,
    'Predicted_Revenue': y_pred_class,
    'Actual_Revenue': y.values,
}
predictions_df = pd.DataFrame(prediction_columns)

# Left join so every row of the engineered frame is kept, with its prediction attached.
df_with_predictions = pd.merge(df, predictions_df, how='left', on='identifier')

In [91]:
# Write the frame with predictions to an Excel workbook, without the index column.
df_with_predictions.to_excel(excel_writer='classes_df.xlsx', index=False)

In [None]:
# Export the rows the classifier assigned to revenue class 0.
# The filter value must agree with the variable name and the output file name,
# both of which refer to class 0.
class_0_df = df_with_predictions[df_with_predictions['Predicted_Revenue'] == 0]

# The prediction column is constant within this subset, so it is dropped before saving.
class_0_df = class_0_df.drop(columns=['Predicted_Revenue'])
class_0_df.to_csv('predicted_class_0_df.csv', index=False)