In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import make_column_selector
from sklearn.utils import resample
from sklearn.metrics import roc_auc_score
from datetime import datetime
import dill

In [2]:
df = pd.read_csv('ga_hits.csv')
df1 = pd.read_csv('ga_sessions.csv', low_memory=False)

In [3]:
df = df[['session_id', 'event_action']]
df = df.drop_duplicates(subset='session_id', keep='first')
df1 = df1.merge(df, how='left', on='session_id')
df1 = df1.drop(columns=['device_os', 'utm_keyword', 'device_model'], axis=1)

In [4]:
def target_change(y):
  target = ['sub_car_claim_click', 
            'sub_car_claim_submit_click', 
            'sub_open_dialog_click', 
            'sub_open_dialog_click', 
            'sub_custom_question_submit_click',
            'sub_submit_success',
            'sub_car_request_submit_click'
  ]
  if y in target:
    return 1
  return 0
    
  


def organic_add(y):
  organic = ['organic', 'referral', '(none)'] 
  if y in organic:
    return 1
  return 0  


def paid_add(y):
  paid = ['organic', 'referral', '(none)'] 
  if y in paid:
    return 0
  return 1

In [5]:
df1['event_action'] = df1['event_action'].apply(target_change)
df1['organic_traffic'] = df1['utm_medium'].apply(organic_add)
df1['paid_traffic'] = df1['utm_medium'].apply(paid_add)

In [6]:
def downsample(df1):
  df_min = df1[df1['event_action'] == 1]
  df_maj = df1[df1['event_action'] == 0]
  df_maj_downsample = resample(df_maj, replace=False, n_samples=len(df_min), random_state=42)
  df1 = pd.concat([df_maj_downsample, df_min], ignore_index=True).sample(frac=1.)
  
  return df1
df1 = downsample(df1)

In [7]:
def time_add(df1):
  df1["visit_date"] = pd.to_datetime(df1["visit_date"])
  df1['date_month'] = df1['visit_date'].apply(lambda x: x.month)
  df1['date_day'] = df1['visit_date'].apply(lambda x: x.day)
  return df1

def split(df1):
  split_df1 = df1['device_screen_resolution'].str.split('x',expand=True)
  split_df1.columns=['screen_height','screen_width']
  split_df1 = split_df1.dropna()
  split_df1 = split_df1.astype(int)
  df1 = df1.join(split_df1)
  return df1



def filter_data(df1):
  df1 = df1.drop(columns=['session_id', 'client_id', 'visit_date', 'visit_time', 'device_screen_resolution'], axis=1)
  return df1 


numeric_features = ['visit_number', 'date_month', 'date_day', 'screen_height','screen_width']


numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])


categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='other')),
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))
])


preprocessor = Pipeline(steps=[
    ('time', FunctionTransformer(time_add)),
    ('split', FunctionTransformer(split)),
    ('filter', FunctionTransformer(filter_data))
])


preprocessor2 = ColumnTransformer(transformers=[
    ('numerical', numerical_transformer, numeric_features),
    ('categorical', categorical_transformer, make_column_selector(dtype_include=object))
])


model = (
    LogisticRegression(max_iter=1000)
)


X = df1.drop(['event_action'], axis=1)
y = df1['event_action']


pipe = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('preprocessor2', preprocessor2),
        ('classifier', model)
    ])


pipe.fit(X, y)
roc_auc = roc_auc_score(y, pipe.predict_proba(X)[:, 1])


with open('event_pipe.pkl', 'wb') as file:
    dill.dump({
        'model': pipe,
        'metadata': {
            'name': 'event action prediction model',
            'author': 'Kirill',
            'version': 1,
            'date': datetime.now(),
            'roc_auc': roc_auc
        }
    }, file)


print(roc_auc)

0.7185581766186973
