In [1]:
from warnings import filterwarnings
from datetime import datetime as dt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
import pickle
import pandas as pd
import numpy as np
import gc
from datetime import datetime as dt
filterwarnings('ignore')

In [2]:
# Column schema for the CSV export: every column is read as a pandas
# 'string' except the two integer columns named in _int_columns.
_int_columns = ('visit_number', 'target_event')
_csv_columns = (
    'session_id', 'client_id', 'visit_date', 'visit_time', 'visit_number',
    'utm_source', 'utm_medium', 'utm_campaign', 'utm_adcontent', 'utm_keyword',
    'device_category', 'device_os', 'device_brand', 'device_model',
    'device_screen_resolution', 'device_browser', 'geo_country', 'geo_city',
    'target_event',
)
datatypes = {
    name: ('int64' if name in _int_columns else 'string')
    for name in _csv_columns
}

# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is pointed at a configurable data directory.
df = pd.read_csv(
    "D:\\Libraries\\Projects\\Skillbox.FinalProject\\data\\stage#1.csv",
    index_col=False,
    dtype=datatypes,
)

# Keep only the columns whose name does not contain 'Unnamed'
# (presumably a saved index column that came back without a header).
is_unnamed = df.columns.str.contains('Unnamed')
df = df.loc[:, ~is_unnamed]

In [3]:
def view_occupancy(df):
    """Print how completely each column of ``df`` is filled.

    One line is printed per column in the form
    ``<column>: <percent filled>%, <number of missing values>``,
    ordered from the most filled column to the least filled one. Columns
    with the same fill rate keep their original relative order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
    """
    total_rows = len(df)

    # Missing-value count per column, computed once and reused for printing.
    missing = {name: df[name].isna().sum() for name in df.columns}
    filled_share = {
        name: (total_rows - n_missing) / total_rows
        for name, n_missing in missing.items()
    }

    # sorted() is stable, so ties stay in column order even with reverse=True.
    for name in sorted(filled_share, key=filled_share.get, reverse=True):
        print('{}: {:.3f}%, {}'.format(name, filled_share[name]*100, missing[name]))

# Report the fill rate of every column of the loaded frame, most complete first.
view_occupancy(df)

session_id: 100.000%, 0
client_id: 100.000%, 0
visit_date: 100.000%, 0
visit_time: 100.000%, 0
visit_number: 100.000%, 0
utm_medium: 100.000%, 0
device_category: 100.000%, 0
device_screen_resolution: 100.000%, 0
device_browser: 100.000%, 0
geo_country: 100.000%, 0
geo_city: 100.000%, 0
target_event: 100.000%, 0
utm_source: 99.995%, 81
utm_campaign: 88.643%, 202378
utm_adcontent: 82.450%, 312744
device_brand: 79.887%, 358403
device_os: 41.191%, 1047964
utm_keyword: 40.802%, 1054898
device_model: 0.864%, 1766572


In [4]:
# Drop the two identifier columns together with utm_keyword and device_model.
df = df.drop(columns=['session_id', 'client_id', 'utm_keyword', 'device_model'])

# Every remaining missing value becomes the sentinel string 'unknown'.
df = df.fillna('unknown')

In [5]:
best_brands = ['Samsung', 'Xiaomi', 'Huawei', 'POCO']

# Device categories for which an Apple device is assumed to run iOS.
dev_types = ['mobile', 'tablet']

# Brands whose devices are assumed to run Android when the OS is not recorded.
android_brands = ['Xiaomi', 'Samsung', 'Huawei', 'Vivo', 'LG',
       'BQ', 'OnePlus', 'itel', 'Asus', 'Sony', 'Android', 'OPPO',
       'Nokia', 'Realme', 'Alcatel', 'Infinix', 'Micromax', 'Blackview',
       'Oukitel', 'ZTE', 'Motorola', 'HOMTOM', 'Google', 'Lenovo',
       'China Phone', 'Neffos', 'Tecno', 'DOOGEE', 'Vsmart', 'Meizu', 'Umidigi', 'Leagoo', 'Highscreen', 'Ulefone', 'HTC',
       'Cubot', 'Symphony', 'Vestel', 'Guophone', 'Wileyfox', 'Vertex',
       'POCO', 'TCL', 'SonyEricsson', 'Nomu', 'Prestigio',
       'Philips', 'LeEco', 'DEXP', 'Hisense', 'Vernee', 'Tele2', 'CAT',
       'H96', 'Kyocera', 'Inoi', 'Acer', 'Kingplay', 'Sharp',
       'MTC', 'iNew', 'Karbonn', 'Wiko', 'Gionee', 'Digma',
       'Yota Devices', 'Kata', 'Poptel', 'Flylion', 'AGM', 'Teclast',
       'Fly', 'MLS', 'LeTV', 'Nuu', 'Celkon', 'Chuwi', 'Walton',
       'Haier', 'Coolpad', 'Xiaolajiao', 'Lava', 'General Mobile',
       'Black Fox', 'Condor', 'Razer', 'Microsoft', 'Fujitsu',
       'Alldocube', 'ZOJI', 'Amazon', 'PPTV', 'Jiake', 'Mito', 'Oysters',
       'NOA', 'Caterpillar', 'SenseIT', 'Verizon', 'Komu', 'BLU', 'RCA',
       'Cube', 'AT&T', 'Tonbux', 'Dark', 'Elephone', 'T-Mobile',
       'Smartfren', 'Billion', 'Gome', 'A1', 'MTS', 'Geotel', 'BLUBOO',
       'Ananda', 'MediaTek', 'KingSing', 'Ark', 'Essential',
       'Dragon Touch', 'E&L', 'Fero',  'Wigor', 'Tanix',
       'Mlais', 'InFocus', 'Smartisan', 'Beelink', 'Unihertz', 'Orbic',
       'Honeywell', 'Archos', 'RED', 'Sonim', 'Jiayu', 'Evertek',
       'Leegoog', 'QMobile', 'INUI', 'Artel', 'Irbis', 'China Mobile',
       'Flexymove', 'Olla', 'Krip', 'Star', 'UMI', 'Iris', 'Ellipsis',
       'LTC', 'Motive', 'Advan', 'M-HORSE', 'Winnovo', 'ThL', 'Nomi',
       'Wings Mobile', 'MXQ', 'How', 'Centric', 'Maze', 'Corn',
       'China TVBox', 'Gretel', 'Vertu', 'Evercoss', 'teXet', 'Qbex',
       'Panasonic', 'Tagital', 'Dell', 'Maxvi']

# Hashed snapshots of the two lists above: foo_os runs once per row, so a
# constant-time membership test replaces a scan of the whole brand list.
# They are built once here; later edits to the lists are not picked up.
_ANDROID_BRANDS = frozenset(android_brands)
_HANDHELD_TYPES = frozenset(dev_types)

def foo_os(x):
    """Infer a missing operating system from the device brand and category.

    Parameters
    ----------
    x : mapping-like row (e.g. a ``pandas.Series``)
        Must provide ``'device_os'``, ``'device_brand'`` and
        ``'device_category'`` via item access.

    Returns
    -------
    object
        * the recorded ``device_os`` when it is neither missing nor the
          sentinel ``'unknown'``;
        * ``'Android'`` when the brand is listed in ``android_brands``;
        * ``'iOS'`` when the brand is ``'Apple'`` and the category is one of
          ``dev_types``;
        * otherwise the original ``device_os`` value unchanged
          (``'unknown'`` or the missing marker).
    """
    current_os = x['device_os']

    # Missingness is tested with pd.isna before any ``==``: comparing pd.NA
    # with a string yields pd.NA, which raises when used as an ``if`` condition.
    if not (pd.isna(current_os) or current_os == 'unknown'):
        return current_os

    brand = x['device_brand']
    if pd.isna(brand):
        # Nothing to infer from; leave the OS as it was recorded.
        return current_os
    if brand in _ANDROID_BRANDS:
        return 'Android'
    if brand == 'Apple' and x['device_category'] in _HANDHELD_TYPES:
        return 'iOS'
    return current_os
    
# Row-wise pass: foo_os needs several columns of each row, hence axis=1.
df['device_os'] = df.apply(foo_os, axis=1)

In [None]:
from sklearn.preprocessing import StandardScaler

def screen_resolution2diag(x):
    """Convert a ``'<width>x<height>'`` resolution into a single integer.

    Despite the ``diag`` in the name, the value returned is the product
    ``width * height`` (the pixel count), not a diagonal length.

    Parameters
    ----------
    x : object
        Resolution; it is converted with ``str()`` and split on ``'x'``.

    Returns
    -------
    int
        Product of the first two ``'x'``-separated fields.

    Raises
    ------
    ValueError
        If a field is not an integer literal.
    IndexError
        If there is no second field.
    """
    parts = str(x).split('x')
    width, height = int(parts[0]), int(parts[1])
    return width * height

# '1600x20000' is treated as a typo and rewritten to '1600x2000' before parsing.
resolution_text = df['device_screen_resolution'].replace('1600x20000', '1600x2000')

# Parse every 'WxH' string into its width*height product and store it as int.
df['device_screen_resolution'] = resolution_text.apply(screen_resolution2diag).astype(int)

In [27]:
# Standardise the resolution feature; the fitted scaler stays available
# as `scaler` and the transformed array as `scaled`.
scaler = StandardScaler()
scaled = scaler.fit_transform(df[['device_screen_resolution']])
df['device_screen_resolution'] = scaled
df.head()

Unnamed: 0,visit_date,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,device_category,device_os,device_brand,device_screen_resolution,device_browser,geo_country,geo_city,target_event,dev_screen_diag
0,2021-11-24,14:36:32,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,mobile,Android,Huawei,-0.504989,Chrome,Russia,Zlatoust,0,259200
1,2021-11-14,08:21:30,1,MvfHsxITijuriZxsqZqt,cpm,FTjNLDyTrXaWYgZymFkV,xhoenQgDQsgfEPYNPwKO,mobile,Android,Samsung,-0.404546,Samsung Internet,Russia,Moscow,0,328790
2,2021-12-28,02:42:06,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,mobile,Android,Huawei,-0.504989,Chrome,Russia,Krasnoyarsk,0,259200
3,2021-05-29,05:00:00,1,kjsLglQLzykiRbcDiGcD,cpc,unknown,NOBKLgtuvqYWkXQHeYWM,mobile,Android,Xiaomi,-0.433257,Chrome,Russia,Moscow,0,308898
4,2021-05-29,05:00:00,2,kjsLglQLzykiRbcDiGcD,cpc,unknown,unknown,mobile,Android,Xiaomi,-0.433257,Chrome,Russia,Moscow,0,308898


In [None]:
def change_feature(x, frame=None):
    """Target-encode column ``x`` in place.

    Every value of the column is replaced by the mean of ``target_event``
    over all rows that share that value, and the column is stored as float64.

    Parameters
    ----------
    x : str
        Name of the column to encode.
    frame : pandas.DataFrame, optional
        Frame to modify. When omitted, the notebook-level ``df`` is used,
        which is what existing ``change_feature(name)`` calls rely on.

    Returns
    -------
    None
        The frame is modified in place.

    Notes
    -----
    NOTE(review): the means are taken over every row of the frame. If the
    frame is later split into train and test parts, the encoding has already
    seen the test targets -- confirm this is acceptable.
    """
    data = df if frame is None else frame

    # transform('mean') broadcasts each group's mean back onto its own rows in
    # one pass, instead of scanning the per-category table once for every row.
    encoded = data.groupby(x)['target_event'].transform('mean')
    data[x] = encoded.astype(np.float64)

In [None]:
%%time
# change_feature('utm_source')
# change_feature('utm_medium')
# change_feature('utm_campaign')
# change_feature('utm_adcontent')
# change_feature('utm_keyword')
# change_feature('device_category')
# change_feature()

cat_features = df.select_dtypes(include=['string', 'object', 'bool']).columns.tolist()
for feature in cat_features:
    change_feature(feature)

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns
plt.style.use('dark_background')

# One square figure for the pairwise correlation matrix of all columns of df.
fig, ax = plt.subplots(figsize=(15, 15))

# The trailing semicolon keeps the Axes repr out of the cell output.
sns.heatmap(df.corr(), ax=ax);

In [None]:
# NOTE(review): `tmp` is not defined in any cell visible in this notebook;
# presumably it is a copy of the frame taken before utm_source was encoded.
# Define it explicitly, otherwise this cell fails on a fresh kernel.
mean_target_by_source = tmp.groupby(['utm_source'])['target_event'].mean()
df_plt = mean_target_by_source.reset_index()

# One point per utm_source value: x is the source, y its mean target_event.
plt.figure()
plt.scatter(df_plt['utm_source'], df_plt['target_event'])
plt.xlabel('Использование соцсетей')
plt.ylabel('Целевое действие')
plt.title("Зависимость целевого действия от использования соцсетей")

plt.show();

In [None]:
# Show the list of columns that were selected for target encoding.
cat_features