# Финальный ноутбук проекта

### Описание данных

GA Sessions:
###### Одна строка = один визит на сайт.
- session_id — ID визита;
- client_id — ID посетителя;
- visit_date — дата визита;
- visit_time — время визита;
- visit_number — порядковый номер визита клиента;
- utm_source — канал привлечения;
- utm_medium — тип привлечения;
- utm_campaign — рекламная кампания;
- utm_keyword — ключевое слово;
- device_category — тип устройства;
- device_os — ОС устройства;
- device_brand — марка устройства;
- device_model — модель устройства;
- device_screen_resolution — разрешение экрана;
- device_browser — браузер;
- geo_country — страна;
- geo_city — город

GA Hits:
###### Одна строка = одно событие в рамках одного визита на сайтv
- session_id — ID визита;
- hit_date — дата события;
- hit_time — время события;
- hit_number — порядковый номер события в рамках сессии;
- hit_type — тип события;
- hit_referer — источник события;
- hit_page_path — страница события;
- event_category — тип действия;
- event_action — действие;
- event_label — тег действия;
- event_value — значение результата действия

### Импорт сторонних библиотек

In [2]:
import re
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.utils import resample

### Загрузка данных

In [3]:
com_df = pd.read_csv('data/df_with_target.csv')

In [4]:
com_df.head()

Unnamed: 0,session_id,client_id,visit_date,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_model,device_screen_resolution,device_browser,geo_country,geo_city,target_action
0,9055434745589932991.1637753792.1637753792,2108383000.0,2021-11-24,14:36:32,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,Huawei,,360x720,Chrome,Russia,Zlatoust,0
1,905544597018549464.1636867290.1636867290,210838500.0,2021-11-14,08:21:30,1,MvfHsxITijuriZxsqZqt,cpm,FTjNLDyTrXaWYgZymFkV,xhoenQgDQsgfEPYNPwKO,IGUCNvHlhfHpROGclCit,mobile,Android,Samsung,,385x854,Samsung Internet,Russia,Moscow,0
2,9055446045651783499.1640648526.1640648526,2108385000.0,2021-12-28,02:42:06,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,Huawei,,360x720,Chrome,Russia,Krasnoyarsk,0
3,9055447046360770272.1622255328.1622255328,2108386000.0,2021-05-29,05:00:00,1,kjsLglQLzykiRbcDiGcD,cpc,,NOBKLgtuvqYWkXQHeYWM,,mobile,,Xiaomi,,393x786,Chrome,Russia,Moscow,0
4,9055447046360770272.1622255345.1622255345,2108386000.0,2021-05-29,05:00:00,2,kjsLglQLzykiRbcDiGcD,cpc,,,,mobile,,Xiaomi,,393x786,Chrome,Russia,Moscow,0


In [5]:
com_df.shape

(1732266, 19)

### Data Preparation

##### 1. Список колонок датасета

In [6]:
com_df.columns

Index(['session_id', 'client_id', 'visit_date', 'visit_time', 'visit_number',
       'utm_source', 'utm_medium', 'utm_campaign', 'utm_adcontent',
       'utm_keyword', 'device_category', 'device_os', 'device_brand',
       'device_model', 'device_screen_resolution', 'device_browser',
       'geo_country', 'geo_city', 'target_action'],
      dtype='object')

##### 2. Описательные статистики датасета

In [6]:
com_df.describe(include=[object])

Unnamed: 0,session_id,visit_date,visit_time,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_model,device_screen_resolution,device_browser,geo_country,geo_city
count,1732266,1732266,1732266,1732190,1732266,1536979,1428129,711514,1732266,718302,1385070,15062,1732266,1732266,1732266,1732266
unique,1732266,226,85032,280,55,406,280,1192,3,13,200,104,4947,55,159,2389
top,9055434745589932991.1637753792.1637753792,2021-05-24,13:00:00,ZpYIoDJMcFzVoPFsHGJL,banner,LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,Android,Apple,AuMdmADEIoPXiWpTsBEj,414x896,Chrome,Russia,Moscow
freq,1,39230,58044,552555,525206,422992,935408,465950,1368679,425520,503533,9018,155140,951573,1682423,750928


In [7]:
com_df.describe(exclude=[object])

Unnamed: 0,client_id,visit_number,target_action
count,1732266.0,1732266.0,1732266.0
mean,1074577000.0,2.627835,0.02904519
std,620137000.0,11.6426,0.1679333
min,232.164,1.0,0.0
25%,537894300.0,1.0,0.0
50%,1074614000.0,1.0,0.0
75%,1612130000.0,1.0,0.0
max,2147483000.0,564.0,1.0


In [7]:
com_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1732266 entries, 0 to 1732265
Data columns (total 19 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   session_id                object 
 1   client_id                 float64
 2   visit_date                object 
 3   visit_time                object 
 4   visit_number              int64  
 5   utm_source                object 
 6   utm_medium                object 
 7   utm_campaign              object 
 8   utm_adcontent             object 
 9   utm_keyword               object 
 10  device_category           object 
 11  device_os                 object 
 12  device_brand              object 
 13  device_model              object 
 14  device_screen_resolution  object 
 15  device_browser            object 
 16  geo_country               object 
 17  geo_city                  object 
 18  target_action             int64  
dtypes: float64(1), int64(2), object(16)
memory usage: 251.1+ MB


Проверка на дубликаты

In [9]:
com_df[com_df.duplicated()]

Unnamed: 0,session_id,client_id,visit_date,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_model,device_screen_resolution,device_browser,geo_country,geo_city,target_action




Список всех признаков с пропущенными значениями:

In [8]:
missing_values = ((com_df.isna().sum() / len(com_df)) * 100).sort_values(ascending=False)
print([column for column in missing_values[missing_values != 0].keys()])

['device_model', 'utm_keyword', 'device_os', 'device_brand', 'utm_adcontent', 'utm_campaign', 'utm_source']


In [27]:
utm_list = ['utm_source', 'utm_medium', 'utm_campaign', 'utm_adcontent']  # , 'utm_keyword'
# new = com_df[utm_list].groupby(utm_list).agg({'utm_source':['count'], 'utm_medium': 'count'})
com_df['utm_campaign'].value_counts(dropna=False)

utm_campaign
LTuZkdKfxRGVceoWkVyg    422992
LEoPHuyFvzoNfnzGgfcd    321286
FTjNLDyTrXaWYgZymFkV    234950
NaN                     195287
gecBYcKZCPMcVYdSSzKP    133247
                         ...  
qPDTdivQVeflLjTYIJnG         1
InOGyxvxAfYvSHCpIjJZ         1
ehLonfPENrOEoPTIyiOZ         1
YlsczTIyBSwTLNtuDkCd         1
cXxuwXPoQCvAXPHpFcZl         1
Name: count, Length: 407, dtype: int64

In [28]:
device_list = ['device_category', 'device_os', 'device_brand']
com_df[device_list].value_counts(dropna=False)

device_category  device_os  device_brand
mobile           NaN        Apple           316778
desktop          NaN        NaN             236704
mobile           iOS        Apple           180902
                 NaN        Samsung         156249
                 Android    Samsung         150837
                                             ...  
desktop          NaN        Nokia                1
mobile           NaN        Advan                1
                            Wings Mobile         1
                 Android    Dark                 1
tablet           Android    Flexymove            1
Name: count, Length: 435, dtype: int64

Пропуски в device_model

In [11]:
print(f"Количество пропущенных значений: {com_df.device_model.isna().sum()}")
com_df[com_df['device_model'].isna()].head()

Количество пропущенных значений: 1717204


Unnamed: 0,session_id,client_id,visit_date,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_model,device_screen_resolution,device_browser,geo_country,geo_city,target_action
0,9055434745589932991.1637753792.1637753792,2108383000.0,2021-11-24,14:36:32,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,Huawei,,360x720,Chrome,Russia,Zlatoust,0
1,905544597018549464.1636867290.1636867290,210838500.0,2021-11-14,08:21:30,1,MvfHsxITijuriZxsqZqt,cpm,FTjNLDyTrXaWYgZymFkV,xhoenQgDQsgfEPYNPwKO,IGUCNvHlhfHpROGclCit,mobile,Android,Samsung,,385x854,Samsung Internet,Russia,Moscow,0
2,9055446045651783499.1640648526.1640648526,2108385000.0,2021-12-28,02:42:06,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,Huawei,,360x720,Chrome,Russia,Krasnoyarsk,0
3,9055447046360770272.1622255328.1622255328,2108386000.0,2021-05-29,05:00:00,1,kjsLglQLzykiRbcDiGcD,cpc,,NOBKLgtuvqYWkXQHeYWM,,mobile,,Xiaomi,,393x786,Chrome,Russia,Moscow,0
4,9055447046360770272.1622255345.1622255345,2108386000.0,2021-05-29,05:00:00,2,kjsLglQLzykiRbcDiGcD,cpc,,,,mobile,,Xiaomi,,393x786,Chrome,Russia,Moscow,0


In [12]:
com_df['device_model'].value_counts(dropna=False)

device_model
NaN                     1717204
AuMdmADEIoPXiWpTsBEj       9018
tWBQlsvNfHxRUjaPAfhd        596
cwMJxNXiWUgMUxGiCTPs        586
pTgAEPipQxDXCjPrJbHo        430
                         ...   
OJiWyBKOyDITzXCZRSMH          1
MBGYWAQSYWUphNxTsAWD          1
XnjPzKjkHmznVfULanbE          1
VDidzTqFGxuqiRQJGrwB          1
qmRODeCJLlmkmwxNYXvp          1
Name: count, Length: 105, dtype: int64

In [13]:
com_df[
    (com_df['device_model'].notna()) & (com_df['device_brand'].notna())
    ]

Unnamed: 0,session_id,client_id,visit_date,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_model,device_screen_resolution,device_browser,geo_country,geo_city,target_action
60,9055656589238457290.1635504877.1635504877,2.108434e+09,2021-10-29,13:54:37,1,ZpYIoDJMcFzVoPFsHGJL,banner,gecBYcKZCPMcVYdSSzKP,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,Android,Meizu,qBRdfuuhOnnqwSqNiPOv,360x744,Chrome,Russia,Moscow,0
63,905565998839028208.1635310063.1635310063,2.108435e+08,2021-10-27,07:47:43,1,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,Android,OnePlus,AshpvdJdReafUzEMmWGr,384x832,Android Webview,Russia,Saint Petersburg,0
64,905565998839028208.1636951969.1636951969,2.108435e+08,2021-11-15,07:52:49,2,ISrKoXQCxqqYvAZICvjs,smm,,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,Android,OnePlus,AshpvdJdReafUzEMmWGr,384x832,Android Webview,Russia,Saint Petersburg,0
157,9056202067269505745.1640092368.1640092368,2.108561e+09,2021-12-21,16:12:48,1,ZpYIoDJMcFzVoPFsHGJL,push,sbJRYgVfvcnqKJNDDYIr,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,Android,(not set),AuMdmADEIoPXiWpTsBEj,384x854,Chrome,Russia,Moscow,0
199,9056422519349747445.1638964982.1638964982,2.108613e+09,2021-12-08,15:03:02,1,MvfHsxITijuriZxsqZqt,cpm,FTjNLDyTrXaWYgZymFkV,xhoenQgDQsgfEPYNPwKO,RrhnkuoaqckNtJpAZDzH,mobile,Android,(not set),AuMdmADEIoPXiWpTsBEj,320x640,Chrome,Russia,Saint Petersburg,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1731612,9052658994125991465.1638271992.1638271992,2.107736e+09,2021-11-30,14:33:12,3,MvfHsxITijuriZxsqZqt,cpm,FTjNLDyTrXaWYgZymFkV,xhoenQgDQsgfEPYNPwKO,DBHgBJHOdbPwsRDUbEgX,mobile,Android,Vivo,cwMJxNXiWUgMUxGiCTPs,360x760,Chrome,Russia,Moscow,0
1731617,9052676375860711226.1639943995.1639943995,2.107740e+09,2021-12-19,22:59:55,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,(not set),AuMdmADEIoPXiWpTsBEj,393x873,Chrome,Russia,Izhevsk,0
1731659,9052904464392788371.1638799763.1638799763,2.107794e+09,2021-12-06,17:09:23,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,Nokia,aCLaTVQlHcXbUnQYQvIg,412x892,Chrome,Russia,Novosibirsk,0
1731854,9053730012943175107.1640761072.1640761072,2.107986e+09,2021-12-29,09:57:52,2,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,(not set),AuMdmADEIoPXiWpTsBEj,360x760,Chrome,Russia,Vladivostok,0


В device_model слишком много пропущенных значений, решил просто удалить

In [14]:
# com_df.drop(columns=['device_model'], axis=1, inplace=True)
# com_df.head()

In [15]:
com_df['target_action'].value_counts()

target_action
0    1681952
1      50314
Name: count, dtype: int64

In [16]:
com_df['device_os'].value_counts(dropna=False)

device_os
NaN              1013964
Android           425520
iOS               182597
Windows            81740
Macintosh          23415
Linux               4616
(not set)            309
Chrome OS             65
BlackBerry            24
Tizen                  7
Firefox OS             3
Nokia                  2
Samsung                2
Windows Phone          2
Name: count, dtype: int64

In [17]:
com_df['device_brand'].value_counts(dropna=False)

device_brand
Apple       503533
NaN         347196
Samsung     311641
Xiaomi      269251
Huawei      173828
             ...  
Tesla            1
Vodafone         1
Wexler           1
Smarteo          1
Maxvi            1
Name: count, Length: 201, dtype: int64

In [18]:
com_df[com_df['device_brand'].isna()]

Unnamed: 0,session_id,client_id,visit_date,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_model,device_screen_resolution,device_browser,geo_country,geo_city,target_action
28,9055505230298952295.1638478433.1638478433,2.108399e+09,2021-12-02,23:53:53,1,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,desktop,Windows,,,1536x864,Chrome,Russia,Balashikha,0
61,9055657327967035032.1629707931.1629707931,2.108435e+09,2021-08-23,11:00:00,1,nSReTmyFtbSjlPrTKoaX,banner,BVKxkCOHKUOvkpbrLMgZ,JNHcPlZPxEMWDnRiyoBf,,desktop,,,,1920x1080,YaBrowser,Russia,Moscow,0
62,905565977351442956.1622561294.1622561294,2.108435e+08,2021-06-01,18:00:00,1,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,,desktop,,,,1920x1080,Chrome,Russia,Moscow,0
65,9055678214400253418.1636965866.1636965866,2.108439e+09,2021-11-15,11:44:26,1,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,desktop,Windows,,,1920x1080,Chrome,Russia,Saint Petersburg,0
71,9055788191321875859.1626006934.1626006934,2.108465e+09,2021-07-11,15:00:00,1,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,,desktop,,,,1280x720,Chrome,Russia,Moscow,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1732250,9055354507009984602.1636829278.1636829278,2.108364e+09,2021-11-13,21:47:58,1,vFcAhRxLfOWKhvxjELkx,organic,okTXSMadDkjvntEHzIjp,LLfCasrxQzJIyuldcuWy,aXQzDWsJuGXeBXexNHjc,desktop,Windows,,,1366x768,Firefox,Russia,Balashikha,0
1732252,9055363711117247375.1629176721.1629176721,2.108366e+09,2021-08-17,08:00:00,1,PlbkrSYoHuZBWfYjYnfw,cpm,FTjNLDyTrXaWYgZymFkV,TuyPWsGQruPMpKvRxeBF,,desktop,,,,1920x1080,Chrome,Russia,(not set),0
1732254,9055376699099939975.1630766214.1630766214,2.108369e+09,2021-09-04,17:00:00,1,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,,desktop,,,,1920x1080,Chrome,Russia,Khimki,1
1732261,9055415581448263752.1640159305.1640159305,2.108378e+09,2021-12-22,10:48:25,1,BHcvLfOaCWvWTykYqHVe,cpc,,,VlqBmecIOXWjCWUmQkLd,desktop,Windows,,,1920x1080,Chrome,Russia,Moscow,0


In [19]:
devices_xiaomi_notnan = com_df[(com_df['device_brand'] == 'Xiaomi') & (com_df['device_category'] == 'mobile') & (com_df['device_os'].notna())]
devices_xiaomi_notnan

Unnamed: 0,session_id,client_id,visit_date,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_model,device_screen_resolution,device_browser,geo_country,geo_city,target_action
31,9055507467976770564.1638335492.1638335492,2.108400e+09,2021-12-01,08:11:32,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,Android,Xiaomi,,393x851,Chrome,Russia,Saint Petersburg,0
41,9055541209241593296.1640080849.1640080849,2.108407e+09,2021-12-21,13:00:49,1,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,Android,Xiaomi,,393x851,Android Webview,Russia,Novorossiysk,0
58,905565212864092591.1639650867.1639650867,2.108433e+08,2021-12-16,13:34:27,2,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,Xiaomi,,393x851,Chrome,Russia,Tula,0
59,905565212864092591.1639652819.1639652819,2.108433e+08,2021-12-16,14:06:59,3,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,Xiaomi,,393x851,Chrome,Russia,Tula,0
68,9055768546153401681.1637944657.1637944657,2.108460e+09,2021-11-26,19:37:37,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,Xiaomi,,393x851,Chrome,Russia,Yekaterinburg,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1732232,9055282415982183461.1635091492.1635091492,2.108347e+09,2021-10-24,19:04:52,1,kjsLglQLzykiRbcDiGcD,cpc,bJJuEXRheRIxXEaYIXqM,,KCcEkEaKEtUilBVMoCAi,mobile,Android,Xiaomi,,375x833,Chrome,Russia,Saint Petersburg,0
1732237,9055315461464429378.1638961956.1638961956,2.108355e+09,2021-12-08,14:12:36,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,Xiaomi,,360x800,Chrome,Russia,Vladivostok,0
1732248,9055349000865826584.1640744740.1640744740,2.108363e+09,2021-12-29,05:25:40,1,TxKUcPpthBDPieTGmVhx,cpc,FTjNLDyTrXaWYgZymFkV,LcGIUNPUAmXtQJaDfFBR,NnplfljjtYPiMnRvogpA,mobile,Android,Xiaomi,,464x1123,Chrome,Russia,Moscow,0
1732251,9055355469082180480.1636350848.1636350848,2.108364e+09,2021-11-08,08:54:08,1,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,mobile,Android,Xiaomi,,393x873,Chrome,Russia,Moscow,0


In [20]:
com_df.loc[(com_df['device_brand'] == 'Xiaomi') & (com_df['device_os'].isna()  & (com_df['device_category'] == 'mobile')), 'device_os'] = 'Android'

In [21]:
com_df.loc[(com_df['device_brand'] == 'Xiaomi') & (com_df['device_os'].isna()  & (com_df['device_category'] == 'tablet')), 'device_os'] = 'Android'

In [22]:
xiaomi_na = com_df[(com_df['device_brand'] == 'Xiaomi') & (com_df['device_os'].isna())]
xiaomi_na

Unnamed: 0,session_id,client_id,visit_date,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_model,device_screen_resolution,device_browser,geo_country,geo_city,target_action
121635,1338874692298414890.1630870317.1630870317,311731100.0,2021-09-05,22:00:00,1,kjsLglQLzykiRbcDiGcD,cpc,XHNUiSKKGTEpSAlaVMFQ,,,desktop,,Xiaomi,,360x760,Chrome,Russia,Novorossiysk,0
296343,2179795865415458772.1626611671.1626611671,507523300.0,2021-07-18,15:00:00,1,kjsLglQLzykiRbcDiGcD,cpc,XHNUiSKKGTEpSAlaVMFQ,,,desktop,,Xiaomi,,360x720,Opera,Russia,Saratov,0
893512,5039119271279276996.1634546629.1634546629,1173261000.0,2021-10-18,11:00:00,1,kjsLglQLzykiRbcDiGcD,cpc,XHNUiSKKGTEpSAlaVMFQ,JNHcPlZPxEMWDnRiyoBf,,desktop,,Xiaomi,,393x873,Chrome,Russia,Moscow,0
893513,5039119271279276996.1634891966.1634891966,1173261000.0,2021-10-22,11:00:00,2,kjsLglQLzykiRbcDiGcD,cpc,XHNUiSKKGTEpSAlaVMFQ,JNHcPlZPxEMWDnRiyoBf,,desktop,,Xiaomi,,393x873,Chrome,Russia,Moscow,0


In [23]:
devices_huawei_notnan = com_df[(com_df['device_brand'] == 'Huawei') & (com_df['device_os'].isna())]
devices_huawei_notnan

Unnamed: 0,session_id,client_id,visit_date,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_model,device_screen_resolution,device_browser,geo_country,geo_city,target_action
46,9055578871792078746.1622347676.1622347676,2.108416e+09,2021-05-30,07:00:00,1,kjsLglQLzykiRbcDiGcD,cpc,,XKsYZiUFcdkUXQpoLKyS,,mobile,,Huawei,,360x780,Opera,Russia,Saint Petersburg,0
69,9055768928391199041.1623652678.1623652678,2.108461e+09,2021-06-14,09:00:00,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,JNHcPlZPxEMWDnRiyoBf,,mobile,,Huawei,,360x780,Chrome,Russia,Moscow,1
117,9055964456785228161.1631531397.1631531596,2.108506e+09,2021-09-13,14:00:00,1,bByPQxmDaMXgpHeypKSM,referral,LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,,mobile,,Huawei,,360x780,Chrome,Russia,Saint Petersburg,0
125,90560185929716668.1629639613.1629639679,2.108519e+07,2021-08-22,16:00:00,1,bByPQxmDaMXgpHeypKSM,referral,LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,,mobile,,Huawei,,360x800,Opera,Russia,Moscow,0
129,9056044781265769926.1633703370.1633703370,2.108525e+09,2021-10-08,17:00:00,1,ZpYIoDJMcFzVoPFsHGJL,banner,SgIUDYUKnyWHVowUOqid,JNHcPlZPxEMWDnRiyoBf,,mobile,,Huawei,,360x770,YaBrowser,Russia,Moscow,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1732226,9055248417020050011.1634075504.1634075504,2.108339e+09,2021-10-13,00:00:00,2,ZpYIoDJMcFzVoPFsHGJL,banner,gecBYcKZCPMcVYdSSzKP,JNHcPlZPxEMWDnRiyoBf,,mobile,,Huawei,,360x780,Chrome,Russia,Nizhny Novgorod,0
1732227,9055248417020050011.1634075541.1634075541,2.108339e+09,2021-10-13,00:00:00,3,ZpYIoDJMcFzVoPFsHGJL,banner,SgIUDYUKnyWHVowUOqid,JNHcPlZPxEMWDnRiyoBf,,mobile,,Huawei,,360x780,Chrome,Russia,Nizhny Novgorod,1
1732228,9055248417020050011.1634078104.1634078104,2.108339e+09,2021-10-13,01:00:00,4,ZpYIoDJMcFzVoPFsHGJL,banner,SgIUDYUKnyWHVowUOqid,JNHcPlZPxEMWDnRiyoBf,,mobile,,Huawei,,360x780,Chrome,Russia,Nizhny Novgorod,0
1732229,9055248417020050011.1634078414.1634078414,2.108339e+09,2021-10-13,01:00:00,5,ZpYIoDJMcFzVoPFsHGJL,banner,gecBYcKZCPMcVYdSSzKP,JNHcPlZPxEMWDnRiyoBf,,mobile,,Huawei,,360x780,Chrome,Russia,Nizhny Novgorod,0


In [24]:
com_df.loc[(com_df['device_brand'] == 'Huawei') & (com_df['device_category'] == 'desktop'), 'device_category'] = 'mobile'

In [25]:
com_df.loc[(com_df['device_brand'] == 'Huawei') & (com_df['device_os'].isna()) & (com_df['device_category'] == 'mobile'), 'device_os'] = 'Android'

In [26]:
com_df.loc[(com_df['device_brand'] == 'Huawei') & (com_df['device_os'].isna()) & (com_df['device_category'] == 'tablet'), 'device_os'] = 'Android'

In [27]:
samsung_na = com_df[(com_df['device_brand'] == 'Samsung') & (com_df['device_os'].isna())]
samsung_na

Unnamed: 0,session_id,client_id,visit_date,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_model,device_screen_resolution,device_browser,geo_country,geo_city,target_action
7,9055461992850812764.1626107740.1626107740,2.108389e+09,2021-07-12,19:00:00,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,JNHcPlZPxEMWDnRiyoBf,,mobile,,Samsung,,360x640,Chrome,Russia,Saint Petersburg,0
9,9055466554104774132.1624800757.1624800757,2.108390e+09,2021-06-27,16:00:00,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,JNHcPlZPxEMWDnRiyoBf,,mobile,,Samsung,,412x915,Chrome,Russia,Moscow,0
10,9055466554104774132.1629442326.1629442326,2.108390e+09,2021-08-20,09:00:00,2,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,JNHcPlZPxEMWDnRiyoBf,,mobile,,Samsung,,412x915,Chrome,Russia,Moscow,0
11,9055469620715506713.1628883994.1628883994,2.108391e+09,2021-08-13,22:00:00,1,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,,mobile,,Samsung,,412x869,Android Webview,Russia,Saint Petersburg,0
12,9055469620715506713.1633110583.1633110583,2.108391e+09,2021-10-01,20:00:00,2,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,,mobile,,Samsung,,412x869,Android Webview,Russia,Saint Petersburg,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1732157,905495157639567035.1626429117.1626429117,2.108270e+08,2021-07-16,12:00:00,1,BHcvLfOaCWvWTykYqHVe,cpc,,,,mobile,,Samsung,,412x915,Chrome,Russia,Saint Petersburg,0
1732158,905495157639567035.1626429718.1626429718,2.108270e+08,2021-07-16,13:00:00,2,BHcvLfOaCWvWTykYqHVe,cpc,,,,mobile,,Samsung,,412x915,Chrome,Russia,Saint Petersburg,0
1732203,9055165528442559324.1630427998.1630427998,2.108320e+09,2021-08-31,19:00:00,1,MvfHsxITijuriZxsqZqt,cpm,FTjNLDyTrXaWYgZymFkV,PkybGvWbaqORmxjNunqZ,,mobile,,Samsung,,412x846,Chrome,Russia,Saint Petersburg,0
1732257,9055394269810294140.1629912447.1629912447,2.108373e+09,2021-08-25,20:00:00,1,bByPQxmDaMXgpHeypKSM,referral,LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,,mobile,,Samsung,,360x800,Android Webview,Russia,Saint Petersburg,1


In [28]:
com_df.loc[(com_df['device_brand'] == 'Samsung') & (com_df['device_category'] == 'desktop'), 'device_category'] = 'mobile'

In [29]:
com_df.loc[(com_df['device_brand'] == 'Samsung') & (com_df['device_os'].isna()) & (com_df['device_category'] == 'mobile'), 'device_os'] = 'Android'

In [30]:
com_df.loc[(com_df['device_brand'] == 'Samsung') & (com_df['device_os'].isna()) & (com_df['device_category'] == 'tablet'), 'device_os'] = 'Android'

In [31]:
com_df[(com_df['device_brand'].notna()) & (com_df['device_os'].isna())]

Unnamed: 0,session_id,client_id,visit_date,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_model,device_screen_resolution,device_browser,geo_country,geo_city,target_action
5,9055447192389856083.1622453074.1622453074,2.108386e+09,2021-05-31,12:00:00,1,kjsLglQLzykiRbcDiGcD,organic,LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,,mobile,,Apple,,375x812,Safari,Russia,Saint Petersburg,0
32,9055511191703531814.1628451110.1628451110,2.108400e+09,2021-08-08,22:00:00,1,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,,tablet,,Apple,,375x667,Safari (in-app),Russia,Moscow,0
33,9055511191703531814.1628451239.1628451239,2.108400e+09,2021-08-08,22:00:00,2,klTrhUaShgnjIbaPmqjc,referral,LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,,tablet,,Apple,,375x667,Safari (in-app),Russia,Moscow,0
34,90555135045918912.1627381953.1627381953,2.108401e+07,2021-07-27,13:00:00,1,ghoaGAksqhKomdFrxgyJ,cpm,FTjNLDyTrXaWYgZymFkV,NhvfEqcSTGEZKxxvUZlj,,mobile,,Apple,,375x812,Safari (in-app),Russia,Vladivostok,0
44,9055561545897976496.1626317488.1626317488,2.108412e+09,2021-07-15,05:00:00,1,ZpYIoDJMcFzVoPFsHGJL,banner,MXqmDyetMTICSSitTjWV,JNHcPlZPxEMWDnRiyoBf,,mobile,,Apple,,428x926,Safari,United States,Sunny Isles Beach,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1732241,905532652706692816.1629060816.1629060816,2.108358e+08,2021-08-15,23:00:00,1,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,,mobile,,Apple,,428x926,Safari (in-app),Russia,Moscow,0
1732242,905532652706692816.1629091708.1629091708,2.108358e+08,2021-08-16,08:00:00,2,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,,mobile,,Apple,,428x926,Safari (in-app),Russia,Moscow,0
1732249,9055349030922605117.1632752193.1632752193,2.108363e+09,2021-09-27,17:00:00,1,MvfHsxITijuriZxsqZqt,cpm,FTjNLDyTrXaWYgZymFkV,PkybGvWbaqORmxjNunqZ,,mobile,,BQ,,640x360,YaBrowser,Russia,Saint Petersburg,0
1732259,9055397194683347295.1630237022.1630237022,2.108374e+09,2021-08-29,14:00:00,1,ISrKoXQCxqqYvAZICvjs,blogger_stories,zfwIehuEfWYdYrEZgRLo,JNHcPlZPxEMWDnRiyoBf,,mobile,,Apple,,414x896,Safari,Russia,Zheleznodorozhny,0


In [32]:
com_df.loc[(com_df['device_brand'] == 'Apple') & (com_df['device_category'].isin(['mobile', 'tablet'])) & (com_df['device_os'].isna()), 'device_os'] = 'iOS'

In [33]:
com_df.loc[(com_df['device_brand'] == 'Apple') & (com_df['device_category'] == 'desktop') & (com_df['device_os'].isna()), 'device_os'] = 'Macintosh'

In [34]:
com_df['device_os'].value_counts(dropna=False)

device_os
Android          816795
iOS              503499
NaN              301748
Windows           81740
Macintosh         23454
Linux              4616
(not set)           309
Chrome OS            65
BlackBerry           24
Tizen                 7
Firefox OS            3
Nokia                 2
Samsung               2
Windows Phone         2
Name: count, dtype: int64

In [35]:
com_df['device_brand'].value_counts(dropna=False)

device_brand
Apple       503533
NaN         347196
Samsung     311641
Xiaomi      269251
Huawei      173828
             ...  
Tesla            1
Vodafone         1
Wexler           1
Smarteo          1
Maxvi            1
Name: count, Length: 201, dtype: int64

In [36]:
com_df[(com_df['device_brand'].isna()) & (com_df['device_os'] == 'Windows')]

Unnamed: 0,session_id,client_id,visit_date,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_model,device_screen_resolution,device_browser,geo_country,geo_city,target_action
28,9055505230298952295.1638478433.1638478433,2.108399e+09,2021-12-02,23:53:53,1,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,desktop,Windows,,,1536x864,Chrome,Russia,Balashikha,0
65,9055678214400253418.1636965866.1636965866,2.108439e+09,2021-11-15,11:44:26,1,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,desktop,Windows,,,1920x1080,Chrome,Russia,Saint Petersburg,0
77,9055795561498027164.1638278300.1638278300,2.108467e+09,2021-11-30,16:18:20,1,MvfHsxITijuriZxsqZqt,cpm,FTjNLDyTrXaWYgZymFkV,xhoenQgDQsgfEPYNPwKO,jpZoxCaowxXvglZVUJyq,desktop,Windows,,,1680x1050,Chrome,Russia,Moscow,0
86,9055853019573035768.1640801016.1640801016,2.108480e+09,2021-12-29,21:03:36,1,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,desktop,Windows,,,1024x820,YaBrowser,Russia,Moscow,0
96,9055873197324118026.1635526668.1635526668,2.108485e+09,2021-10-29,19:57:48,1,jaSOmLICuBzCFqHfBdRg,email,bxOTvPtyGSdUrbwoXCPO,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,desktop,Windows,,,1920x1080,YaBrowser,Russia,Moscow,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1732210,9055207816701080345.1640952603.1640952603,2.108330e+09,2021-12-31,15:10:03,1,nSReTmyFtbSjlPrTKoaX,banner,BVKxkCOHKUOvkpbrLMgZ,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,desktop,Windows,,,1280x1024,Opera,Russia,Samara,0
1732224,9055247549438461124.1635878086.1635878086,2.108339e+09,2021-11-02,21:34:46,1,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,desktop,Windows,,,1366x768,Chrome,Russia,Kazan,0
1732233,9055283330814105899.1638979884.1638979890,2.108347e+09,2021-12-08,19:11:30,1,bByPQxmDaMXgpHeypKSM,referral,QdLfySaGXolfTBSNVfHn,SOkCdPxfUcZUzzOdgGES,puhZPIYqKXeFPaUviSjo,desktop,Windows,,,1920x1080,Chrome,Russia,Krasnodar,0
1732250,9055354507009984602.1636829278.1636829278,2.108364e+09,2021-11-13,21:47:58,1,vFcAhRxLfOWKhvxjELkx,organic,okTXSMadDkjvntEHzIjp,LLfCasrxQzJIyuldcuWy,aXQzDWsJuGXeBXexNHjc,desktop,Windows,,,1366x768,Firefox,Russia,Balashikha,0


In [37]:
com_df.loc[(com_df['device_brand'].isna()) & (com_df['device_os'] == 'Windows'), 'device_brand'] = 'other_brand'

In [38]:
com_df[(com_df['device_brand'].isna()) & (com_df['device_os'].isna()) & (com_df['device_category'] == 'desktop')]

Unnamed: 0,session_id,client_id,visit_date,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_model,device_screen_resolution,device_browser,geo_country,geo_city,target_action
61,9055657327967035032.1629707931.1629707931,2.108435e+09,2021-08-23,11:00:00,1,nSReTmyFtbSjlPrTKoaX,banner,BVKxkCOHKUOvkpbrLMgZ,JNHcPlZPxEMWDnRiyoBf,,desktop,,,,1920x1080,YaBrowser,Russia,Moscow,0
62,905565977351442956.1622561294.1622561294,2.108435e+08,2021-06-01,18:00:00,1,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,,desktop,,,,1920x1080,Chrome,Russia,Moscow,0
71,9055788191321875859.1626006934.1626006934,2.108465e+09,2021-07-11,15:00:00,1,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,,desktop,,,,1280x720,Chrome,Russia,Moscow,0
72,9055788191321875859.1627377569.1627377569,2.108465e+09,2021-07-27,12:00:00,2,kjsLglQLzykiRbcDiGcD,cpc,nSReTmyFtbSjlPrTKoaX,JNHcPlZPxEMWDnRiyoBf,,desktop,,,,1920x1080,Chrome,Russia,Moscow,0
76,9055794766915104430.1624305327.1624305327,2.108467e+09,2021-06-21,22:00:00,1,BHcvLfOaCWvWTykYqHVe,cpc,,,,desktop,,,,834x1112,Safari,Russia,Domodedovo,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1732245,9055329372850890545.1629287172.1629287172,2.108358e+09,2021-08-18,14:00:00,3,BHcvLfOaCWvWTykYqHVe,cpc,,,,desktop,,,,1920x1080,YaBrowser,Russia,Moscow,0
1732246,9055343778173115129.1628265209.1628265209,2.108362e+09,2021-08-06,18:00:00,1,kjsLglQLzykiRbcDiGcD,organic,LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,,desktop,,,,1920x1080,Firefox,Russia,Moscow,0
1732247,9055345397369530035.1622009529.1622009529,2.108362e+09,2021-05-26,09:00:00,1,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,,desktop,,,,1536x864,Firefox,Russia,(not set),0
1732252,9055363711117247375.1629176721.1629176721,2.108366e+09,2021-08-17,08:00:00,1,PlbkrSYoHuZBWfYjYnfw,cpm,FTjNLDyTrXaWYgZymFkV,TuyPWsGQruPMpKvRxeBF,,desktop,,,,1920x1080,Chrome,Russia,(not set),0


In [39]:
com_df.loc[(com_df['device_brand'].isna()) & (com_df['device_os'].isna()) & (com_df['device_category'] == 'desktop'), 'device_os'] = 'Windows'

In [40]:
com_df[(com_df['device_brand'].isna()) & (com_df['device_os'].notna())]

Unnamed: 0,session_id,client_id,visit_date,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_model,device_screen_resolution,device_browser,geo_country,geo_city,target_action
61,9055657327967035032.1629707931.1629707931,2.108435e+09,2021-08-23,11:00:00,1,nSReTmyFtbSjlPrTKoaX,banner,BVKxkCOHKUOvkpbrLMgZ,JNHcPlZPxEMWDnRiyoBf,,desktop,Windows,,,1920x1080,YaBrowser,Russia,Moscow,0
62,905565977351442956.1622561294.1622561294,2.108435e+08,2021-06-01,18:00:00,1,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,,desktop,Windows,,,1920x1080,Chrome,Russia,Moscow,0
71,9055788191321875859.1626006934.1626006934,2.108465e+09,2021-07-11,15:00:00,1,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,,desktop,Windows,,,1280x720,Chrome,Russia,Moscow,0
72,9055788191321875859.1627377569.1627377569,2.108465e+09,2021-07-27,12:00:00,2,kjsLglQLzykiRbcDiGcD,cpc,nSReTmyFtbSjlPrTKoaX,JNHcPlZPxEMWDnRiyoBf,,desktop,Windows,,,1920x1080,Chrome,Russia,Moscow,0
76,9055794766915104430.1624305327.1624305327,2.108467e+09,2021-06-21,22:00:00,1,BHcvLfOaCWvWTykYqHVe,cpc,,,,desktop,Windows,,,834x1112,Safari,Russia,Domodedovo,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1732246,9055343778173115129.1628265209.1628265209,2.108362e+09,2021-08-06,18:00:00,1,kjsLglQLzykiRbcDiGcD,organic,LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,,desktop,Windows,,,1920x1080,Firefox,Russia,Moscow,0
1732247,9055345397369530035.1622009529.1622009529,2.108362e+09,2021-05-26,09:00:00,1,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,,desktop,Windows,,,1536x864,Firefox,Russia,(not set),0
1732252,9055363711117247375.1629176721.1629176721,2.108366e+09,2021-08-17,08:00:00,1,PlbkrSYoHuZBWfYjYnfw,cpm,FTjNLDyTrXaWYgZymFkV,TuyPWsGQruPMpKvRxeBF,,desktop,Windows,,,1920x1080,Chrome,Russia,(not set),0
1732254,9055376699099939975.1630766214.1630766214,2.108369e+09,2021-09-04,17:00:00,1,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,,desktop,Windows,,,1920x1080,Chrome,Russia,Khimki,1


In [41]:
com_df.loc[(com_df['device_brand'].isna()) & (com_df['device_os'] == 'Macintosh'), 'device_brand'] = 'Apple'

In [42]:
com_df.loc[(com_df['device_brand'].isna()) & (com_df['device_os'] == '(not set)') & (com_df['device_category'] == 'desktop'), 'device_brand'] = 'other_brand'

In [43]:
com_df.loc[(com_df['device_brand'].isna()) & (com_df['device_os'] == 'Chrome OS') & (com_df['device_category'] == 'desktop'), 'device_brand'] = 'other_brand'

In [44]:
com_df[(com_df['device_brand'].notna()) & (com_df['device_os'].isna())]

Unnamed: 0,session_id,client_id,visit_date,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_model,device_screen_resolution,device_browser,geo_country,geo_city,target_action
52,9055627555247652641.1623621462.1623621462,2.108428e+09,2021-06-14,00:00:00,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,JNHcPlZPxEMWDnRiyoBf,,mobile,,Vivo,,393x873,Chrome,Russia,Kazan,0
80,9055834796008808950.1622811129.1622811129,2.108476e+09,2021-06-04,15:00:00,1,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,,mobile,,(not set),,360x640,Chrome,Russia,Chelyabinsk,0
83,9055848638691110934.1625518111.1625518111,2.108479e+09,2021-07-05,23:00:00,1,MvfHsxITijuriZxsqZqt,cpm,FTjNLDyTrXaWYgZymFkV,dUuXlWzvmhDSyclWRhNP,,mobile,,BQ,,360x760,Chrome,Russia,Saint Petersburg,0
122,9056013556852379550.1632554912.1632554912,2.108517e+09,2021-09-25,10:00:00,1,RmEBuqrriAfAVsLQQmhk,cpc,VBmazutCflYumtDHrQYe,JNHcPlZPxEMWDnRiyoBf,,mobile,,(not set),,342x741,Firefox,Russia,Kazan,0
165,905625488421903154.1626168114.1626168114,2.108574e+08,2021-07-13,12:00:00,1,fDLlAcSmythWSCVMvqvL,(none),LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,,mobile,,OnePlus,,385x833,Chrome,Russia,Moscow,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1732183,9055095340088981199.1632401048.1632401048,2.108304e+09,2021-09-23,15:00:00,1,MvfHsxITijuriZxsqZqt,cpm,FTjNLDyTrXaWYgZymFkV,PkybGvWbaqORmxjNunqZ,,mobile,,Lenovo,,347x791,Chrome,Russia,Saint Petersburg,0
1732211,9055214413762615190.1632720617.1632720617,2.108331e+09,2021-09-27,08:00:00,1,kjsLglQLzykiRbcDiGcD,cpc,LwJZxKuWmvOhPsaCeRjG,,,mobile,,Vivo,,393x876,Chrome,Russia,Khimki,0
1732212,9055214413762615190.1632723136.1632723136,2.108331e+09,2021-09-27,09:00:00,2,kjsLglQLzykiRbcDiGcD,cpc,LwJZxKuWmvOhPsaCeRjG,,,mobile,,Vivo,,393x876,Chrome,Russia,Khimki,0
1732216,9055219700864907993.1630272218.1630272218,2.108333e+09,2021-08-30,00:00:00,1,ISrKoXQCxqqYvAZICvjs,post,ESphyUeLTPINiYALHWrO,JNHcPlZPxEMWDnRiyoBf,,mobile,,Sony,,360x640,Android Webview,Russia,Moscow,0


In [45]:
com_df.loc[(com_df['device_brand'].notna()) & (com_df['device_os'] == '(not set)') & (com_df['device_category'] == 'mobile'), 'device_os'] = 'Android'

In [46]:
com_df.loc[(com_df['device_brand'].notna()) & (com_df['device_os'].isna()) & (com_df['device_category'] == 'mobile'), 'device_os'] = 'Android'

In [47]:
com_df.loc[(com_df['device_brand'].notna()) & (com_df['device_os'].isna()) & (com_df['device_category'] == 'tablet'), 'device_os'] = 'Android'

In [48]:
com_df.loc[(com_df['device_brand'].notna()) & (com_df['device_os'].isna()) & (com_df['device_category'] == 'desktop'), 'device_os'] = 'Windows'

In [49]:
notset_data = com_df[(com_df['device_brand'].isna()) & (com_df['device_os'].isna())]
len(notset_data)

354

Удалил где бренд и ос пустые

In [50]:
# com_df = com_df[(com_df['device_brand'].notna()) & (com_df['device_os'].notna())]

In [51]:
com_df[(com_df['device_brand'].notna()) & (com_df['device_os'].isna())]

Unnamed: 0,session_id,client_id,visit_date,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_model,device_screen_resolution,device_browser,geo_country,geo_city,target_action


In [52]:
com_df.loc[(com_df['device_brand'].notna()) & (com_df['device_os'].isna()), 'device_os'] = 'Android'

In [53]:
com_df['device_os'].value_counts(dropna=False)

device_os
Android          881466
iOS              503499
Windows          318470
Macintosh         23454
Linux              4616
NaN                 354
(not set)           302
Chrome OS            65
BlackBerry           24
Tizen                 7
Firefox OS            3
Nokia                 2
Samsung               2
Windows Phone         2
Name: count, dtype: int64

In [54]:
com_df['device_brand'].value_counts(dropna=False)

device_brand
Apple       526948
Samsung     311641
Xiaomi      269251
NaN         241674
Huawei      173828
             ...  
Tesla            1
Vodafone         1
Wexler           1
Smarteo          1
Maxvi            1
Name: count, Length: 202, dtype: int64

In [57]:
basic_os = [
    'Android',
    'iOS',
    'Windows',
    'Macintosh',
    'Linux',
    'other_os'
]

other_os_list = [i_os for i_os in com_df['device_os'].values if i_os not in basic_os]
other_os_list
com_df['device_os'] = com_df['device_os'].replace(other_os_list, 'other_os')
com_df['device_os'].value_counts(dropna=False)

device_os
Android      881466
iOS          503499
Windows      318470
Macintosh     23454
Linux          4616
other_os        761
Name: count, dtype: int64

device_os
Android      881466
iOS          503499
Windows      318470
Macintosh     23454
Linux          4616
other_os        761
Name: count, dtype: int64

In [56]:
apple_desktop = com_df[(com_df['device_os'] == 'Macintosh')]
apple_desktop

Unnamed: 0,session_id,client_id,visit_date,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_model,device_screen_resolution,device_browser,geo_country,geo_city,target_action
215,905648595358313255.1638525736.1638525736,2.108627e+08,2021-12-03,13:02:16,1,kjsLglQLzykiRbcDiGcD,cpc,XHNUiSKKGTEpSAlaVMFQ,JNHcPlZPxEMWDnRiyoBf,nSReTmyFtbSjlPrTKoaX,desktop,Macintosh,Apple,,1440x900,Safari,Russia,Moscow,0
216,905648595358313255.1638525739.1638525739,2.108627e+08,2021-12-03,13:02:19,2,kjsLglQLzykiRbcDiGcD,cpc,,,nSReTmyFtbSjlPrTKoaX,desktop,Macintosh,Apple,,1440x900,Safari,Russia,Moscow,0
469,9057620093667786277.1635925543.1635925543,2.108892e+09,2021-11-03,10:45:43,1,kjsLglQLzykiRbcDiGcD,cpc,RoDitORHdzGfGhNCyEMy,,QRSrGMRlRfLHqzjtnKsP,desktop,Macintosh,Apple,,1440x900,Chrome,Russia,(not set),1
470,9057620093667786277.1635960751.1635960751,2.108892e+09,2021-11-03,20:32:31,2,kjsLglQLzykiRbcDiGcD,cpc,RoDitORHdzGfGhNCyEMy,,QRSrGMRlRfLHqzjtnKsP,desktop,Macintosh,Apple,,1440x900,Chrome,Russia,(not set),0
471,9057620093667786277.1636022422.1636022422,2.108892e+09,2021-11-04,13:40:22,3,kjsLglQLzykiRbcDiGcD,cpc,RoDitORHdzGfGhNCyEMy,,QRSrGMRlRfLHqzjtnKsP,desktop,Macintosh,Apple,,1440x900,Chrome,Russia,(not set),0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1731890,9053933615865089375.1637567841.1637567841,2.108033e+09,2021-11-22,10:57:21,1,vFcAhRxLfOWKhvxjELkx,organic,okTXSMadDkjvntEHzIjp,LLfCasrxQzJIyuldcuWy,aXQzDWsJuGXeBXexNHjc,desktop,Macintosh,Apple,,1920x1080,Chrome,Russia,Saint Petersburg,0
1731930,9054082268976121089.1635517699.1635517699,2.108068e+09,2021-10-29,17:28:19,1,jaSOmLICuBzCFqHfBdRg,email,bxOTvPtyGSdUrbwoXCPO,JNHcPlZPxEMWDnRiyoBf,puhZPIYqKXeFPaUviSjo,desktop,Macintosh,Apple,,1280x800,Chrome,Russia,Voronezh,0
1731976,9054215168154192730.1640549210.1640549217,2.108099e+09,2021-12-26,23:06:57,1,bByPQxmDaMXgpHeypKSM,referral,LTuZkdKfxRGVceoWkVyg,JNHcPlZPxEMWDnRiyoBf,UKNBOHebRIIsQRsjNKay,desktop,Macintosh,Apple,,1792x1120,Safari,Russia,Kursk,0
1732043,9054511349090867625.1632491945.1632491945,2.108168e+09,2021-09-24,16:59:05,1,jaSOmLICuBzCFqHfBdRg,email,YCKgTzTDywjcWyQudGch,nNqUcgFgcqQbTVSvgaHr,puhZPIYqKXeFPaUviSjo,desktop,Macintosh,Apple,,1792x1120,Safari,Russia,Moscow,0


In [58]:
com_df['device_brand'].value_counts(dropna=False).head(10)

device_brand
Apple          526948
Samsung        311641
Xiaomi         269251
NaN            241674
Huawei         173828
other_brand     82107
Realme          17926
(not set)       16392
OPPO            12775
Vivo            11026
Name: count, dtype: int64

In [59]:
com_df.loc[(com_df['device_brand'].isna()), 'device_brand'] = 'other_brand'

In [60]:
basic_brands = [
    'Apple',
    'Samsung',
    'Xiaomi',
    'Huawei',
    'Realme',
    'OPPO',
    'Vivo'
    'other_brand',
]

other_brands = [brand for brand in com_df['device_brand'].values if brand not in basic_brands]

com_df['device_brand'] = com_df['device_brand'].replace(list(set(other_brands)), 'other_brand')
com_df['device_brand'].value_counts(dropna=False)

device_brand
Apple          526948
other_brand    419897
Samsung        311641
Xiaomi         269251
Huawei         173828
Realme          17926
OPPO            12775
Name: count, dtype: int64

In [61]:
com_df['device_screen_resolution'].value_counts(dropna=False)

device_screen_resolution
414x896      155140
1920x1080    119214
393x851      107972
375x812      106961
360x780       86718
              ...  
620x1090          1
514x1129          1
496x600           1
841x421           1
464x1123          1
Name: count, Length: 4947, dtype: int64

In [62]:
# com_df['screen_square'] = com_df['device_screen_resolution'].apply(lambda x: int(x.split('x')[0]) * int(x.split('x')[1]))
# com_df['screen_square']

0           259200
1           328790
2           259200
3           308898
4           308898
            ...   
1732261    2073600
1732262     329160
1732263     250125
1732264     334443
1732265    1049088
Name: screen_square, Length: 1732266, dtype: int64

Изученине utm_

In [63]:
com_df['utm_source'].value_counts(dropna=False)

utm_source
ZpYIoDJMcFzVoPFsHGJL    552555
fDLlAcSmythWSCVMvqvL    277060
kjsLglQLzykiRbcDiGcD    245178
MvfHsxITijuriZxsqZqt    175831
BHcvLfOaCWvWTykYqHVe    110963
                         ...  
WRHPhoRKhKxaenRCLBfo         1
tjKUGseGMhkLEzdCqBRx         1
VdeFdoGCqZBxFfKSHNLl         1
RzLAoRYmCtVATSoPvWAQ         1
nVVduuqoxTOGBvvhENWW         1
Name: count, Length: 281, dtype: int64

In [64]:
other_source = [source[0] for source in com_df['utm_source'].value_counts(dropna=False).items() if source[1] < 10**3 or source[0] == np.nan]

com_df['utm_source'] = com_df['utm_source'].replace(list(set(other_source)), 'other_source')

com_df['utm_source'].value_counts(dropna=False)

utm_source
ZpYIoDJMcFzVoPFsHGJL    552555
fDLlAcSmythWSCVMvqvL    277060
kjsLglQLzykiRbcDiGcD    245178
MvfHsxITijuriZxsqZqt    175831
BHcvLfOaCWvWTykYqHVe    110963
bByPQxmDaMXgpHeypKSM     90356
QxAxdyPLuQMEcrdZWdWb     45267
aXQzDWsJuGXeBXexNHjc     29528
jaSOmLICuBzCFqHfBdRg     28288
RmEBuqrriAfAVsLQQmhk     27412
PlbkrSYoHuZBWfYjYnfw     20362
vFcAhRxLfOWKhvxjELkx     18320
hTjLvqNxGggkGnxSCaTm     14396
other_source             14357
gDBGzjFKYabGgSPZvrDH     13331
fgymSoTvjKPEgaIJqsiH      9557
geDcueAOghDzHkGMmdOq      8180
ISrKoXQCxqqYvAZICvjs      7839
nSReTmyFtbSjlPrTKoaX      5815
eLzNJHzPelJpEyBwMrKo      4578
IZEXUFLARCUMynmHNBGo      4248
iNFgfQPqHPBuvGCYtrQE      3222
gVRrcxiDQubJiljoTbGm      2733
SzZERoLMmrEUEhDaYcyN      2648
oZCzWSykfixnjMPDNjSU      2585
nmfptFmSirEqNzAzqbXA      2197
GpAkIXsclxDGyILfNlrR      2071
TxKUcPpthBDPieTGmVhx      2024
ghoaGAksqhKomdFrxgyJ      1859
KgicpPxiEQfzPlPwQZJq      1674
nrKihqcWGIzDsOqljdAv      1514
DnEUulZAecfGPvdtZBYS      13

In [65]:
com_df['utm_medium'].value_counts(dropna=False)

utm_medium
banner               525206
cpc                  399395
(none)               277060
cpm                  229791
referral             136851
organic               55009
email                 28287
push                  27710
stories               10472
cpv                    7815
blogger_channel        7731
smartbanner            6501
blogger_stories        4226
tg                     3920
cpa                    3266
post                   2227
outlook                1269
app                    1211
smm                    1194
clicks                  908
blogger_header          760
(not set)               405
info_text               335
sms                     194
landing                 127
partner                  95
link                     52
cbaafe                   43
CPM                      36
yandex_cpc               31
vk_smm                   25
static                   17
google_cpc               15
article                  15
web_polka                11
fb_smm   

In [66]:

other_medium = [medium[0] for medium in com_df['utm_medium'].value_counts(dropna=False).items() if medium[1] < 10**3 or medium[0] == '(none)']

com_df['utm_medium'] = com_df['utm_medium'].replace(list(set(other_medium)), 'other_medium')

com_df['utm_medium'].value_counts(dropna=False)
    

utm_medium
banner             525206
cpc                399395
other_medium       280185
cpm                229791
referral           136851
organic             55009
email               28287
push                27710
stories             10472
cpv                  7815
blogger_channel      7731
smartbanner          6501
blogger_stories      4226
tg                   3920
cpa                  3266
post                 2227
outlook              1269
app                  1211
smm                  1194
Name: count, dtype: int64

In [67]:
com_df.head()

Unnamed: 0,session_id,client_id,visit_date,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_model,device_screen_resolution,device_browser,geo_country,geo_city,target_action,screen_square
0,9055434745589932991.1637753792.1637753792,2108383000.0,2021-11-24,14:36:32,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,Huawei,,360x720,Chrome,Russia,Zlatoust,0,259200
1,905544597018549464.1636867290.1636867290,210838500.0,2021-11-14,08:21:30,1,MvfHsxITijuriZxsqZqt,cpm,FTjNLDyTrXaWYgZymFkV,xhoenQgDQsgfEPYNPwKO,IGUCNvHlhfHpROGclCit,mobile,Android,Samsung,,385x854,Samsung Internet,Russia,Moscow,0,328790
2,9055446045651783499.1640648526.1640648526,2108385000.0,2021-12-28,02:42:06,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,Huawei,,360x720,Chrome,Russia,Krasnoyarsk,0,259200
3,9055447046360770272.1622255328.1622255328,2108386000.0,2021-05-29,05:00:00,1,kjsLglQLzykiRbcDiGcD,cpc,,NOBKLgtuvqYWkXQHeYWM,,mobile,Android,Xiaomi,,393x786,Chrome,Russia,Moscow,0,308898
4,9055447046360770272.1622255345.1622255345,2108386000.0,2021-05-29,05:00:00,2,kjsLglQLzykiRbcDiGcD,cpc,,,,mobile,Android,Xiaomi,,393x786,Chrome,Russia,Moscow,0,308898


In [68]:
com_df['geo_city'].value_counts(dropna=False).head(20)

geo_city
Moscow              750928
Saint Petersburg    278402
(not set)            73297
Yekaterinburg        33555
Krasnodar            30260
Kazan                27689
Samara               23433
Nizhny Novgorod      20782
Ufa                  20283
Novosibirsk          20115
Krasnoyarsk          15283
Chelyabinsk          14923
Tula                 14774
Rostov-on-Don        13064
Voronezh             12701
Irkutsk              12596
Grozny               11949
Balashikha           11868
Vladivostok          11514
Yaroslavl             9143
Name: count, dtype: int64

In [69]:
other_city = [city[0] for city in com_df['geo_city'].value_counts(dropna=False).items() if city[1] < 10**3 or city[0] == '(not set)']

com_df['geo_city'] = com_df['geo_city'].replace(list(set(other_city)), 'other_city')

com_df['geo_city'].value_counts(dropna=False).head(20)

geo_city
Moscow              750928
Saint Petersburg    278402
other_city          164093
Yekaterinburg        33555
Krasnodar            30260
Kazan                27689
Samara               23433
Nizhny Novgorod      20782
Ufa                  20283
Novosibirsk          20115
Krasnoyarsk          15283
Chelyabinsk          14923
Tula                 14774
Rostov-on-Don        13064
Voronezh             12701
Irkutsk              12596
Grozny               11949
Balashikha           11868
Vladivostok          11514
Yaroslavl             9143
Name: count, dtype: int64

In [70]:
com_df['geo_country'].value_counts(dropna=False).head(20)

geo_country
Russia            1682423
Ukraine              8455
United States        8141
Belarus              3432
Kazakhstan           2109
Germany              2085
Ireland              1989
Turkey               1787
Sweden               1731
Netherlands          1451
Uzbekistan           1422
United Kingdom       1371
(not set)            1071
Kyrgyzstan            926
Georgia               881
France                742
Cyprus                706
Armenia               629
Finland               610
Spain                 548
Name: count, dtype: int64

In [71]:
other_country = [country[0] for country in com_df['geo_country'].value_counts(dropna=False).items() if country[1] < 10**4 or country[0] == '(not set)']

com_df['geo_country'] = com_df['geo_country'].replace(other_country, 'other_country')

com_df['geo_country'].value_counts(dropna=False).head()

geo_country
Russia           1682423
other_country      49843
Name: count, dtype: int64

In [72]:
# com_df['year'] = com_df['visit_date'].apply(lambda x: int(x.split('-')[0]))
#
# com_df['month'] = com_df['visit_date'].apply(lambda x: int(x.split('-')[1]))
#
# com_df['day'] = com_df['visit_date'].apply(lambda x: int(x.split('-')[2]))
#
# com_df['year'].value_counts()


year
2021    1732266
Name: count, dtype: int64

In [73]:
# com_df['hour'] = com_df['visit_time'].apply(lambda x: int(x.split(':')[0]))
#
# com_df['hour']

0          14
1           8
2           2
3           5
4           5
           ..
1732261    10
1732262     8
1732263    15
1732264    15
1732265    19
Name: hour, Length: 1732266, dtype: int64

In [74]:
com_df.head()

Unnamed: 0,session_id,client_id,visit_date,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,...,device_screen_resolution,device_browser,geo_country,geo_city,target_action,screen_square,year,month,day,hour
0,9055434745589932991.1637753792.1637753792,2108383000.0,2021-11-24,14:36:32,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,...,360x720,Chrome,Russia,other_city,0,259200,2021,11,24,14
1,905544597018549464.1636867290.1636867290,210838500.0,2021-11-14,08:21:30,1,MvfHsxITijuriZxsqZqt,cpm,FTjNLDyTrXaWYgZymFkV,xhoenQgDQsgfEPYNPwKO,IGUCNvHlhfHpROGclCit,...,385x854,Samsung Internet,Russia,Moscow,0,328790,2021,11,14,8
2,9055446045651783499.1640648526.1640648526,2108385000.0,2021-12-28,02:42:06,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,...,360x720,Chrome,Russia,Krasnoyarsk,0,259200,2021,12,28,2
3,9055447046360770272.1622255328.1622255328,2108386000.0,2021-05-29,05:00:00,1,kjsLglQLzykiRbcDiGcD,cpc,,NOBKLgtuvqYWkXQHeYWM,,...,393x786,Chrome,Russia,Moscow,0,308898,2021,5,29,5
4,9055447046360770272.1622255345.1622255345,2108386000.0,2021-05-29,05:00:00,2,kjsLglQLzykiRbcDiGcD,cpc,,,,...,393x786,Chrome,Russia,Moscow,0,308898,2021,5,29,5


In [75]:
# com_df.drop(columns=['year','client_id', 'visit_date', 'visit_time', 'utm_campaign', 'utm_adcontent', 'utm_keyword','device_screen_resolution'], axis=1, inplace=True)
# com_df.head(5)

In [78]:
# com_df[com_df['screen_square'].isna()]

Unnamed: 0,session_id,client_id,visit_date,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,...,device_screen_resolution,device_browser,geo_country,geo_city,target_action,screen_square,year,month,day,hour


In [79]:
# com_df['screen_square'].describe()

count    1.732266e+06
mean     6.091448e+05
std      7.051585e+05
min      0.000000e+00
25%      2.808000e+05
50%      3.344430e+05
75%      3.769800e+05
max      3.200000e+07
Name: screen_square, dtype: float64

Убираю выбросы в разрешении экрана

In [80]:
# def calculate_outliers(data):
#    q25 = data.screen_square.quantile(0.25)
#    q75 = data.screen_square.quantile(0.75)
#    iqr = q75 - q25
#
#    return (q25 - 1.5 * iqr, q75 + 1.5 * iqr)
#
# boundaries = calculate_outliers(com_df)
# is_outlier_min = (com_df.screen_square < boundaries[0])
# is_outlier_max = (com_df.screen_square > boundaries[1])
#
# is_outlier_min.sum()
# is_outlier_max.sum()

In [81]:
# round(is_outlier_min.sum() / len(com_df), 2)

In [82]:
# com_df.loc[is_outlier_min, 'screen_square'] = int(boundaries[0])
# com_df.loc[is_outlier_max, 'screen_square'] = int(boundaries[1])

In [83]:
# com_df['screen_square'].describe()

count    1.732266e+06
mean     6.091448e+05
std      7.051585e+05
min      0.000000e+00
25%      2.808000e+05
50%      3.344430e+05
75%      3.769800e+05
max      3.200000e+07
Name: screen_square, dtype: float64

In [84]:
# com_df.drop(columns=['year','hour', 'screen_square', 'day'], axis=1, inplace=True)

In [85]:
com_df.to_csv('data/clean_df.csv', index=False)