In [3]:
import os
import numpy as np
import pandas as pd
import json
from pandas.io.json import json_normalize
from ast import literal_eval
import random
import matplotlib.pyplot as plt

In [4]:
pd.set_option('display.max_columns', 500)

## Loading data

In [None]:
%%time

# The row indices to skip - make sure 0 is not included to keep the header!
skip_idx = random.sample(range(1, 1708338), 1608338)

In [None]:
%%time

# Read the data
data = pd.read_csv("Data/train.csv", skiprows=skip_idx)

In [None]:
print(data.shape)
data.head()

In [None]:
%%time

data.to_csv('modified_train.csv')

In [6]:
def load_df(csv_path='modified_train.csv', nrows=None):
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']
    
    df = pd.read_csv(csv_path, 
                     converters={column: json.loads for column in JSON_COLUMNS}, 
                     dtype={'fullVisitorId': 'str'}, # Important!!
                     nrows=nrows)
    
    for column in JSON_COLUMNS:
        column_as_df = json_normalize(df[column])
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
        
    df['hits'][df['hits'] == "[]"] = "[{}]"
    df['hits'] = df['hits'].apply(literal_eval).str[0]
    hits = pd.io.json.json_normalize(df['hits'])
    hits = hits.drop(['product', 'promotion'], axis=1)
    
    df = pd.concat([df, hits], axis=1, sort=False)
    df = df.drop(['customDimensions', 'hits', 'customMetrics', 'customVariables', 'experiment', 'publisher_infos'], axis=1)
    
    print(f"Shape: {df.shape}")
    return df

In [46]:
%%time

train = load_df(nrows = 99999)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Shape: (99999, 119)
Wall time: 5min 21s


In [47]:
train.head()

Unnamed: 0.1,Unnamed: 0,channelGrouping,date,fullVisitorId,socialEngagementType,visitId,visitNumber,visitStartTime,device.browser,device.browserSize,device.browserVersion,device.deviceCategory,device.flashVersion,device.isMobile,device.language,device.mobileDeviceBranding,device.mobileDeviceInfo,device.mobileDeviceMarketingName,device.mobileDeviceModel,device.mobileInputSelector,device.operatingSystem,device.operatingSystemVersion,device.screenColors,device.screenResolution,geoNetwork.city,geoNetwork.cityId,geoNetwork.continent,geoNetwork.country,geoNetwork.latitude,geoNetwork.longitude,geoNetwork.metro,geoNetwork.networkDomain,geoNetwork.networkLocation,geoNetwork.region,geoNetwork.subContinent,totals.bounces,totals.hits,totals.newVisits,totals.pageviews,totals.sessionQualityDim,totals.timeOnSite,totals.totalTransactionRevenue,totals.transactionRevenue,totals.transactions,totals.visits,trafficSource.adContent,trafficSource.adwordsClickInfo.adNetworkType,trafficSource.adwordsClickInfo.criteriaParameters,trafficSource.adwordsClickInfo.gclId,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.isTrueDirect,trafficSource.keyword,trafficSource.medium,trafficSource.referralPath,trafficSource.source,appInfo.exitScreenName,appInfo.landingScreenName,appInfo.screenDepth,appInfo.screenName,contentGroup.contentGroup1,contentGroup.contentGroup2,contentGroup.contentGroup3,contentGroup.contentGroup4,contentGroup.contentGroup5,contentGroup.contentGroupUniqueViews1,contentGroup.contentGroupUniqueViews2,contentGroup.contentGroupUniqueViews3,contentGroup.previousContentGroup1,contentGroup.previousContentGroup2,contentGroup.previousContentGroup3,contentGroup.previousContentGroup4,contentGroup.previousContentGroup5,dataSource,eCommerceAction.action_type,eCommerceAction.option,eCommerceAction.step,eventInfo.eventAction,eventInfo.eventCategory,eventInfo.eventLabel,exceptionInfo.isFatal,hitNumber,hour,isEntrance,isExit,isInteraction,item.currencyCode,latencyTracking.domContentLoadedTime,latencyTracking.domInteractiveTime,latencyTracking.domLatencyMetricsSample,latencyTracking.domainLookupTime,latencyTracking.pageDownloadTime,latencyTracking.pageLoadSample,latencyTracking.pageLoadTime,latencyTracking.redirectionTime,latencyTracking.serverConnectionTime,latencyTracking.serverResponseTime,latencyTracking.speedMetricsSample,minute,page.hostname,page.pagePath,page.pagePathLevel1,page.pagePathLevel2,page.pagePathLevel3,page.pagePathLevel4,page.pageTitle,page.searchCategory,page.searchKeyword,promotionActionInfo.promoIsClick,promotionActionInfo.promoIsView,referer,social.hasSocialSourceReferral,social.socialInteractionNetworkAction,social.socialNetwork,time,transaction.currencyCode,type
0,0,Direct,20171016,7992466427990357681,Not Socially Engaged,1508201613,1,1508201613,Chrome,not available in demo dataset,not available in demo dataset,mobile,not available in demo dataset,True,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Android,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Americas,United States,not available in demo dataset,not available in demo dataset,not available in demo dataset,windjammercable.net,not available in demo dataset,not available in demo dataset,Northern America,,2,1,2,1,38,,,,1,,,not available in demo dataset,,,,,(not set),True,,(none),,(direct),shop.googlemerchandisestore.com/google+redesig...,shop.googlemerchandisestore.com/home,0,shop.googlemerchandisestore.com/home,(not set),(not set),(not set),(not set),(not set),,,,(entrance),(entrance),(entrance),(entrance),(entrance),web,0,,1,,,,True,1,17,True,,True,,,,,,,,,,,,,53,shop.googlemerchandisestore.com,/home,/home,,,,Home,,,,True,https://www.googlemerchandisestore.com/,No,:,(not set),0,,PAGE
1,1,Referral,20171016,8349655975937271469,Not Socially Engaged,1508152478,1,1508152478,Chrome,not available in demo dataset,not available in demo dataset,desktop,not available in demo dataset,False,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Macintosh,not available in demo dataset,not available in demo dataset,not available in demo dataset,London,not available in demo dataset,Europe,United Kingdom,not available in demo dataset,not available in demo dataset,London,(not set),not available in demo dataset,England,Northern Europe,,2,1,2,1,9,,,,1,,,not available in demo dataset,,,,,(not set),,,referral,/a/google.com/nest-vision/dropcam-field-tester...,sites.google.com,shop.googlemerchandisestore.com/google+redesig...,shop.googlemerchandisestore.com/home,0,shop.googlemerchandisestore.com/home,(not set),(not set),(not set),(not set),(not set),,,,(entrance),(entrance),(entrance),(entrance),(entrance),web,0,,1,,,,True,1,4,True,,True,,,,,,,,,,,,,14,shop.googlemerchandisestore.com,/home,/home,,,,Home,,,,True,https://sites.google.com/a/google.com/nest-vis...,No,:,(not set),0,,PAGE
2,2,Organic Search,20171016,1332629902468998662,Not Socially Engaged,1508206208,1,1508206208,Chrome,not available in demo dataset,not available in demo dataset,desktop,not available in demo dataset,False,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Windows,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Europe,Denmark,not available in demo dataset,not available in demo dataset,not available in demo dataset,fullrate.ninja,not available in demo dataset,not available in demo dataset,Northern Europe,,2,1,2,1,15,,,,1,,,not available in demo dataset,,,,,(not set),,(not provided),organic,,google,shop.googlemerchandisestore.com/google+redesig...,shop.googlemerchandisestore.com/home,0,shop.googlemerchandisestore.com/home,(not set),(not set),(not set),(not set),(not set),,,,(entrance),(entrance),(entrance),(entrance),(entrance),web,0,,1,,,,True,1,19,True,,True,,,,,,,,,,,,,10,shop.googlemerchandisestore.com,/home,/home,,,,Home,,,,True,https://www.google.dk/,No,:,(not set),0,,PAGE
3,3,Organic Search,20171016,9592294493127192752,Not Socially Engaged,1508197277,1,1508197277,Safari,not available in demo dataset,not available in demo dataset,desktop,not available in demo dataset,False,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Macintosh,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Europe,United Kingdom,not available in demo dataset,not available in demo dataset,not available in demo dataset,virginm.net,not available in demo dataset,not available in demo dataset,Northern Europe,,2,1,2,1,11,,,,1,,,not available in demo dataset,,,,,(not set),,(not provided),organic,,google,shop.googlemerchandisestore.com/google+redesig...,shop.googlemerchandisestore.com/google+redesig...,0,shop.googlemerchandisestore.com/google+redesig...,(not set),Brands,(not set),(not set),(not set),,1.0,,(entrance),(entrance),(entrance),(entrance),(entrance),web,0,,1,,,,True,1,16,True,,True,USD,,,,,,,,,,,,41,shop.googlemerchandisestore.com,/google+redesign/shop+by+brand/youtube,/google+redesign/,/shop+by+brand/,/youtube,,YouTube | Shop by Brand | Google Merchandise S...,,,,,https://www.google.co.uk/,No,:,(not set),0,USD,PAGE
4,4,Organic Search,20171016,6338477365942527347,Not Socially Engaged,1508177911,1,1508177911,UC Browser,not available in demo dataset,not available in demo dataset,mobile,not available in demo dataset,True,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Android,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Asia,India,not available in demo dataset,not available in demo dataset,not available in demo dataset,(not set),not available in demo dataset,not available in demo dataset,Southern Asia,,2,1,2,1,73,,,,1,,,not available in demo dataset,,,,,(not set),,(not provided),organic,,google,shop.googlemerchandisestore.com/asearch.html,shop.googlemerchandisestore.com/google+redesig...,0,shop.googlemerchandisestore.com/google+redesig...,(not set),Brands,(not set),(not set),(not set),,1.0,,(entrance),(entrance),(entrance),(entrance),(entrance),web,0,,1,,,,True,1,11,True,,True,USD,,,,,,,,,,,,18,shop.googlemerchandisestore.com,/google+redesign/shop+by+brand/youtube,/google+redesign/,/shop+by+brand/,/youtube,,YouTube | Shop by Brand | Google Merchandise S...,,,,,https://www.google.co.in/,No,:,(not set),0,USD,PAGE


In [9]:
# train = train.iloc[:, 1:]

In [49]:
train_small.shape

(99999, 119)

In [50]:
train_small = train_small.iloc[:, 1:]

In [51]:
train_small.shape

(99999, 118)

In [53]:
train = train_small

In [52]:
train.to_csv('train.csv')

## Cleaning data

In [76]:
train.head()

Unnamed: 0,channelGrouping,date,fullVisitorId,socialEngagementType,visitId,visitNumber,visitStartTime,device.browser,device.browserSize,device.browserVersion,device.deviceCategory,device.flashVersion,device.isMobile,device.language,device.mobileDeviceBranding,device.mobileDeviceInfo,device.mobileDeviceMarketingName,device.mobileDeviceModel,device.mobileInputSelector,device.operatingSystem,device.operatingSystemVersion,device.screenColors,device.screenResolution,geoNetwork.city,geoNetwork.cityId,geoNetwork.continent,geoNetwork.country,geoNetwork.latitude,geoNetwork.longitude,geoNetwork.metro,geoNetwork.networkDomain,geoNetwork.networkLocation,geoNetwork.region,geoNetwork.subContinent,totals.bounces,totals.hits,totals.newVisits,totals.pageviews,totals.sessionQualityDim,totals.timeOnSite,totals.totalTransactionRevenue,totals.transactionRevenue,totals.transactions,totals.visits,trafficSource.adContent,trafficSource.adwordsClickInfo.adNetworkType,trafficSource.adwordsClickInfo.criteriaParameters,trafficSource.adwordsClickInfo.gclId,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.isTrueDirect,trafficSource.keyword,trafficSource.medium,trafficSource.referralPath,trafficSource.source,appInfo.exitScreenName,appInfo.landingScreenName,appInfo.screenDepth,appInfo.screenName,contentGroup.contentGroup1,contentGroup.contentGroup2,contentGroup.contentGroup3,contentGroup.contentGroup4,contentGroup.contentGroup5,contentGroup.contentGroupUniqueViews1,contentGroup.contentGroupUniqueViews2,contentGroup.contentGroupUniqueViews3,contentGroup.previousContentGroup1,contentGroup.previousContentGroup2,contentGroup.previousContentGroup3,contentGroup.previousContentGroup4,contentGroup.previousContentGroup5,dataSource,eCommerceAction.action_type,eCommerceAction.option,eCommerceAction.step,eventInfo.eventAction,eventInfo.eventCategory,eventInfo.eventLabel,exceptionInfo.isFatal,hitNumber,hour,isEntrance,isExit,isInteraction,item.currencyCode,latencyTracking.domContentLoadedTime,latencyTracking.domInteractiveTime,latencyTracking.domLatencyMetricsSample,latencyTracking.domainLookupTime,latencyTracking.pageDownloadTime,latencyTracking.pageLoadSample,latencyTracking.pageLoadTime,latencyTracking.redirectionTime,latencyTracking.serverConnectionTime,latencyTracking.serverResponseTime,latencyTracking.speedMetricsSample,minute,page.hostname,page.pagePath,page.pagePathLevel1,page.pagePathLevel2,page.pagePathLevel3,page.pagePathLevel4,page.pageTitle,page.searchCategory,page.searchKeyword,promotionActionInfo.promoIsClick,promotionActionInfo.promoIsView,referer,social.hasSocialSourceReferral,social.socialInteractionNetworkAction,social.socialNetwork,time,transaction.currencyCode,type
0,Direct,20171016,7992466427990357681,Not Socially Engaged,1508201613,1,1508201613,Chrome,not available in demo dataset,not available in demo dataset,mobile,not available in demo dataset,True,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Android,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Americas,United States,not available in demo dataset,not available in demo dataset,not available in demo dataset,windjammercable.net,not available in demo dataset,not available in demo dataset,Northern America,,2,1,2,1,38,,,,1,,,not available in demo dataset,,,,,(not set),True,,(none),,(direct),shop.googlemerchandisestore.com/google+redesig...,shop.googlemerchandisestore.com/home,0,shop.googlemerchandisestore.com/home,(not set),(not set),(not set),(not set),(not set),,,,(entrance),(entrance),(entrance),(entrance),(entrance),web,0,,1,,,,True,1,17,True,,True,,,,,,,,,,,,,53,shop.googlemerchandisestore.com,/home,/home,,,,Home,,,,True,https://www.googlemerchandisestore.com/,No,:,(not set),0,,PAGE
1,Referral,20171016,8349655975937271469,Not Socially Engaged,1508152478,1,1508152478,Chrome,not available in demo dataset,not available in demo dataset,desktop,not available in demo dataset,False,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Macintosh,not available in demo dataset,not available in demo dataset,not available in demo dataset,London,not available in demo dataset,Europe,United Kingdom,not available in demo dataset,not available in demo dataset,London,(not set),not available in demo dataset,England,Northern Europe,,2,1,2,1,9,,,,1,,,not available in demo dataset,,,,,(not set),,,referral,/a/google.com/nest-vision/dropcam-field-tester...,sites.google.com,shop.googlemerchandisestore.com/google+redesig...,shop.googlemerchandisestore.com/home,0,shop.googlemerchandisestore.com/home,(not set),(not set),(not set),(not set),(not set),,,,(entrance),(entrance),(entrance),(entrance),(entrance),web,0,,1,,,,True,1,4,True,,True,,,,,,,,,,,,,14,shop.googlemerchandisestore.com,/home,/home,,,,Home,,,,True,https://sites.google.com/a/google.com/nest-vis...,No,:,(not set),0,,PAGE
2,Organic Search,20171016,1332629902468998662,Not Socially Engaged,1508206208,1,1508206208,Chrome,not available in demo dataset,not available in demo dataset,desktop,not available in demo dataset,False,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Windows,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Europe,Denmark,not available in demo dataset,not available in demo dataset,not available in demo dataset,fullrate.ninja,not available in demo dataset,not available in demo dataset,Northern Europe,,2,1,2,1,15,,,,1,,,not available in demo dataset,,,,,(not set),,(not provided),organic,,google,shop.googlemerchandisestore.com/google+redesig...,shop.googlemerchandisestore.com/home,0,shop.googlemerchandisestore.com/home,(not set),(not set),(not set),(not set),(not set),,,,(entrance),(entrance),(entrance),(entrance),(entrance),web,0,,1,,,,True,1,19,True,,True,,,,,,,,,,,,,10,shop.googlemerchandisestore.com,/home,/home,,,,Home,,,,True,https://www.google.dk/,No,:,(not set),0,,PAGE
3,Organic Search,20171016,9592294493127192752,Not Socially Engaged,1508197277,1,1508197277,Safari,not available in demo dataset,not available in demo dataset,desktop,not available in demo dataset,False,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Macintosh,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Europe,United Kingdom,not available in demo dataset,not available in demo dataset,not available in demo dataset,virginm.net,not available in demo dataset,not available in demo dataset,Northern Europe,,2,1,2,1,11,,,,1,,,not available in demo dataset,,,,,(not set),,(not provided),organic,,google,shop.googlemerchandisestore.com/google+redesig...,shop.googlemerchandisestore.com/google+redesig...,0,shop.googlemerchandisestore.com/google+redesig...,(not set),Brands,(not set),(not set),(not set),,1.0,,(entrance),(entrance),(entrance),(entrance),(entrance),web,0,,1,,,,True,1,16,True,,True,USD,,,,,,,,,,,,41,shop.googlemerchandisestore.com,/google+redesign/shop+by+brand/youtube,/google+redesign/,/shop+by+brand/,/youtube,,YouTube | Shop by Brand | Google Merchandise S...,,,,,https://www.google.co.uk/,No,:,(not set),0,USD,PAGE
4,Organic Search,20171016,6338477365942527347,Not Socially Engaged,1508177911,1,1508177911,UC Browser,not available in demo dataset,not available in demo dataset,mobile,not available in demo dataset,True,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Android,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Asia,India,not available in demo dataset,not available in demo dataset,not available in demo dataset,(not set),not available in demo dataset,not available in demo dataset,Southern Asia,,2,1,2,1,73,,,,1,,,not available in demo dataset,,,,,(not set),,(not provided),organic,,google,shop.googlemerchandisestore.com/asearch.html,shop.googlemerchandisestore.com/google+redesig...,0,shop.googlemerchandisestore.com/google+redesig...,(not set),Brands,(not set),(not set),(not set),,1.0,,(entrance),(entrance),(entrance),(entrance),(entrance),web,0,,1,,,,True,1,11,True,,True,USD,,,,,,,,,,,,18,shop.googlemerchandisestore.com,/google+redesign/shop+by+brand/youtube,/google+redesign/,/shop+by+brand/,/youtube,,YouTube | Shop by Brand | Google Merchandise S...,,,,,https://www.google.co.in/,No,:,(not set),0,USD,PAGE


In [77]:
train_small = train

In [64]:
train_small = train_small.replace(['not available in demo dataset', '(not provided)', '(not set)', ''], np.NaN)

In [79]:
print(train_small.shape)

train_small = train_small.dropna(axis=1, how='all')

print(train_small.shape)

(99999, 118)
(99999, 118)


In [80]:
const_cols = [c for c in train_small.columns if train_small[c].nunique(dropna=False)==1 ]
const_cols

['socialEngagementType',
 'device.browserSize',
 'device.browserVersion',
 'device.flashVersion',
 'device.language',
 'device.mobileDeviceBranding',
 'device.mobileDeviceInfo',
 'device.mobileDeviceMarketingName',
 'device.mobileDeviceModel',
 'device.mobileInputSelector',
 'device.operatingSystemVersion',
 'device.screenColors',
 'device.screenResolution',
 'geoNetwork.cityId',
 'geoNetwork.latitude',
 'geoNetwork.longitude',
 'geoNetwork.networkLocation',
 'totals.visits',
 'trafficSource.adwordsClickInfo.criteriaParameters']

In [81]:
train_small = train_small.drop(columns = const_cols)
print(train_small.shape)

(99999, 99)


In [82]:
train_small['totals.transactionRevenue'] = train_small['totals.transactionRevenue'].fillna(0)
train_small['totals.totalTransactionRevenue'] = train_small['totals.totalTransactionRevenue'].fillna(0)
train_small['totals.transactions'] = train_small['totals.transactions'].fillna(0)

In [83]:
train_small.head()

Unnamed: 0,channelGrouping,date,fullVisitorId,visitId,visitNumber,visitStartTime,device.browser,device.deviceCategory,device.isMobile,device.operatingSystem,geoNetwork.city,geoNetwork.continent,geoNetwork.country,geoNetwork.metro,geoNetwork.networkDomain,geoNetwork.region,geoNetwork.subContinent,totals.bounces,totals.hits,totals.newVisits,totals.pageviews,totals.sessionQualityDim,totals.timeOnSite,totals.totalTransactionRevenue,totals.transactionRevenue,totals.transactions,trafficSource.adContent,trafficSource.adwordsClickInfo.adNetworkType,trafficSource.adwordsClickInfo.gclId,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.isTrueDirect,trafficSource.keyword,trafficSource.medium,trafficSource.referralPath,trafficSource.source,appInfo.exitScreenName,appInfo.landingScreenName,appInfo.screenDepth,appInfo.screenName,contentGroup.contentGroup1,contentGroup.contentGroup2,contentGroup.contentGroup3,contentGroup.contentGroup4,contentGroup.contentGroup5,contentGroup.contentGroupUniqueViews1,contentGroup.contentGroupUniqueViews2,contentGroup.contentGroupUniqueViews3,contentGroup.previousContentGroup1,contentGroup.previousContentGroup2,contentGroup.previousContentGroup3,contentGroup.previousContentGroup4,contentGroup.previousContentGroup5,dataSource,eCommerceAction.action_type,eCommerceAction.option,eCommerceAction.step,eventInfo.eventAction,eventInfo.eventCategory,eventInfo.eventLabel,exceptionInfo.isFatal,hitNumber,hour,isEntrance,isExit,isInteraction,item.currencyCode,latencyTracking.domContentLoadedTime,latencyTracking.domInteractiveTime,latencyTracking.domLatencyMetricsSample,latencyTracking.domainLookupTime,latencyTracking.pageDownloadTime,latencyTracking.pageLoadSample,latencyTracking.pageLoadTime,latencyTracking.redirectionTime,latencyTracking.serverConnectionTime,latencyTracking.serverResponseTime,latencyTracking.speedMetricsSample,minute,page.hostname,page.pagePath,page.pagePathLevel1,page.pagePathLevel2,page.pagePathLevel3,page.pagePathLevel4,page.pageTitle,page.searchCategory,page.searchKeyword,promotionActionInfo.promoIsClick,promotionActionInfo.promoIsView,referer,social.hasSocialSourceReferral,social.socialInteractionNetworkAction,social.socialNetwork,time,transaction.currencyCode,type
0,Direct,20171016,7992466427990357681,1508201613,1,1508201613,Chrome,mobile,True,Android,not available in demo dataset,Americas,United States,not available in demo dataset,windjammercable.net,not available in demo dataset,Northern America,,2,1,2,1,38,0,0,0,,,,,,,(not set),True,,(none),,(direct),shop.googlemerchandisestore.com/google+redesig...,shop.googlemerchandisestore.com/home,0,shop.googlemerchandisestore.com/home,(not set),(not set),(not set),(not set),(not set),,,,(entrance),(entrance),(entrance),(entrance),(entrance),web,0,,1,,,,True,1,17,True,,True,,,,,,,,,,,,,53,shop.googlemerchandisestore.com,/home,/home,,,,Home,,,,True,https://www.googlemerchandisestore.com/,No,:,(not set),0,,PAGE
1,Referral,20171016,8349655975937271469,1508152478,1,1508152478,Chrome,desktop,False,Macintosh,London,Europe,United Kingdom,London,(not set),England,Northern Europe,,2,1,2,1,9,0,0,0,,,,,,,(not set),,,referral,/a/google.com/nest-vision/dropcam-field-tester...,sites.google.com,shop.googlemerchandisestore.com/google+redesig...,shop.googlemerchandisestore.com/home,0,shop.googlemerchandisestore.com/home,(not set),(not set),(not set),(not set),(not set),,,,(entrance),(entrance),(entrance),(entrance),(entrance),web,0,,1,,,,True,1,4,True,,True,,,,,,,,,,,,,14,shop.googlemerchandisestore.com,/home,/home,,,,Home,,,,True,https://sites.google.com/a/google.com/nest-vis...,No,:,(not set),0,,PAGE
2,Organic Search,20171016,1332629902468998662,1508206208,1,1508206208,Chrome,desktop,False,Windows,not available in demo dataset,Europe,Denmark,not available in demo dataset,fullrate.ninja,not available in demo dataset,Northern Europe,,2,1,2,1,15,0,0,0,,,,,,,(not set),,(not provided),organic,,google,shop.googlemerchandisestore.com/google+redesig...,shop.googlemerchandisestore.com/home,0,shop.googlemerchandisestore.com/home,(not set),(not set),(not set),(not set),(not set),,,,(entrance),(entrance),(entrance),(entrance),(entrance),web,0,,1,,,,True,1,19,True,,True,,,,,,,,,,,,,10,shop.googlemerchandisestore.com,/home,/home,,,,Home,,,,True,https://www.google.dk/,No,:,(not set),0,,PAGE
3,Organic Search,20171016,9592294493127192752,1508197277,1,1508197277,Safari,desktop,False,Macintosh,not available in demo dataset,Europe,United Kingdom,not available in demo dataset,virginm.net,not available in demo dataset,Northern Europe,,2,1,2,1,11,0,0,0,,,,,,,(not set),,(not provided),organic,,google,shop.googlemerchandisestore.com/google+redesig...,shop.googlemerchandisestore.com/google+redesig...,0,shop.googlemerchandisestore.com/google+redesig...,(not set),Brands,(not set),(not set),(not set),,1.0,,(entrance),(entrance),(entrance),(entrance),(entrance),web,0,,1,,,,True,1,16,True,,True,USD,,,,,,,,,,,,41,shop.googlemerchandisestore.com,/google+redesign/shop+by+brand/youtube,/google+redesign/,/shop+by+brand/,/youtube,,YouTube | Shop by Brand | Google Merchandise S...,,,,,https://www.google.co.uk/,No,:,(not set),0,USD,PAGE
4,Organic Search,20171016,6338477365942527347,1508177911,1,1508177911,UC Browser,mobile,True,Android,not available in demo dataset,Asia,India,not available in demo dataset,(not set),not available in demo dataset,Southern Asia,,2,1,2,1,73,0,0,0,,,,,,,(not set),,(not provided),organic,,google,shop.googlemerchandisestore.com/asearch.html,shop.googlemerchandisestore.com/google+redesig...,0,shop.googlemerchandisestore.com/google+redesig...,(not set),Brands,(not set),(not set),(not set),,1.0,,(entrance),(entrance),(entrance),(entrance),(entrance),web,0,,1,,,,True,1,11,True,,True,USD,,,,,,,,,,,,18,shop.googlemerchandisestore.com,/google+redesign/shop+by+brand/youtube,/google+redesign/,/shop+by+brand/,/youtube,,YouTube | Shop by Brand | Google Merchandise S...,,,,,https://www.google.co.in/,No,:,(not set),0,USD,PAGE


In [84]:
keep_cols = ['channelGrouping', 'date', 'fullVisitorId', 'visitId', 'visitNumber', 'visitStartTime', 'device.browser',
             'device.deviceCategory', 'device.operatingSystem', 'geoNetwork.continent', 'geoNetwork.country', 
             'geoNetwork.region', 'totals.bounces', 'totals.hits', 'totals.newVisits', 'totals.pageviews', 
             'totals.sessionQualityDim', 'totals.timeOnSite', 'totals.totalTransactionRevenue', 
             'totals.transactionRevenue', 'totals.transactions', 'trafficSource.adContent',
             'trafficSource.adwordsClickInfo.adNetworkType', 'trafficSource.adwordsClickInfo.page', 
             'trafficSource.adwordsClickInfo.slot', 'trafficSource.campaign', 'trafficSource.keyword', 
             'trafficSource.source', 'appInfo.exitScreenName', 'contentGroup.contentGroup2', 
             'contentGroup.contentGroup3', 'eCommerceAction.action_type', 'eventInfo.eventAction']

In [85]:
train_small = train_small[keep_cols]

In [86]:
print(train_small.shape)

train_small.head()

(99999, 33)


Unnamed: 0,channelGrouping,date,fullVisitorId,visitId,visitNumber,visitStartTime,device.browser,device.deviceCategory,device.operatingSystem,geoNetwork.continent,geoNetwork.country,geoNetwork.region,totals.bounces,totals.hits,totals.newVisits,totals.pageviews,totals.sessionQualityDim,totals.timeOnSite,totals.totalTransactionRevenue,totals.transactionRevenue,totals.transactions,trafficSource.adContent,trafficSource.adwordsClickInfo.adNetworkType,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.keyword,trafficSource.source,appInfo.exitScreenName,contentGroup.contentGroup2,contentGroup.contentGroup3,eCommerceAction.action_type,eventInfo.eventAction
0,Direct,20171016,7992466427990357681,1508201613,1,1508201613,Chrome,mobile,Android,Americas,United States,not available in demo dataset,,2,1,2,1,38,0,0,0,,,,,(not set),,(direct),shop.googlemerchandisestore.com/google+redesig...,(not set),(not set),0,
1,Referral,20171016,8349655975937271469,1508152478,1,1508152478,Chrome,desktop,Macintosh,Europe,United Kingdom,England,,2,1,2,1,9,0,0,0,,,,,(not set),,sites.google.com,shop.googlemerchandisestore.com/google+redesig...,(not set),(not set),0,
2,Organic Search,20171016,1332629902468998662,1508206208,1,1508206208,Chrome,desktop,Windows,Europe,Denmark,not available in demo dataset,,2,1,2,1,15,0,0,0,,,,,(not set),(not provided),google,shop.googlemerchandisestore.com/google+redesig...,(not set),(not set),0,
3,Organic Search,20171016,9592294493127192752,1508197277,1,1508197277,Safari,desktop,Macintosh,Europe,United Kingdom,not available in demo dataset,,2,1,2,1,11,0,0,0,,,,,(not set),(not provided),google,shop.googlemerchandisestore.com/google+redesig...,Brands,(not set),0,
4,Organic Search,20171016,6338477365942527347,1508177911,1,1508177911,UC Browser,mobile,Android,Asia,India,not available in demo dataset,,2,1,2,1,73,0,0,0,,,,,(not set),(not provided),google,shop.googlemerchandisestore.com/asearch.html,Brands,(not set),0,


## Formatting data

In [87]:
print("Number of unique visitors in train set : ",train_small.fullVisitorId.nunique(), " out of rows : ",train_small.shape[0])
# print("Number of unique visitors in test set : ",test_df.fullVisitorId.nunique(), " out of rows : ",test_df.shape[0])
# print("Number of common visitors in train and test set : ",len(set(train_small.fullVisitorId.unique()).intersection(set(test_df.fullVisitorId.unique())) ))

Number of unique visitors in train set :  96387  out of rows :  99999


In [88]:
print(train_small['channelGrouping'].unique())

train_small['channelGrouping'] = train_small['channelGrouping'].astype('category')

['Direct' 'Referral' 'Organic Search' 'Affiliates' 'Paid Search' 'Social'
 'Display' '(Other)']


In [89]:
# # label encode the categorical variables and convert the numerical variables to float
# cat_cols = ["channelGrouping",
#             "device.browser", 
#             "device.deviceCategory", "device.operatingSystem", 
#             "geoNetwork.city", "geoNetwork.continent", 
#             "geoNetwork.country", "geoNetwork.metro",
#             "geoNetwork.networkDomain", "geoNetwork.region", 
#             "geoNetwork.subContinent", "trafficSource.adContent", 
#             "trafficSource.adwordsClickInfo.adNetworkType", 
#             "trafficSource.adwordsClickInfo.gclId", 
#             "trafficSource.adwordsClickInfo.page", 
#             "trafficSource.adwordsClickInfo.slot", "trafficSource.campaign",
#             "trafficSource.keyword", "trafficSource.medium", 
#             "trafficSource.referralPath", "trafficSource.source",
#             'trafficSource.adwordsClickInfo.isVideoAd', 'trafficSource.isTrueDirect']
# for col in cat_cols:
#     print(col)
#     lbl = preprocessing.LabelEncoder()
#     lbl.fit(list(train_df[col].values.astype('str')) + list(test_df[col].values.astype('str')))
#     train_df[col] = lbl.transform(list(train_df[col].values.astype('str')))
#     test_df[col] = lbl.transform(list(test_df[col].values.astype('str')))


In [90]:
train_small.memory_usage()

Index                                               80
channelGrouping                                 100383
date                                            799992
fullVisitorId                                   799992
visitId                                         799992
visitNumber                                     799992
visitStartTime                                  799992
device.browser                                  799992
device.deviceCategory                           799992
device.operatingSystem                          799992
geoNetwork.continent                            799992
geoNetwork.country                              799992
geoNetwork.region                               799992
totals.bounces                                  799992
totals.hits                                     799992
totals.newVisits                                799992
totals.pageviews                                799992
totals.sessionQualityDim                        799992
totals.tim

In [91]:
x = train_small['date']

x.dtype

dtype('int64')

In [92]:
year = []
month = []
day = []
for each in x:
    each = str(each)
    year.append(int(each[0:4]))
    month.append(int(each[4:6]))
    day.append(int(each[6:8]))

In [93]:
Year = pd.DataFrame(year).rename(columns = {0:'Year'})

print(Year.shape)
Year.head()

(99999, 1)


Unnamed: 0,Year
0,2017
1,2017
2,2017
3,2017
4,2017


In [94]:
Month = pd.DataFrame(month).rename(columns = {0:'Month'})

print(Month.shape)
Month.head()

(99999, 1)


Unnamed: 0,Month
0,10
1,10
2,10
3,10
4,10


In [95]:
Day = pd.DataFrame(day).rename(columns = {0:'Day'})

print(Day.shape)
Day.head()

(99999, 1)


Unnamed: 0,Day
0,16
1,16
2,16
3,16
4,16


In [96]:
train_small = pd.concat([train_small, Year, Month, Day], axis = 1)

train_small.head()

Unnamed: 0,channelGrouping,date,fullVisitorId,visitId,visitNumber,visitStartTime,device.browser,device.deviceCategory,device.operatingSystem,geoNetwork.continent,geoNetwork.country,geoNetwork.region,totals.bounces,totals.hits,totals.newVisits,totals.pageviews,totals.sessionQualityDim,totals.timeOnSite,totals.totalTransactionRevenue,totals.transactionRevenue,totals.transactions,trafficSource.adContent,trafficSource.adwordsClickInfo.adNetworkType,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.keyword,trafficSource.source,appInfo.exitScreenName,contentGroup.contentGroup2,contentGroup.contentGroup3,eCommerceAction.action_type,eventInfo.eventAction,Year,Month,Day
0,Direct,20171016,7992466427990357681,1508201613,1,1508201613,Chrome,mobile,Android,Americas,United States,not available in demo dataset,,2,1,2,1,38,0,0,0,,,,,(not set),,(direct),shop.googlemerchandisestore.com/google+redesig...,(not set),(not set),0,,2017,10,16
1,Referral,20171016,8349655975937271469,1508152478,1,1508152478,Chrome,desktop,Macintosh,Europe,United Kingdom,England,,2,1,2,1,9,0,0,0,,,,,(not set),,sites.google.com,shop.googlemerchandisestore.com/google+redesig...,(not set),(not set),0,,2017,10,16
2,Organic Search,20171016,1332629902468998662,1508206208,1,1508206208,Chrome,desktop,Windows,Europe,Denmark,not available in demo dataset,,2,1,2,1,15,0,0,0,,,,,(not set),(not provided),google,shop.googlemerchandisestore.com/google+redesig...,(not set),(not set),0,,2017,10,16
3,Organic Search,20171016,9592294493127192752,1508197277,1,1508197277,Safari,desktop,Macintosh,Europe,United Kingdom,not available in demo dataset,,2,1,2,1,11,0,0,0,,,,,(not set),(not provided),google,shop.googlemerchandisestore.com/google+redesig...,Brands,(not set),0,,2017,10,16
4,Organic Search,20171016,6338477365942527347,1508177911,1,1508177911,UC Browser,mobile,Android,Asia,India,not available in demo dataset,,2,1,2,1,73,0,0,0,,,,,(not set),(not provided),google,shop.googlemerchandisestore.com/asearch.html,Brands,(not set),0,,2017,10,16


In [97]:
train_small['date'] = train_small['date'].astype('str')

train_small['date'] = pd.to_datetime(train_small['date'], yearfirst=True)

In [98]:
y = train_small.index[((train_small['date'] == '2018-1-1') |
               ((train_small['date'] > '2018-2-12') & (train_small['date'] <= '2018-2-19')) |
               ((train_small['date'] > '2018-5-25') & (train_small['date'] <= '2018-5-28')) |
               ((train_small['date'] > '2018-6-29') & (train_small['date'] <= '2018-7-4')) |
               ((train_small['date'] > '2018-8-25') & (train_small['date'] <= '2018-8-31')) |
               ((train_small['date'] > '2018-9-1') & (train_small['date'] <= '2018-9-3')) |
               ((train_small['date'] > '2018-11-20') & (train_small['date'] <= '2018-11-28')) |
               ((train_small['date'] > '2018-12-20') & (train_small['date'] <= '2018-12-31')) |

               (train_small['date'] == '2017-1-1') |
               ((train_small['date'] > '2017-2-12') & (train_small['date'] <= '2017-2-19')) |
               ((train_small['date'] > '2017-5-26') & (train_small['date'] <= '2017-5-29')) |
               ((train_small['date'] > '2017-6-29') & (train_small['date'] <= '2017-7-4')) |
               ((train_small['date'] > '2017-8-25') & (train_small['date'] <= '2017-8-31')) |
               ((train_small['date'] > '2017-9-2') & (train_small['date'] <= '2017-9-4')) |
               ((train_small['date'] > '2017-11-21') & (train_small['date'] <= '2017-11-29')) |
               ((train_small['date'] > '2017-12-20') & (train_small['date'] <= '2017-12-31')) |

               (train_small['date'] == '2016-1-1') |
               ((train_small['date'] > '2016-2-12') & (train_small['date'] <= '2016-2-19')) |
               ((train_small['date'] > '2016-5-27') & (train_small['date'] <= '2016-5-30')) |
               ((train_small['date'] > '2016-6-29') & (train_small['date'] <= '2016-7-4')) |
               ((train_small['date'] > '2016-8-25') & (train_small['date'] <= '2016-8-31')) |
               ((train_small['date'] > '2016-9-2') & (train_small['date'] <= '2016-9-5')) |
               ((train_small['date'] > '2016-11-22') & (train_small['date'] <= '2016-11-30')) |
               ((train_small['date'] > '2016-12-20') & (train_small['date'] <= '2016-12-31')) |

               (train_small['date'] == '2019-1-1') |
               ((train_small['date'] > '2019-2-12') & (train_small['date'] <= '2016-2-19')))]

In [99]:
is_holiday = []
for index in range(0, train_small.shape[0]):
    if index in y:
        is_holiday.append(True)
    else:
        is_holiday.append(False)

In [100]:
is_holiday = pd.DataFrame(is_holiday).rename(columns = {0:'is_holiday'})

print(is_holiday.shape)
is_holiday.head()

(99999, 1)


Unnamed: 0,is_holiday
0,False
1,False
2,False
3,False
4,False


In [101]:
train_small = pd.concat([train_small, is_holiday], axis = 1)

In [102]:
train_small.head()

Unnamed: 0,channelGrouping,date,fullVisitorId,visitId,visitNumber,visitStartTime,device.browser,device.deviceCategory,device.operatingSystem,geoNetwork.continent,geoNetwork.country,geoNetwork.region,totals.bounces,totals.hits,totals.newVisits,totals.pageviews,totals.sessionQualityDim,totals.timeOnSite,totals.totalTransactionRevenue,totals.transactionRevenue,totals.transactions,trafficSource.adContent,trafficSource.adwordsClickInfo.adNetworkType,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.keyword,trafficSource.source,appInfo.exitScreenName,contentGroup.contentGroup2,contentGroup.contentGroup3,eCommerceAction.action_type,eventInfo.eventAction,Year,Month,Day,is_holiday
0,Direct,2017-10-16,7992466427990357681,1508201613,1,1508201613,Chrome,mobile,Android,Americas,United States,not available in demo dataset,,2,1,2,1,38,0,0,0,,,,,(not set),,(direct),shop.googlemerchandisestore.com/google+redesig...,(not set),(not set),0,,2017,10,16,False
1,Referral,2017-10-16,8349655975937271469,1508152478,1,1508152478,Chrome,desktop,Macintosh,Europe,United Kingdom,England,,2,1,2,1,9,0,0,0,,,,,(not set),,sites.google.com,shop.googlemerchandisestore.com/google+redesig...,(not set),(not set),0,,2017,10,16,False
2,Organic Search,2017-10-16,1332629902468998662,1508206208,1,1508206208,Chrome,desktop,Windows,Europe,Denmark,not available in demo dataset,,2,1,2,1,15,0,0,0,,,,,(not set),(not provided),google,shop.googlemerchandisestore.com/google+redesig...,(not set),(not set),0,,2017,10,16,False
3,Organic Search,2017-10-16,9592294493127192752,1508197277,1,1508197277,Safari,desktop,Macintosh,Europe,United Kingdom,not available in demo dataset,,2,1,2,1,11,0,0,0,,,,,(not set),(not provided),google,shop.googlemerchandisestore.com/google+redesig...,Brands,(not set),0,,2017,10,16,False
4,Organic Search,2017-10-16,6338477365942527347,1508177911,1,1508177911,UC Browser,mobile,Android,Asia,India,not available in demo dataset,,2,1,2,1,73,0,0,0,,,,,(not set),(not provided),google,shop.googlemerchandisestore.com/asearch.html,Brands,(not set),0,,2017,10,16,False


In [103]:
train_small['visitId'] = train_small['visitId'].astype('int')

In [104]:
train_small['visitNumber'] = train_small['visitNumber'].astype('int')

In [108]:
browsers = ['Chrome', 'Safari', 'Samsung Internet', 'Firefox', 'Internet Explorer', 'Edge']

# for index in range(0, train_small.shape[0]):
#     if (train_small.iloc[index, 6] in browsers):
#         train_small.iloc[index, 6] = train_small.iloc[index, 6]
#     elif (train_small.iloc[index, 6] == 'Safari (in-app)'):
#         train_small.iloc[index, 6] = 'Safari'
#     elif (train_small.iloc[index, 6] == 'Mozilla'):
#         train_small.iloc[index, 6] = 'Firefox'
#     else:
#         train_small.iloc[index, 6] = 'Other'

# # print(train_small['device.browser'].unique())
# # train_small['device.browser'] = train_small['device.browser'].astype('category')

In [106]:
train_small.to_csv('train_updated.csv')

In [117]:
train_small2 = pd.read_csv('train_updated.csv')
train_small2.shape

  interactivity=interactivity, compiler=compiler, result=result)


(99999, 38)

In [118]:
train_small2.head()

Unnamed: 0.1,Unnamed: 0,channelGrouping,date,fullVisitorId,visitId,visitNumber,visitStartTime,device.browser,device.deviceCategory,device.operatingSystem,geoNetwork.continent,geoNetwork.country,geoNetwork.region,totals.bounces,totals.hits,totals.newVisits,totals.pageviews,totals.sessionQualityDim,totals.timeOnSite,totals.totalTransactionRevenue,totals.transactionRevenue,totals.transactions,trafficSource.adContent,trafficSource.adwordsClickInfo.adNetworkType,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.keyword,trafficSource.source,appInfo.exitScreenName,contentGroup.contentGroup2,contentGroup.contentGroup3,eCommerceAction.action_type,eventInfo.eventAction,Year,Month,Day,is_holiday
0,0,Direct,2017-10-16,7992466427990357681,1508201613,1,1508201613,Chrome,mobile,Android,Americas,United States,not available in demo dataset,,2,1.0,2.0,1.0,38.0,0,0,0,,,,,(not set),,(direct),shop.googlemerchandisestore.com/google+redesig...,(not set),(not set),0.0,,2017,10,16,False
1,1,Referral,2017-10-16,8349655975937271469,1508152478,1,1508152478,Chrome,desktop,Macintosh,Europe,United Kingdom,England,,2,1.0,2.0,1.0,9.0,0,0,0,,,,,(not set),,sites.google.com,shop.googlemerchandisestore.com/google+redesig...,(not set),(not set),0.0,,2017,10,16,False
2,2,Organic Search,2017-10-16,1332629902468998662,1508206208,1,1508206208,Chrome,desktop,Windows,Europe,Denmark,not available in demo dataset,,2,1.0,2.0,1.0,15.0,0,0,0,,,,,(not set),(not provided),google,shop.googlemerchandisestore.com/google+redesig...,(not set),(not set),0.0,,2017,10,16,False
3,3,Organic Search,2017-10-16,9592294493127192752,1508197277,1,1508197277,Safari,desktop,Macintosh,Europe,United Kingdom,not available in demo dataset,,2,1.0,2.0,1.0,11.0,0,0,0,,,,,(not set),(not provided),google,shop.googlemerchandisestore.com/google+redesig...,Brands,(not set),0.0,,2017,10,16,False
4,4,Organic Search,2017-10-16,6338477365942527347,1508177911,1,1508177911,UC Browser,mobile,Android,Asia,India,not available in demo dataset,,2,1.0,2.0,1.0,73.0,0,0,0,,,,,(not set),(not provided),google,shop.googlemerchandisestore.com/asearch.html,Brands,(not set),0.0,,2017,10,16,False


In [119]:
train_small = train_small2.iloc[:,1:]

In [120]:
device_browser = []

for each in train_small['device.browser']:
    if each in browsers:
        device_browser.append(each)
    elif each == 'Safari (in-app)':
        device_browser.append('Safari')
    elif each == 'Mozilla':
        device_browser.append('Firefox')
    else:
        device_browser.append('Other')

device_browser = pd.DataFrame(device_browser).rename(columns = {0:'device_browser'})      

train_small = pd.concat([train_small, device_browser], axis = 1)
train_small = train_small.drop(columns=['device.browser'])

# train_small['device.browser'] = train_small['device.browser'].astype('category')

In [121]:
train_small.head()

Unnamed: 0,channelGrouping,date,fullVisitorId,visitId,visitNumber,visitStartTime,device.deviceCategory,device.operatingSystem,geoNetwork.continent,geoNetwork.country,geoNetwork.region,totals.bounces,totals.hits,totals.newVisits,totals.pageviews,totals.sessionQualityDim,totals.timeOnSite,totals.totalTransactionRevenue,totals.transactionRevenue,totals.transactions,trafficSource.adContent,trafficSource.adwordsClickInfo.adNetworkType,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.keyword,trafficSource.source,appInfo.exitScreenName,contentGroup.contentGroup2,contentGroup.contentGroup3,eCommerceAction.action_type,eventInfo.eventAction,Year,Month,Day,is_holiday,device_browser
0,Direct,2017-10-16,7992466427990357681,1508201613,1,1508201613,mobile,Android,Americas,United States,not available in demo dataset,,2,1.0,2.0,1.0,38.0,0,0,0,,,,,(not set),,(direct),shop.googlemerchandisestore.com/google+redesig...,(not set),(not set),0.0,,2017,10,16,False,Chrome
1,Referral,2017-10-16,8349655975937271469,1508152478,1,1508152478,desktop,Macintosh,Europe,United Kingdom,England,,2,1.0,2.0,1.0,9.0,0,0,0,,,,,(not set),,sites.google.com,shop.googlemerchandisestore.com/google+redesig...,(not set),(not set),0.0,,2017,10,16,False,Chrome
2,Organic Search,2017-10-16,1332629902468998662,1508206208,1,1508206208,desktop,Windows,Europe,Denmark,not available in demo dataset,,2,1.0,2.0,1.0,15.0,0,0,0,,,,,(not set),(not provided),google,shop.googlemerchandisestore.com/google+redesig...,(not set),(not set),0.0,,2017,10,16,False,Chrome
3,Organic Search,2017-10-16,9592294493127192752,1508197277,1,1508197277,desktop,Macintosh,Europe,United Kingdom,not available in demo dataset,,2,1.0,2.0,1.0,11.0,0,0,0,,,,,(not set),(not provided),google,shop.googlemerchandisestore.com/google+redesig...,Brands,(not set),0.0,,2017,10,16,False,Safari
4,Organic Search,2017-10-16,6338477365942527347,1508177911,1,1508177911,mobile,Android,Asia,India,not available in demo dataset,,2,1.0,2.0,1.0,73.0,0,0,0,,,,,(not set),(not provided),google,shop.googlemerchandisestore.com/asearch.html,Brands,(not set),0.0,,2017,10,16,False,Other


In [None]:
# remove 'date', 'visitStartTime', 'visitId', 'fullVisitorId'

In [122]:
train_small.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99999 entries, 0 to 99998
Data columns (total 37 columns):
channelGrouping                                 99999 non-null object
date                                            99999 non-null object
fullVisitorId                                   99999 non-null object
visitId                                         99999 non-null int64
visitNumber                                     99999 non-null int64
visitStartTime                                  99999 non-null int64
device.deviceCategory                           99999 non-null object
device.operatingSystem                          99999 non-null object
geoNetwork.continent                            99999 non-null object
geoNetwork.country                              99999 non-null object
geoNetwork.region                               99999 non-null object
totals.bounces                                  50838 non-null float64
totals.hits                                     99999

In [None]:
train_small['totals.bounces'] = train_small['totals.bounces'].fillna(0)

In [None]:
train_small.head()

In [None]:
basket = []
for each in train_small['appInfo.exitScreenName']:
    each = str(each)
    if 'basket' in each:
        basket.append(True)
    else:
        basket.append(False)

In [None]:
basket

In [None]:
count = 0
for each in basket:
    if each == True:
        count += 1

In [None]:
count

In [None]:
Basket = pd.DataFrame(basket).rename(columns = {0:'Basket'})

In [None]:
Basket.head()

In [None]:
train_small = pd.concat([train_small, Basket], axis = 1)

In [None]:
train_small.head()

In [None]:
train_small = train_small.drop(['date', 'appInfo.exitScreenName'], axis = 1 )

In [None]:
train_small.head()

In [None]:
train_small['trafficSource.source'].unique()

In [None]:
train_small['eventInfo.eventAction'].unique()

In [None]:
train_small = train_small.drop('eCommerceAction.action_type', axis = 1)

In [None]:
train_small.head()

In [None]:
#plot data
fig, ax = plt.subplots(figsize=(10,7))
train_small.groupby(['Month', 'channelGrouping']).count()['totals.totalTransactionRevenue'].unstack().plot(ax=ax)
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(10,7))
train_small.groupby(['Year', 'channelGrouping']).count()['totals.totalTransactionRevenue'].unstack().plot(ax=ax)
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(10,7))
train_small.groupby(['Day', 'channelGrouping']).count()['totals.totalTransactionRevenue'].unstack().plot(ax=ax)
plt.show()