In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
#Importing Libraries
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import os as os
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from scipy import sparse
from scipy.sparse import hstack
from scipy.sparse import csc_matrix,csr_matrix
import scipy.sparse

In [3]:
#Loading all the files:
datadir='/content/drive/MyDrive/TalkingData/'
gender_age_train = pd.read_csv(os.path.join(datadir,'gender_age_train.csv')).drop(['gender','age'],axis=1)
gender_age_test = pd.read_csv(os.path.join(datadir,'gender_age_test.csv'))
phone_brand_device_model = pd.read_csv(os.path.join(datadir,'phone_brand_device_model.csv')).drop_duplicates('device_id',keep='first')
events_data = pd.read_csv(os.path.join(datadir,'events.csv'))
apps_events_data = pd.read_csv(os.path.join(datadir,'app_events.csv'), usecols=['event_id','app_id','is_active'],dtype={'is_active':bool})
apps_label_data = pd.read_csv(os.path.join(datadir,'app_labels.csv'))
label_categories_data = pd.read_csv(os.path.join(datadir,'label_categories.csv'))

In [4]:
print('gender_age_train.shape:',gender_age_train.shape)
print('gender_age_test.shape:',gender_age_test.shape)

gender_age_train.shape: (74645, 2)
gender_age_test.shape: (112071, 1)


In [5]:
gender_age_train.head()

Unnamed: 0,device_id,group
0,-8076087639492063270,M32-38
1,-2897161552818060146,M32-38
2,-8260683887967679142,M32-38
3,-4938849341048082022,M29-31
4,245133531816851882,M29-31


In [6]:
gender_age_test.head()

Unnamed: 0,device_id
0,1002079943728939269
1,-1547860181818787117
2,7374582448058474277
3,-6220210354783429585
4,-5893464122623104785


In [7]:
#Preprocessing-->Merging features to train data

#Task-1:
#All the DeviceIds has the features Phone brand & Phone brand data
#So we shall the merge the features to all the given Train & Test Data

gender_age_train=gender_age_train.merge(phone_brand_device_model,on='device_id') 
gender_age_test=gender_age_test.merge(phone_brand_device_model,on='device_id') 

#Task-2:
#Collecting Device Ids which has the Events Data and storing it in a list:
DeviceID_Appdata=apps_events_data
evdata_dict=pd.Series(events_data.device_id.values,index=events_data.event_id).to_dict()
DeviceID_Appdata['event_id']=DeviceID_Appdata['event_id'].map(evdata_dict)
DeviceID_Appdata=DeviceID_Appdata.rename(columns={'event_id':'device_id'})
Device_ids_with_Events=DeviceID_Appdata.device_id.unique()

#Task-3:
#Creating new columns 'has events'=1/0 which identifies if the Device has the events or not
#this feature is also used to train the data without events

gender_age_train['has_events']=gender_age_train.device_id.apply(lambda x:1 if x in Device_ids_with_Events else 0)  
gender_age_test['has_events']=gender_age_test.device_id.apply(lambda x:1 if x in Device_ids_with_Events else 0)  
#segregating the test data with noevents
gender_age_test_Noevents=gender_age_test.loc[gender_age_test['has_events']==0]
gender_age_test_Events=gender_age_test.loc[gender_age_test['has_events']==1]
#Notes:
#1.We use all the train data with phone brand & model features for training the models to predict test data without events
#2.While Handling we use only the train data which has events data for training  to predict the test data which has events
#3.The Train & Test Data gets filtered out into Data with Events when we are merging the event data to th Actual Training Dataset
#So no need of splitting the data in special.

In [8]:
gender_age_train.head()

Unnamed: 0,device_id,group,phone_brand,device_model,has_events
0,-8076087639492063270,M32-38,小米,MI 2,0
1,-2897161552818060146,M32-38,小米,MI 2,0
2,-8260683887967679142,M32-38,小米,MI 2,1
3,-4938849341048082022,M29-31,小米,红米note,0
4,245133531816851882,M29-31,小米,MI 3,0


In [9]:
gender_age_test_Events.head()

Unnamed: 0,device_id,phone_brand,device_model,has_events
0,1002079943728939269,小米,小米note,1
1,-1547860181818787117,小米,红米2,1
2,7374582448058474277,华为,Y523-L176,1
3,-6220210354783429585,华为,荣耀6,1
10,6873889408535437611,魅族,MX4 Pro,1


In [10]:
#Preprcoessing:
#Here we shall create a Dataframe of features for all the Device IDs availablee
#And then we just need to pick the Train Data and Test Data from it aa per the Device Ids mentioned in them.

#Task 1:
#Creating Hour Feature from the data available in events_data:
Device_Hour=events_data.drop(columns=['longitude','event_id','latitude'])
Device_Hour['timestamp']=pd.to_datetime(Device_Hour['timestamp'])
Device_Hour['Hour']=Device_Hour['timestamp'].apply(lambda x: x.hour)
Device_Hour=Device_Hour.groupby("device_id")["Hour"].apply(lambda x: " ".join(set(str(" ".join(str(s) for s in x)).split(" ")))).reset_index()

#Creating Day Feature from the data available in events_data:
Device_Day=events_data.drop(columns=['longitude','event_id','latitude'])
Device_Day['timestamp']=pd.to_datetime(Device_Day['timestamp'])
Device_Day['Day']=Device_Day['timestamp'].apply(lambda x: x.day)
Device_Day=Device_Day.drop(columns=['timestamp'])
Device_Day=Device_Day.groupby("device_id")['Day'].apply(lambda x: " ".join(set(str(" ".join(str(s) for s in x)).split(" ")))).reset_index()

#Creating App Label ID Feature from the data available in events_data:
apps_events_data = pd.read_csv(os.path.join(datadir,'app_events.csv'), usecols=['event_id','app_id','is_active'],dtype={'is_active':bool})
Device_App_label_Id=apps_events_data
apps_category=pd.merge(apps_label_data,label_categories_data,on='label_id',how='left')
label_id__dict=pd.Series(apps_category.label_id.values,index=apps_category.app_id).to_dict()
Device_App_label_Id['app_id']=Device_App_label_Id['app_id'].map(label_id__dict)
Device_App_label_Id=Device_App_label_Id.rename(columns={'app_id':'app_label_id'})
evdata_dict=pd.Series(events_data.device_id.values,index=events_data.event_id).to_dict()
Device_App_label_Id['event_id']=Device_App_label_Id['event_id'].map(evdata_dict)
Device_App_label_Id=Device_App_label_Id.rename(columns={'event_id':'device_id'})
Device_App_label_Id = Device_App_label_Id.groupby("device_id")["app_label_id"].apply(lambda x: " ".join(set(str(" ".join(str(s) for s in x)).split(" ")))).reset_index()


#Creating  App ID Feature from the data available in events_data:
apps_events_data = pd.read_csv(os.path.join(datadir,'app_events.csv'), usecols=['event_id','app_id','is_active'],dtype={'is_active':bool})
Device_App_Id=apps_events_data
Device_App_Id['event_id']=Device_App_Id['event_id'].map(evdata_dict)
Device_App_Id=Device_App_Id.rename(columns={'event_id':'device_id'})
Device_App_Id = Device_App_Id.groupby("device_id")["app_id"].apply(lambda x: " ".join(set(str(" ".join(str(s) for s in x)).split(" ")))).reset_index()




#Creating median Latitiude & Longitude along Missing Location Feature:
Device_lat_data = (events_data.groupby("device_id")["latitude"].apply(lambda x: np.median([float(s) for s in x])).to_frame().reset_index())
Device_long_data = (events_data.groupby("device_id")["longitude"].apply(lambda x: np.median([float(s) for s in x])).to_frame().reset_index())
Device_lat_long=Device_long_data.merge(Device_lat_data[['latitude']], how='left', left_index=True, right_index=True)
outlier_devices=Device_lat_long.loc[Device_lat_long['latitude']==0.00].loc[Device_long_data['longitude']==0.00]['device_id'].unique()
Device_lat_long['missing_location']=Device_lat_long.device_id.apply(lambda x:1 if x in outlier_devices else 0)

#Merging all above features into a single Dataframe:
Device_feature_data=Device_App_Id.merge(Device_App_label_Id,on='device_id').merge(Device_Hour,on='device_id').merge(Device_Day,on='device_id').merge(Device_lat_long,on='device_id')
Device_feature_data.head()

Unnamed: 0,device_id,app_id,app_label_id,Hour,Day,longitude,latitude,missing_location
0,-9222956879900151005,2689721421138748406 -1596342834117879984 -3369...,303 183 262 158 172 1007 548 230 209 152 854 4...,7 21 23 14 20 11 12 15 13,7 6,113.24,23.19,0
1,-9222661944218806987,5927333115845830913 628020936226491308 -790408...,262 172 178 548 27 232 204 128,22 21 0 19 18,7 1 3 5 4 2 6,0.0,0.0,1
2,-9222399302879214035,628020936226491308 -538061441862183033 5190837...,131 262 134 179 168 229 129 1007 704 177 135 7...,21 10 23 11 13,4 2 3 6,0.0,0.0,1
3,-9221825537663503111,628020936226491308 6284164581582112235 7316914...,183 262 172 100 134 549 1007 548 179 263 1012 ...,7 9 8 21 10 14 19 11 12 6 13,7 1 5 4 2 6,113.48,33.63,0
4,-9221767098072603291,628020936226491308 4296637564570566271 -572007...,303 262 172 694 144 302 548 549 207 178 179 13...,7 14 5 12 18 15 13,1 3 5 4 2,0.0,0.0,1


In [None]:
Train_Data_events = gender_age_train.merge(Device_feature_data,on='device_id')
Test_Data_events = gender_age_test_Events.merge(Device_feature_data,on='device_id')

In [None]:
Train_Data_events.head()

Unnamed: 0,device_id,group,phone_brand,device_model,has_events,app_id,app_label_id,is_active,Hour,Day,longitude,latitude,missing_location
0,-8260683887967679142,M32-38,小米,MI 2,1,-1442117569095077934 3987595867590771109 10887...,179 232 212 151 129 737 209 730 932 549 13 695...,47,14,1,0.0,0.0,1
1,7477216237379271436,F33-42,华为,荣耀6 plus,1,5786744307703945981 4348659952760821294 371704...,179 694 262 252 854 730 932 549 236 172 100 30...,44,14 18,6 4,119.57,31.75,0
2,6352067998666467520,M32-38,华为,荣耀畅玩4X,1,-7377004479023402858 4348659952760821294 51742...,549 969 1011 1019 179 172 183 262 704 251 1005...,42,9 10 17 18 12 22,5 3 4,0.0,0.0,1
3,1508636020748379883,F27-28,华为,荣耀畅玩4X,1,4348659952760821294 8483751493632839871 869396...,549 166 862 236 172 262 128 152 167 1010 131 2...,45,21 12 13 17,1 5 3 4,120.26,31.9,0
4,-6876541075223249434,M39+,魅族,魅蓝NOTE,1,-145658454108286043 -1633873313139722876 43486...,1011 266 207 204 179 232 212 151 694 187 262 2...,220,2 13 8 17 16,1 6 2 4,117.2,39.14,0


In [None]:
Test_Data_events.head()

Unnamed: 0,device_id,phone_brand,device_model,has_events,app_id,app_label_id,is_active,Hour,Day,longitude,latitude,missing_location
0,1002079943728939269,小米,小米note,1,-4532036554977283654 4348659952760821294 -1456...,179 1014 262 1020 730 172 1015 163 303 1017 87...,213,9 8 21 16 22,6 2 7 3 1 5,0.0,0.0,1
1,-1547860181818787117,小米,红米2,1,4348659952760821294 -3422244824740738044 -6212...,549 1008 854 231 221 172 262 704 1015 251 1007...,70,13 14 18 20,6 2 4 3 1,0.0,0.0,1
2,7374582448058474277,华为,Y523-L176,1,758245148452694196 8693964245073640147 -534578...,549 27 179 172 302 147 704 548 303,37,21 16 19,2 7 4,0.0,0.0,1
3,-6220210354783429585,华为,荣耀6,1,9220205176760015004 -7955927391436822972 -8581...,179 262 131 730 549 6 219 236 172 256 152 1015...,128,12 0 23 7,1 6 5 7,0.0,0.0,1
4,6873889408535437611,魅族,MX4 Pro,1,7875772580533910613 4348659952760821294 371704...,179 262 131 209 730 1019 549 756 236 172 256 1...,34,9,5,0.0,0.0,1


In [None]:
Test_Data_events.to_csv('/content/drive/MyDrive/TalkingData/Processed_Data/Test_Data_events.csv')

In [None]:
#1.Processing Phone Brand feature:
countvectorizer_brand=CountVectorizer()
countvectorizer_brand.fit(phone_brand_device_model['phone_brand'])
x_brand_tr=countvectorizer_brand.transform(Train_Data_events['phone_brand'])
x_brand_te=countvectorizer_brand.transform(Test_Data_events['phone_brand'])
print('x_brand_tr shape:',x_brand_tr.shape)
print('x_brand_te shape:',x_brand_te.shape)

#2.Processing Phone Brand feature:
countvectorizer_model=CountVectorizer()
countvectorizer_model.fit(phone_brand_device_model['device_model'])
x_model_tr=countvectorizer_model.transform(Train_Data_events['device_model'])
x_model_te=countvectorizer_model.transform(Test_Data_events['device_model'])
print('\nx_model_tr shape:',x_model_tr.shape)
print('x_model_te shape:',x_model_te.shape)

#3.Processing Target Group feature(Age&Gender of User):
labelencoder_grp=LabelEncoder()
labelencoder_grp.fit(Train_Data_events['group'])
Train_Data_events['group']=labelencoder_grp.transform(Train_Data_events['group'])

#4.Processing app_id feature:
countvectorizer_app_id=CountVectorizer()
countvectorizer_app_id.fit(Train_Data_events.app_id)
x_appid_tr=countvectorizer_app_id.transform(Train_Data_events.app_id)
x_appid_te=countvectorizer_app_id.transform(Test_Data_events.app_id)
print('\nx_appid_tr shape:',x_appid_tr.shape)
print('x_appid_te shape:',x_appid_te.shape)

#5.Processing app_label_id feature:
countvectorizer_label_id=CountVectorizer()
countvectorizer_label_id.fit(Train_Data_events.app_label_id)
x_label_id_tr=countvectorizer_label_id.transform(Train_Data_events.app_label_id)
x_label_id_te=countvectorizer_label_id.transform(Test_Data_events.app_label_id)
print('\nx_label_id_tr shape:',x_label_id_tr.shape)
print('x_label_id_te shape:',x_label_id_te.shape)

#6.Processing day feature:
countvectorizer_day=CountVectorizer()
countvectorizer_day.fit(Train_Data_events.Day)
x_day_tr=countvectorizer_day.transform(Train_Data_events.Day)
x_day_te=countvectorizer_day.transform(Test_Data_events.Day)
print('\nx_day_tr shape:',x_day_tr.shape)
print('x_day_te shape:',x_day_te.shape)

#7.Processing Hour feature:
countvectorizer_day=CountVectorizer()
countvectorizer_day.fit(Train_Data_events.Hour)
x_hour_tr=countvectorizer_day.transform(Train_Data_events.Hour)
x_hour_te=countvectorizer_day.transform(Test_Data_events.Hour)
print('\nx_hour_tr shape:',x_hour_tr.shape)
print('x_hour_te shape:',x_hour_te.shape)

#8.Processing Latitude feature:
std_scaler_lat=StandardScaler()
std_scaler_lat.fit(Train_Data_events['latitude'].values.reshape(-1,1))
X_event_lat_tr = std_scaler_lat.transform(Train_Data_events['latitude'].values.reshape(-1,1))
X_event_lat_te = std_scaler_lat.transform(Test_Data_events['latitude'].values.reshape(-1,1))
print('\nX_event_lat_tr shape:',X_event_lat_tr.shape)
print('X_event_lat_te shape:',X_event_lat_te.shape)

#9.Processing Longitude feature:
std_scaler_long=StandardScaler()
std_scaler_long.fit(Train_Data_events['longitude'].values.reshape(-1,1))
X_event_long_tr = std_scaler_long.transform(Train_Data_events['longitude'].values.reshape(-1,1))
X_event_long_te = std_scaler_long.transform(Test_Data_events['longitude'].values.reshape(-1,1))
print('\nX_event_long_tr shape:',X_event_long_tr.shape)
print('X_event_long_te shape:',X_event_long_te.shape)

#10.Processing Missing Location feature:
scaler=StandardScaler()
x_ml_tr=scaler.fit_transform(Train_Data_events.missing_location.values.reshape(-1,1))
x_ml_te=scaler.transform(Test_Data_events.missing_location.values.reshape(-1,1))
print('\nx_ml_tr shape:',x_ml_tr.shape)
print('x_ml_te shape:',x_ml_te.shape)


#11.Stacking all of the all type of features:

X_tr_events = hstack((x_brand_tr,x_model_tr,x_appid_tr,x_label_id_tr,x_hour_tr,x_day_tr,X_event_lat_tr,X_event_long_tr,x_ml_tr))
y_tr_events = np.array(Train_Data_events['group'])
y_tr_events=sparse.csr_matrix(y_tr_events)
X_te_events = hstack((x_brand_te,x_model_te,x_appid_te,x_label_id_te,x_hour_te,x_day_te,X_event_lat_te,X_event_long_te,x_ml_te))
print('\nX_tr_events shape:',X_tr_events.shape)
print('y_tr_events shape:',y_tr_events.shape)
print('X_te_events shape:',X_te_events.shape)

x_brand_tr shape: (23290, 131)
x_brand_te shape: (35172, 131)

x_model_tr shape: (23290, 1440)
x_model_te shape: (35172, 1440)

x_appid_tr shape: (23290, 13762)
x_appid_te shape: (35172, 13762)

x_label_id_tr shape: (23290, 303)
x_label_id_te shape: (35172, 303)

x_day_tr shape: (23290, 1)
x_day_te shape: (35172, 1)

x_hour_tr shape: (23290, 14)
x_hour_te shape: (35172, 14)

X_event_lat_tr shape: (23290, 1)
X_event_lat_te shape: (35172, 1)

X_event_long_tr shape: (23290, 1)
X_event_long_te shape: (35172, 1)

x_ml_tr shape: (23290, 1)
x_ml_te shape: (35172, 1)

X_tr_events shape: (23290, 15654)
y_tr_events shape: (1, 23290)
X_te_events shape: (35172, 15654)


In [None]:
#Saving the Data with noevents to drive:
sparse.save_npz('/content/drive/MyDrive/TalkingData/Processed_Data/X_tr_events.npz', X_tr_events)
sparse.save_npz('/content/drive/MyDrive/TalkingData/Processed_Data/X_te_events.npz', X_te_events)
sparse.save_npz('/content/drive/MyDrive/TalkingData/Processed_Data/y_tr_events.npz', y_tr_events)
gender_age_test_Events.to_csv('/content/drive/MyDrive/TalkingData/Processed_Data/gender_age_test_Events.csv')
#Loading:

#X_tr_events=sparse.load_npz('/content/drive/MyDrive/TalkingData/Processed_Data/X_tr_events.npz')
#X_te_events=sparse.load_npz('/content/drive/MyDrive/TalkingData/Processed_Data/X_te_events.npz')
#y_tr_events=sparse.load_npz('/content/drive/MyDrive/TalkingData/Processed_Data/y_tr_events.npz')
#y_tr_events=sparse.csr_matrix.toarray(y_tr_noevents)[0]
