In [None]:
#Importing Libraries
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import os as os
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from scipy import sparse
from scipy.sparse import hstack
from scipy.sparse import csc_matrix,csr_matrix
import scipy.sparse

In [None]:
#Loading all the raw TalkingData csv files from the mounted drive:
datadir='/content/drive/MyDrive/TalkingData/'

def _csv_path(filename):
    """Return the full path of `filename` inside `datadir`."""
    return os.path.join(datadir,filename)

#Train labels: the gender and age columns are dropped right after reading.
gender_age_train = pd.read_csv(_csv_path('gender_age_train.csv')).drop(columns=['gender','age'])
gender_age_test = pd.read_csv(_csv_path('gender_age_test.csv'))

#Phone brand / model table: keep only the first row for each device_id.
phone_brand_device_model = pd.read_csv(_csv_path('phone_brand_device_model.csv')).drop_duplicates(subset='device_id',keep='first')

#Events and the apps seen in each event (only three columns of app_events are read,
#with is_active parsed as bool).
events_data = pd.read_csv(_csv_path('events.csv'))
apps_events_data = pd.read_csv(
    _csv_path('app_events.csv'),
    usecols=['event_id','app_id','is_active'],
    dtype={'is_active':bool},
)

#App label tables.
apps_label_data = pd.read_csv(_csv_path('app_labels.csv'))
label_categories_data = pd.read_csv(_csv_path('label_categories.csv'))

In [None]:
#Preview the first 5 rows of the train data
gender_age_train.head(n=5)

Unnamed: 0,device_id,group
0,-8076087639492063270,M32-38
1,-2897161552818060146,M32-38
2,-8260683887967679142,M32-38
3,-4938849341048082022,M29-31
4,245133531816851882,M29-31


In [None]:
#Preview the first 5 rows of the test data
gender_age_test.head(n=5)

Unnamed: 0,device_id
0,1002079943728939269
1,-1547860181818787117
2,7374582448058474277
3,-6220210354783429585
4,-5893464122623104785


#Preprocessing:

In [None]:
#Preprocessing-->Merging features to train data

#Task-1:
#Per the author's note, all the device ids have the phone brand & device model features,
#so these features are merged onto all the given Train & Test data.
#(merge without `how` is an inner join on 'device_id'.)
gender_age_train=gender_age_train.merge(phone_brand_device_model,on='device_id')
gender_age_test=gender_age_test.merge(phone_brand_device_model,on='device_id')

#Task-2:
#Collecting the device ids which have events data.
#evdata_dict maps event_id -> device_id.
evdata_dict=pd.Series(events_data.device_id.values,index=events_data.event_id).to_dict()
#Work on a copy: a plain assignment would only alias apps_events_data, and the column
#overwrite below would then silently replace the raw event ids with device ids.
#Re-running the mapping on that overwritten column looks device ids up in an
#event_id -> device_id dict and produces NaN instead of device ids.
DeviceID_Appdata=apps_events_data.copy()
DeviceID_Appdata['event_id']=DeviceID_Appdata['event_id'].map(evdata_dict)
DeviceID_Appdata=DeviceID_Appdata.rename(columns={'event_id':'device_id'})
Device_ids_with_Events=DeviceID_Appdata.device_id.unique()

#Task-3:
#Creating the new column 'has_events'=1/0 which identifies if the device has events or not.
#This feature is also used to train on the data without events.
#Series.isin does the membership test in one vectorised pass instead of scanning the
#whole id array once per row.
gender_age_train['has_events']=gender_age_train.device_id.isin(Device_ids_with_Events).astype('int64')
gender_age_test['has_events']=gender_age_test.device_id.isin(Device_ids_with_Events).astype('int64')
#Segregating the test data into devices without events and devices with events
gender_age_test_Noevents=gender_age_test.loc[gender_age_test['has_events']==0]
gender_age_test_Events=gender_age_test.loc[gender_age_test['has_events']==1]
#Notes:
#1.We use all the train data with phone brand & model features for training the models to predict test data without events
#2.For the test data which has events, we use only the train data which has events data for training
#3.The Train & Test data get filtered down to the data with events when the event data is merged into the actual training dataset,
#so there is no need to split the train data separately here.

In [None]:
#Preview the first 5 rows of the train data after merging brand/model and adding has_events
gender_age_train.head(n=5)

Unnamed: 0,device_id,group,phone_brand,device_model,has_events
0,-8076087639492063270,M32-38,小米,MI 2,0
1,-2897161552818060146,M32-38,小米,MI 2,0
2,-8260683887967679142,M32-38,小米,MI 2,1
3,-4938849341048082022,M29-31,小米,红米note,0
4,245133531816851882,M29-31,小米,MI 3,0


In [None]:
#Preview the first 5 rows of the test data after merging brand/model and adding has_events
gender_age_test.head(n=5)

Unnamed: 0,device_id,phone_brand,device_model,has_events
0,1002079943728939269,小米,小米note,1
1,-1547860181818787117,小米,红米2,1
2,7374582448058474277,华为,Y523-L176,1
3,-6220210354783429585,华为,荣耀6,1
4,-5893464122623104785,小米,MI 2,0


In [None]:
#Preview the first 5 rows of the test devices that have no events
gender_age_test_Noevents.head(n=5)

Unnamed: 0,device_id,phone_brand,device_model,has_events
4,-5893464122623104785,小米,MI 2,0
5,-7560708697029818408,小米,MI 2,0
6,289797889702373958,小米,红米note,0
7,-402874006399730161,小米,红米note,0
8,5751283639860028129,三星,Galaxy S4,0


In [None]:
#Preprocessing-->already done in the preprocessing cell above
#
#The phone brand / device model merge and the has_events flag are computed once, in the
#preprocessing cell above, and must not be repeated here: merging phone_brand_device_model
#a second time makes pandas suffix the overlapping columns (phone_brand_x / phone_brand_y,
#device_model_x / device_model_y), so the 'phone_brand' and 'device_model' lookups below
#would raise KeyError.
#Instead, fail early with a clear message if the preprocessing cell has not been run.
_required_columns={'phone_brand','device_model','has_events'}
for _frame_name,_frame in (('gender_age_train',gender_age_train),
                           ('gender_age_test_Noevents',gender_age_test_Noevents)):
    _missing_columns=_required_columns-set(_frame.columns)
    assert not _missing_columns, (
        'Run the preprocessing cell first: %s is missing columns %s'
        % (_frame_name,sorted(_missing_columns))
    )


#Processing features of Train and test Data with NoEvents :
#https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
#Note:
#CountVectorizer is used here to turn the categorical phone brand and device model
#strings into sparse count features.
#On this data that gives 131 phone brand features & 1440 device model features
#(see the printed shapes) for training the models.
#NOTE(review): with its default settings CountVectorizer lowercases each string and splits it
#into word tokens of 2+ characters, so the columns are tokens rather than whole brand/model
#names - confirm this is the intended encoding.

#1.Phone brand feature: vocabulary is learnt from every known device, then applied to
#  the train data and to the test data without events.
countvectorizer_brand=CountVectorizer().fit(phone_brand_device_model['phone_brand'])
x_brand_tr=countvectorizer_brand.transform(gender_age_train['phone_brand'])
x_brand_te=countvectorizer_brand.transform(gender_age_test_Noevents['phone_brand'])
print('x_brand_tr shape:',x_brand_tr.shape)
print('x_brand_te shape:',x_brand_te.shape)

#2.Device model feature: same procedure as for the phone brand.
countvectorizer_model=CountVectorizer().fit(phone_brand_device_model['device_model'])
x_model_tr=countvectorizer_model.transform(gender_age_train['device_model'])
x_model_te=countvectorizer_model.transform(gender_age_test_Noevents['device_model'])
print('\nx_model_tr shape:',x_model_tr.shape)
print('x_model_te shape:',x_model_te.shape)

#3.Target group (age & gender of the user): the 'group' column of gender_age_train is
#  replaced in place by integer codes; labelencoder_grp keeps the code <-> label mapping.
labelencoder_grp=LabelEncoder().fit(gender_age_train['group'])
gender_age_train['group']=labelencoder_grp.transform(gender_age_train['group'])

#4.has_events feature as a sparse column of shape (n_rows, 1).
x_hasevents_tr=sparse.csr_matrix(gender_age_train['has_events'].values.reshape(-1,1))
x_hasevents_te=sparse.csr_matrix(gender_age_test_Noevents['has_events'].values.reshape(-1,1))
print('\nx_hasevents_tr shape:',x_hasevents_tr.shape)
print('x_hasevents_te shape:',x_hasevents_te.shape)

#5.Stacking the 3 types of features side by side (brand | model | has_events).
#  The target is kept as a sparse (1, n_rows) row so that it can be stored with save_npz.
X_tr_noevents = hstack([x_brand_tr,x_model_tr,x_hasevents_tr])
y_tr_noevents = sparse.csr_matrix(np.array(gender_age_train['group']))
X_te_noevents = hstack([x_brand_te,x_model_te,x_hasevents_te])
print('\nX_tr_noevents shape:',X_tr_noevents.shape)
print('y_tr_noevents shape:',y_tr_noevents.shape)
print('X_te_noevents shape:',X_te_noevents.shape)


x_brand_tr shape: (74645, 131)
x_brand_te shape: (76899, 131)

x_model_tr shape: (74645, 1440)
x_model_te shape: (76899, 1440)

x_hasevents_tr shape: (74645, 1)
x_hasevents_te shape: (76899, 1)

X_tr_noevents shape: (74645, 1572)
y_tr_noevents shape: (1, 74645)
X_te_noevents shape: (76899, 1572)


In [None]:
#Saving the Data with noevents to drive:
#The output folder is derived from datadir (same location as before) and is created when
#missing, so that the save calls do not fail on a drive without a Processed_Data folder.
processed_dir=os.path.join(datadir,'Processed_Data')
os.makedirs(processed_dir,exist_ok=True)

sparse.save_npz(os.path.join(processed_dir,'X_tr_noevents.npz'), X_tr_noevents)
sparse.save_npz(os.path.join(processed_dir,'X_te_noevents.npz'), X_te_noevents)
#y_tr_noevents is stored as a sparse (1, n_rows) row; the commented loader below shows how
#to turn it back into a 1-D array.
sparse.save_npz(os.path.join(processed_dir,'y_tr_noevents.npz'), y_tr_noevents)
#to_csv is called with its defaults, so the DataFrame index is written as the first column.
gender_age_test_Noevents.to_csv(os.path.join(processed_dir,'gender_age_test_Noevents.csv'))
#Loading:

#X_tr_noevents=sparse.load_npz('/content/drive/MyDrive/TalkingData/Processed_Data/X_tr_noevents.npz')
#X_te_noevents=sparse.load_npz('/content/drive/MyDrive/TalkingData/Processed_Data/X_te_noevents.npz')
#y_tr_noevents=sparse.load_npz('/content/drive/MyDrive/TalkingData/Processed_Data/y_tr_noevents.npz')
#y_tr_noevents=sparse.csr_matrix.toarray(y_tr_noevents)[0]