# EI ST4

## Imports

In [1]:
import pandas as pd
from datetime import datetime
import math

In [2]:
df = pd.read_csv(r"DS_CentraleSupelec_ST42021/DS_CentraleSupelec_train.csv")

## Cleaning up the dataframe

In [3]:
df

Unnamed: 0,userId,age,gender,registrationDate,languageCode,countryCode,emailContactable,postalContactable,postalCode,numberChildren,...,detergentType,MrPropreTrier,AntikalTrier,ArielTrier,DashTrier,scentLover,petOwner,ecoInterest,closestShop,washDishes
0,use_XbA1FTDcCrTMNTHK1851TzjyPMP,24.0,F,2019-10-14T17:33:37Z,fr,FRA,True,True,68540,1,...,Liquid,,,,,NonUser,Yes,,,Auto
1,use_3WHgsMVGSg5MHG2zja91TzdfmY2,67.0,F,2017-07-25T07:00:31Z,fr,FRA,True,True,17390,3,...,Pods,Known Trier,Known Trier,Known Trier,,,No,High,,Auto
2,use_2SisOiR8QwDaHI4svm11TzcvK7V,45.0,F,2015-06-25T00:00:00Z,fr,FRA,True,True,30200,1,...,Liquid,,,Known Trier,Known Trier,NonUser,Yes,High,,Auto
3,use_W0T1LmfKaQPJYD1RTWh1Tzdc69H,35.0,F,2018-01-31T07:24:39Z,fr,FRA,True,True,61400,2,...,Pods,,,Known Trier,Known Trier,Low,Yes,Medium,,Hand
4,use_KwYWZ7UMl4veveOaPGK1TzddtmI,38.0,F,2016-01-26T11:42:17Z,fr,FRA,True,True,30620,0,...,Liquid,Known Trier,Known Trier,Known Trier,Known Trier,,Yes,Low,,Hand
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11995,use_ICSyEvCcRNPpQeBdWcM1TzdczV8,53.0,F,2015-03-11T20:03:16Z,fr,FRA,True,True,77240,4+,...,Pods,,,,,,Yes,,,Auto
11996,use_Q5DNYeSmG7g6VeMV4n71TzeTRRz,43.0,F,2015-10-10T19:16:19Z,fr,FRA,True,True,45120,0,...,Liquid,,,,,Low,Yes,Medium,,Hand
11997,use_rEoce5pc0MlKpndfzNj1Tze3J1L,42.0,F,2015-05-08T02:59:34Z,fr,FRA,True,True,35170,3,...,Liquid,,,,,,Yes,High,,Hand
11998,use_0AuDL2wOJHy9I16zSC21TzeHRMo,39.0,M,2019-05-20T11:59:02Z,fr,FRA,True,True,45160,1,...,Liquid,,,,,,Yes,,,Auto


Get unique count for each variable

In [4]:
df.nunique()

userId               12000
age                     74
gender                   2
registrationDate      9582
languageCode             2
countryCode              1
emailContactable         2
postalContactable        2
postalCode            3748
numberChildren          10
lastActivityDate     11612
reactivationValue        5
toothBrushType           2
detergentType            3
MrPropreTrier            1
AntikalTrier             1
ArielTrier               1
DashTrier                1
scentLover               4
petOwner                 4
ecoInterest              4
closestShop              6
washDishes               2
dtype: int64

Check variable data type

In [5]:
df.dtypes

userId                object
age                  float64
gender                object
registrationDate      object
languageCode          object
countryCode           object
emailContactable      object
postalContactable     object
postalCode            object
numberChildren        object
lastActivityDate      object
reactivationValue     object
toothBrushType        object
detergentType         object
MrPropreTrier         object
AntikalTrier          object
ArielTrier            object
DashTrier             object
scentLover            object
petOwner              object
ecoInterest           object
closestShop           object
washDishes            object
dtype: object

We can remove the `languageCode` and `countryCode` column as they are constant

In [6]:
df[~df.languageCode.isin(['fr', 'FR'])].empty and df[~df.countryCode.eq('FRA')].empty

False

In [7]:
df = df.drop(columns=['languageCode', 'countryCode'])

We will also remove the `registrationDate`, `reactivationValue`, `emailContactable` and `postalContactable` as they are irrelevant

In [8]:
df = df.drop(columns=['registrationDate', 'reactivationValue', 'emailContactable', 'postalContactable'])

We are going to replace the `washDishes` `STRING` column with an `INT` column

In [9]:
df['ownsDishwasher'] = df.apply(lambda row: 1 if row['washDishes'] == 'Auto' else -1, axis=1)
df = df.drop(columns=["washDishes"])

In the `MrPropreTrier`, `AntikalTrier`, `ArielTrier`, `DashTrier` we will replace `Known Trier` with `1` and `Nan` with `-1`

In [10]:
for c in ['MrPropreTrier', 'AntikalTrier', 'ArielTrier', 'DashTrier']:
    df[c[:-5]] = df.apply(lambda row: 1 if row[c] == 'Known Trier' else -1, axis=1)
    df = df.drop(columns=[c])

We will replace `detergentType` by `liquid`, `pods` and `powder` 

In [11]:
df['liquidDetergent'] = df.apply(lambda row: 1 if row['detergentType'] == 'Liquid' else -1, axis=1)
df = df.drop(columns=['detergentType'])

Replacing `toothBrushType` with `electricToothbrush`

In [12]:
df['electricToothbrush'] = df.apply(lambda row: 1 if row['toothBrushType'] == 'Electric' else -1, axis=1)
df = df.drop(columns=['toothBrushType'])

Replacing `petOwner` with `hasPet`, if the first 3 characters are 'Yes' or 'Oui' the value is `1`, else it is `-1`

In [13]:
df['hasPet'] = df.apply(lambda row: 1 if row['petOwner'][:3] == 'Yes' or row['petOwner'][:3] == 'Oui' else -1, axis=1)
df = df.drop(columns=['petOwner'])

Transforms a `ISO 8601` time string to the amount of days since the time string

In [14]:
def timeStringToDelta(timeString: str) -> int:
    return int((datetime.utcnow().timestamp() - datetime.strptime(timeString[:10], "%Y-%m-%d").timestamp()) / 3600 / 24)

Replaces `lastActivityDate` with `daysSinceActivity`: an `integer` representing the amount of ellapsed days since last activity

In [15]:
df['daysSinceActivity'] = df.apply(lambda row: timeStringToDelta(row['lastActivityDate']), axis=1)
df = df.drop(columns=['lastActivityDate'])

Replaces `age` with a normalized column

In [16]:
df

0        0.077922
1        0.636364
2        0.350649
3        0.220779
4        0.259740
           ...   
11995    0.454545
11996    0.324675
11997    0.311688
11998    0.272727
11999    0.181818
Name: age, Length: 12000, dtype: float64

The possible values for `numberChildren` are `'1'`, `'2'`, `'3'`, `'4'`, `'4+'` and `'NaN'`. We will be creating a column `nbChildren` of type `int` where `'NaN'` will be mapped to `None`.

In [17]:
def childrenMagik(children: str):
    if str(children).lower() in 'nan': return None
    if children == '4+': return 5
    return int(children)

df['nbChildren'] = df.apply(lambda row: childrenMagik(row['numberChildren']), axis=1)
df = df.drop(columns=['numberChildren'])

Replaces `F` with `1` and `M` with `-1` in the `gender` column

In [18]:
df

Unnamed: 0,userId,age,gender,postalCode,scentLover,ecoInterest,closestShop,ownsDishwasher,MrPropre,Antikal,Ariel,Dash,liquidDetergent,electricToothbrush,hasPet,daysSinceActivity,nbChildren
0,use_XbA1FTDcCrTMNTHK1851TzjyPMP,24,F,68540,NonUser,,,1,-1,-1,-1,-1,1,-1,1,308,1.0
1,use_3WHgsMVGSg5MHG2zja91TzdfmY2,67,F,17390,,High,,1,1,1,1,-1,-1,-1,-1,28,3.0
2,use_2SisOiR8QwDaHI4svm11TzcvK7V,45,F,30200,NonUser,High,,1,-1,-1,1,1,1,-1,1,130,1.0
3,use_W0T1LmfKaQPJYD1RTWh1Tzdc69H,35,F,61400,Low,Medium,,-1,-1,-1,1,1,-1,-1,1,340,2.0
4,use_KwYWZ7UMl4veveOaPGK1TzddtmI,38,F,30620,,Low,,-1,1,1,1,1,1,1,1,158,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11995,use_ICSyEvCcRNPpQeBdWcM1TzdczV8,53,F,77240,,,,1,-1,-1,-1,-1,-1,-1,1,10,5.0
11996,use_Q5DNYeSmG7g6VeMV4n71TzeTRRz,43,F,45120,Low,Medium,,-1,-1,-1,-1,-1,1,1,1,167,0.0
11997,use_rEoce5pc0MlKpndfzNj1Tze3J1L,42,F,35170,,High,,-1,-1,-1,-1,-1,1,-1,1,33,3.0
11998,use_0AuDL2wOJHy9I16zSC21TzeHRMo,39,M,45160,,,,1,-1,-1,-1,-1,1,1,1,260,1.0


Changes `ecoInterest` replacing `High`, `Medium`, `Low`, `Very high` with a scale going from `0` to `1`. We the replace `nan` with the avg.

In [19]:
df['nbChildren'].mean()

1.3804149665769598

Changes `scentLover` replacing `NonUser`, `Low`, `Medium`, `High` with a scale going from 0 to 1. We the replace `nan` with the avg.

In [20]:
def textScaleToFloat(s:str) -> float:
    if s == 'High': return 1
    if s == 'Medium': return 2 / 3
    if s == 'Low': return 1 / 3
    if s == 'NonUser': return 0
    return None

# replaces the 'normal' values with floats
df['scentLover'] = df['scentLover'].apply(textScaleToFloat)

# replaces the nan with the avg
df['scentLover'] = df['scentLover'].fillna(df['scentLover'].mean(skipna=True))

Handles the closest shop

In [27]:
df['magasin']           = df['closestShop'].apply(lambda e: 1 if e =='1. Magasin de Proximit�' else -1)
df['moyenneSurface']    = df['closestShop'].apply(lambda e: 1 if e =='2. Moyenne Surface' else -1)
df['superMarket']       = df['closestShop'].apply(lambda e: 1 if e =='3. SuperMarket' else -1)
df['hyperMarket']       = df['closestShop'].apply(lambda e: 1 if e =='4. HyperMarket' else -1)
df['drive']             = df['closestShop'].apply(lambda e: 1 if e =='5. Drive' else -1)
df['hardDiscount']      = df['closestShop'].apply(lambda e: 1 if e =='6. Hard Discounter' else -1)

# removes extra column
df=df.drop(columns=['closestShop'])

Unnamed: 0,userId,age,gender,postalCode,scentLover,ecoInterest,ownsDishwasher,MrPropre,Antikal,Ariel,...,electricToothbrush,hasPet,daysSinceActivity,nbChildren,magasin,moyenneSurgace,superMarket,hyperMarket,drive,hardDiscount
0,use_XbA1FTDcCrTMNTHK1851TzjyPMP,24,F,68540,0.000000,,1,-1,-1,-1,...,-1,1,308,1.0,-1,-1,-1,-1,-1,-1
1,use_3WHgsMVGSg5MHG2zja91TzdfmY2,67,F,17390,0.378981,High,1,1,1,1,...,-1,-1,28,3.0,-1,-1,-1,-1,-1,-1
2,use_2SisOiR8QwDaHI4svm11TzcvK7V,45,F,30200,0.000000,High,1,-1,-1,1,...,-1,1,130,1.0,-1,-1,-1,-1,-1,-1
3,use_W0T1LmfKaQPJYD1RTWh1Tzdc69H,35,F,61400,0.333333,Medium,-1,-1,-1,1,...,-1,1,340,2.0,-1,-1,-1,-1,-1,-1
4,use_KwYWZ7UMl4veveOaPGK1TzddtmI,38,F,30620,0.378981,Low,-1,1,1,1,...,1,1,158,0.0,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11995,use_ICSyEvCcRNPpQeBdWcM1TzdczV8,53,F,77240,0.378981,,1,-1,-1,-1,...,-1,1,10,5.0,-1,-1,-1,-1,-1,-1
11996,use_Q5DNYeSmG7g6VeMV4n71TzeTRRz,43,F,45120,0.333333,Medium,-1,-1,-1,-1,...,1,1,167,0.0,-1,-1,-1,-1,-1,-1
11997,use_rEoce5pc0MlKpndfzNj1Tze3J1L,42,F,35170,0.378981,High,-1,-1,-1,-1,...,-1,1,33,3.0,-1,-1,-1,-1,-1,-1
11998,use_0AuDL2wOJHy9I16zSC21TzeHRMo,39,M,45160,0.378981,,1,-1,-1,-1,...,1,1,260,1.0,-1,-1,-1,-1,-1,-1


In [28]:
df

Unnamed: 0,age,gender,scentLover,ecoInterest,washDishes,MrPropre,Antikal,Ariel,Dash,pods,...,hasPet,daysSinceActivity,nbChildren,magasin,moyenneSurgace,superMarket,hyperMarket,drive,hardDiscount,moyenneSurface
0,0.077922,1,0.000000,0.387479,1,-1,-1,-1,-1,-1,...,1,0.234694,0.2,-1,-1,-1,-1,-1,-1,-1
1,0.636364,1,0.378258,0.666667,1,1,1,1,-1,1,...,-1,0.014914,0.6,-1,-1,-1,-1,-1,-1,-1
2,0.350649,1,0.000000,0.666667,1,-1,-1,1,1,-1,...,1,0.094976,0.2,-1,-1,-1,-1,-1,-1,-1
3,0.220779,1,0.333333,0.333333,-1,-1,-1,1,1,1,...,1,0.259812,0.4,-1,-1,-1,-1,-1,-1,-1
4,0.259740,1,0.378258,0.000000,-1,1,1,1,1,-1,...,1,0.116954,0.0,-1,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11995,0.454545,1,0.378258,0.387479,1,-1,-1,-1,-1,1,...,1,0.000785,1.0,-1,-1,-1,-1,-1,-1,-1
11996,0.324675,1,0.333333,0.333333,-1,-1,-1,-1,-1,-1,...,1,0.124019,0.0,-1,-1,-1,-1,-1,-1,-1
11997,0.311688,1,0.378258,0.666667,-1,-1,-1,-1,-1,-1,...,1,0.018838,0.6,-1,-1,-1,-1,-1,-1,-1
11998,0.272727,-1,0.378258,0.387479,1,-1,-1,-1,-1,-1,...,1,0.197017,0.2,-1,-1,-1,-1,-1,-1,-1


In order to have more information about customers, we are going to add features, using the data set "consumer actions"


In [23]:
#df_consumer_actions = pd.read_csv(r"DS_CentraleSupelec_ST42021/DS_CentraleSupelec_consumeractions.csv")

In [24]:
#df_consumer_actions

Unnamed: 0,userId,event,originalTimestamp,brandName,brandName2,subjectLine,url,keyword,couponRedemptionDate
0,use_bw7Dd2MM9RLPaulrFsQ1TzdmFsi,Product Reviewed,2021-05-13 22:57:44.182000+00:00,Fairy PEPS Platinum+ Tout-en-1,,Fairy PEPS Platinum+ Tout-en-1 | Envie de Plus,https://www.enviedeplus.com/marques/fairy/fair...,,
1,use_bw7Dd2MM9RLPaulrFsQ1TzdmFsi,Email Clicked,2021-05-06 20:36:14+00:00,,,Lenor vous fait une fleur <%${user['FirstName'...,https://www.enviedeplus.com/50-euros-chez-truf...,,
2,use_bw7Dd2MM9RLPaulrFsQ1TzdmFsi,Email Opened,2020-09-17 19:35:13+00:00,,,Ariel Pods x Whirlpool = 💝,,,
3,use_bw7Dd2MM9RLPaulrFsQ1TzdmFsi,Email Opened,2020-10-23 06:18:35+00:00,,,"Bye-bye la grisaille, bonjour la couleur 🍂",,,
4,use_bw7Dd2MM9RLPaulrFsQ1TzdmFsi,Email Opened,2021-05-10 19:50:41+00:00,,,Il faut qu’on parle <%${user['FirstName']}%>,,,
...,...,...,...,...,...,...,...,...,...
1172500,use_sLe3AqB7h7jlypX6JmC1TzdphFJ,Email Opened,2020-12-01 12:19:38+00:00,,,Jusqu'à 85 € de réductions* avec les Jours en ...,,,
1172501,use_fLS7y8W7sAXSCohYlWi1TzdXPjQ,Email Opened,2020-08-04 07:46:36+00:00,,,Envie de... faire une bonne action ? 🤲,,,
1172502,use_YyWKhnJXM3bUmsc66TA1TzdcFXm,Email Opened,2020-11-05 15:49:48+00:00,,,Votre surprise pour une peau nette 🎁,,,
1172503,use_8th6Vk4n1QZDcXwBfU41TzdfDDF,Email Opened,2020-12-04 10:02:03+00:00,,,🖤 Profitez de vos marques préférées avec le Bl...,,,


In [25]:

text = df_consumer_actions['brandName'][0]
df_consumer_actions['brandName'] = df_consumer_actions['brandName']== text
df_consumer_actions['brandName2'] = df_consumer_actions['brandName2']== text
df_consumer_actions['brand'] = df_consumer_actions['brandName'] | df_consumer_actions['brandName2']
df_consumer_actions =  df_consumer_actions.drop(columns=['event', 'originalTimestamp', 'brandName', 'brandName2', 'subjectLine', 'url', 'keyword','couponRedemptionDate'])
df_consumer_actions['brand'] = df_consumer_actions.apply(lambda row: 1 if row['brand'] else -1, axis=1)
df_consumer_actions

Unnamed: 0,userId,brand
0,use_bw7Dd2MM9RLPaulrFsQ1TzdmFsi,1
1,use_bw7Dd2MM9RLPaulrFsQ1TzdmFsi,-1
2,use_bw7Dd2MM9RLPaulrFsQ1TzdmFsi,-1
3,use_bw7Dd2MM9RLPaulrFsQ1TzdmFsi,-1
4,use_bw7Dd2MM9RLPaulrFsQ1TzdmFsi,-1
...,...,...
1172500,use_sLe3AqB7h7jlypX6JmC1TzdphFJ,-1
1172501,use_fLS7y8W7sAXSCohYlWi1TzdXPjQ,-1
1172502,use_YyWKhnJXM3bUmsc66TA1TzdcFXm,-1
1172503,use_8th6Vk4n1QZDcXwBfU41TzdfDDF,-1


In [26]:
df_merged = pd.merge(df,df_consumer_actions, how='left')


In [27]:
df_merged = df_merged.rename(columns={"brand": "Interested_email"})
df = df_merged

In [28]:
df

Unnamed: 0,userId,age,gender,postalCode,scentLover,ecoInterest,closestShop,ownsDishwasher,MrPropre,Antikal,...,hasPet,daysSinceActivity,nbChildren,magasin,moyenneSurgace,superMarket,hyperMarket,drive,hardDiscount,Interested_email
0,use_XbA1FTDcCrTMNTHK1851TzjyPMP,24,F,68540,0.0,,,1,-1,-1,...,1,308,1.0,-1,-1,-1,-1,-1,-1,-1.0
1,use_XbA1FTDcCrTMNTHK1851TzjyPMP,24,F,68540,0.0,,,1,-1,-1,...,1,308,1.0,-1,-1,-1,-1,-1,-1,-1.0
2,use_XbA1FTDcCrTMNTHK1851TzjyPMP,24,F,68540,0.0,,,1,-1,-1,...,1,308,1.0,-1,-1,-1,-1,-1,-1,-1.0
3,use_XbA1FTDcCrTMNTHK1851TzjyPMP,24,F,68540,0.0,,,1,-1,-1,...,1,308,1.0,-1,-1,-1,-1,-1,-1,-1.0
4,use_XbA1FTDcCrTMNTHK1851TzjyPMP,24,F,68540,0.0,,,1,-1,-1,...,1,308,1.0,-1,-1,-1,-1,-1,-1,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
889075,use_B7F2PfU4eOKBmVc93AU1TzdqFY9,32,F,37600,0.0,High,3. SuperMarket,-1,-1,-1,...,1,290,3.0,-1,-1,1,-1,-1,-1,-1.0
889076,use_B7F2PfU4eOKBmVc93AU1TzdqFY9,32,F,37600,0.0,High,3. SuperMarket,-1,-1,-1,...,1,290,3.0,-1,-1,1,-1,-1,-1,-1.0
889077,use_B7F2PfU4eOKBmVc93AU1TzdqFY9,32,F,37600,0.0,High,3. SuperMarket,-1,-1,-1,...,1,290,3.0,-1,-1,1,-1,-1,-1,-1.0
889078,use_B7F2PfU4eOKBmVc93AU1TzdqFY9,32,F,37600,0.0,High,3. SuperMarket,-1,-1,-1,...,1,290,3.0,-1,-1,1,-1,-1,-1,-1.0
