# EI ST4

## Imports

In [37]:
import pandas as pd
from datetime import datetime
import math

In [38]:
df = pd.read_csv(r"DS_CentraleSupelec_ST42021/DS_CentraleSupelec_train.csv")

## Cleaning up the dataframe

In [39]:
df

Unnamed: 0,userId,age,gender,registrationDate,languageCode,countryCode,emailContactable,postalContactable,postalCode,numberChildren,...,detergentType,MrPropreTrier,AntikalTrier,ArielTrier,DashTrier,scentLover,petOwner,ecoInterest,closestShop,washDishes
0,use_XbA1FTDcCrTMNTHK1851TzjyPMP,24,F,2019-10-14T17:33:37Z,fr,FRA,True,True,68540,1,...,Liquid,,,,,NonUser,Yes,,,Auto
1,use_3WHgsMVGSg5MHG2zja91TzdfmY2,67,F,2017-07-25T07:00:31Z,fr,FRA,True,True,17390,3,...,Pods,Known Trier,Known Trier,Known Trier,,,No,High,,Auto
2,use_2SisOiR8QwDaHI4svm11TzcvK7V,45,F,2015-06-25T00:00:00Z,fr,FRA,True,True,30200,1,...,Liquid,,,Known Trier,Known Trier,NonUser,Yes,High,,Auto
3,use_W0T1LmfKaQPJYD1RTWh1Tzdc69H,35,F,2018-01-31T07:24:39Z,fr,FRA,True,True,61400,2,...,Pods,,,Known Trier,Known Trier,Low,Yes,Medium,,Hand
4,use_KwYWZ7UMl4veveOaPGK1TzddtmI,38,F,2016-01-26T11:42:17Z,fr,FRA,True,True,30620,0,...,Liquid,Known Trier,Known Trier,Known Trier,Known Trier,,Yes,Low,,Hand
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11995,use_ICSyEvCcRNPpQeBdWcM1TzdczV8,53,F,2015-03-11T20:03:16Z,fr,FRA,True,True,77240,4+,...,Pods,,,,,,Yes,,,Auto
11996,use_Q5DNYeSmG7g6VeMV4n71TzeTRRz,43,F,2015-10-10T19:16:19Z,fr,FRA,True,True,45120,0,...,Liquid,,,,,Low,Yes,Medium,,Hand
11997,use_rEoce5pc0MlKpndfzNj1Tze3J1L,42,F,2015-05-08T02:59:34Z,fr,FRA,True,True,35170,3,...,Liquid,,,,,,Yes,High,,Hand
11998,use_0AuDL2wOJHy9I16zSC21TzeHRMo,39,M,2019-05-20T11:59:02Z,fr,FRA,True,True,45160,1,...,Liquid,,,,,,Yes,,,Auto


Get unique count for each variable

In [40]:
df.nunique()

userId               12000
age                     74
gender                   2
registrationDate      9875
languageCode             2
countryCode              1
emailContactable         2
postalContactable        2
postalCode            3794
numberChildren          10
lastActivityDate     12000
reactivationValue        5
toothBrushType           2
detergentType           10
MrPropreTrier            1
AntikalTrier             1
ArielTrier               1
DashTrier                1
scentLover               4
petOwner                 9
ecoInterest              4
closestShop              6
washDishes               2
dtype: int64

Check variable data type

In [41]:
df.dtypes

userId               object
age                   int64
gender               object
registrationDate     object
languageCode         object
countryCode          object
emailContactable       bool
postalContactable      bool
postalCode           object
numberChildren       object
lastActivityDate     object
reactivationValue    object
toothBrushType       object
detergentType        object
MrPropreTrier        object
AntikalTrier         object
ArielTrier           object
DashTrier            object
scentLover           object
petOwner             object
ecoInterest          object
closestShop          object
washDishes           object
dtype: object

We can remove the `languageCode` and `countryCode` column as they are constant

In [42]:
df[~df.languageCode.isin(['fr', 'FR'])].empty and df[~df.countryCode.eq('FRA')].empty

True

In [43]:
df = df.drop(columns=['languageCode', 'countryCode'])

We will also remove the `registrationDate`, `reactivationValue`, `emailContactable` and `postalContactable` as they are irrelevant. `postalCode` as it will be to hard to analyse.

In [44]:
df = df.drop(columns=['registrationDate', 'reactivationValue', 'emailContactable', 'postalContactable', 'postalCode'])

We are going to replace the `washDishes` `STRING` column with an `INT` column

In [45]:
df["washDishes"] = df["washDishes"].apply(lambda e: 1 if e == 'Auto' else -1)

In the `MrPropreTrier`, `AntikalTrier`, `ArielTrier`, `DashTrier` we will replace `Known Trier` with `1` and `Nan` with `-1`

In [46]:
for c in ['MrPropreTrier', 'AntikalTrier', 'ArielTrier', 'DashTrier']:
    df[c[:-5]] = df.apply(lambda row: 1 if row[c] == 'Known Trier' else -1, axis=1)
    df = df.drop(columns=[c])

We will replace `detergentType` by `liquid`, `pods` and `powder` 

In [47]:
# pods
df['pods'] = df['detergentType'].apply(lambda e: 1 if pd.notna(e) and 'Pods' in e else -1)

# powder
df['powder'] = df['detergentType'].apply(lambda e: 1 if pd.notna(e) and 'Powder' in e else -1)

# liquid
df['liquid'] = df['detergentType'].apply(lambda e: 1 if pd.notna(e) and 'Liquid' in e else -1)

# removes extra column
df = df.drop(columns=['detergentType']) 

Replacing `toothBrushType` with `electricToothbrush`

In [48]:
df['electricToothbrush'] = df.apply(lambda row: 1 if row['toothBrushType'] == 'Electric' else -1, axis=1)
df = df.drop(columns=['toothBrushType'])

Replacing `petOwner` with `hasPet`, if the first 3 characters are 'Yes' or 'Oui' the value is `1`, else it is `-1`

In [49]:
def likesPets(s:str) -> float:
    if s in ['Yes, we love our furry friends', "Oui, j'adore nos petites boules de poils !"]: 
        return 1

    if s in ['Yes']: 
        return 0.9

    if s in ["Non, j'aime les animaux, mais je n'en ai pas pour le moment.", 'No, we love animals but no pets here for now']:   
        return 0.8
    
    if s in ['No - maybe future']:
        return 0.5

    if s in ['No']:
        return 0.3

    if s in ['Des animaux dans la maison ? Non merci !', "No, we'd never have pets in the house"]:
        return 0

# creating a new column for animal lovers
df['likesPets'] = df['petOwner'].apply(likesPets)

# creating a new column for pet owners
df['hasPet'] = df['petOwner'].apply(lambda e: 1 if str(e)[:3] == 'Yes' or str(e)[:3] == 'Oui' else -1)

df = df.drop(columns=['petOwner'])

Transforms a `ISO 8601` time string to the amount of days since the time string

In [50]:
def timeStringToDelta(timeString: str) -> int:
    if pd.isna(timeString): return None
    return int((datetime.utcnow().timestamp() - datetime.strptime(timeString[:10], "%Y-%m-%d").timestamp()) / 3600 / 24)

Replaces `lastActivityDate` with `daysSinceActivity`: an `integer` representing the amount of ellapsed days since last activity

In [51]:
df['daysSinceActivity'] = df['lastActivityDate'].apply(timeStringToDelta)

# normalizess the new column
minV = df['daysSinceActivity'].min()
maxV = df['daysSinceActivity'].max()
df['daysSinceActivity'] = df['daysSinceActivity'].apply(lambda e: (e - minV) / (maxV - minV))

# replaces Nan by the avg
df['daysSinceActivity'].fillna(df['daysSinceActivity'].mean(skipna=True))

# removes the extra column
df = df.drop(columns=['lastActivityDate'])

Replaces `age` with a normalized column

In [52]:
# normalizess the new column
minV = df['age'].min()
maxV = df['age'].max()
df['age'] = df['age'].apply(lambda e: (e - minV) / (maxV - minV))

# replaces Nan by the avg
df['age'].fillna(df['age'].mean(skipna=True))

0        0.077922
1        0.636364
2        0.350649
3        0.220779
4        0.259740
           ...   
11995    0.454545
11996    0.324675
11997    0.311688
11998    0.272727
11999    0.181818
Name: age, Length: 12000, dtype: float64

The possible values for `numberChildren` are `'1'`, `'2'`, `'3'`, `'4'`, `'4+'` and `'NaN'`. We will be creating a column `nbChildren` of type `int` where `'NaN'` will be mapped to `None`.

In [53]:
def childrenMagik(children: str) -> float:
    if children == '0': return 0
    if children == '1': return 1 / 5
    if children == '2': return 2 / 5
    if children == '3': return 3 / 5
    if children == '4': return 4 / 5
    if children == '4+': return 1
    return None

df['nbChildren'] = df['numberChildren'].apply(childrenMagik)

# Replacing nan with the avg
df['nbChildren'] = df['nbChildren'].fillna(df['nbChildren'].mean(skipna=True))

df = df.drop(columns=['numberChildren'])

Replaces `F` with `1` and `M` with `-1` in the `gender` column

In [54]:
df['gender'] = df['gender'].apply(lambda e: 1 if e == "F" else -1)

Changes `ecoInterest` replacing `High`, `Medium`, `Low`, `Very high` with a scale going from `0` to `1`. We the replace `nan` with the avg.

In [55]:
def textScaleToFloat(s:str) -> float:
    if s == 'Very high': return 1
    if s == 'High': return 2 / 3
    if s == 'Medium': return 1 / 3
    if s == 'Low': return 0
    return None

# replaces the 'normal' values with floats
df['ecoInterest'] = df['ecoInterest'].apply(textScaleToFloat)

# replaces the nan with the avg
df['ecoInterest'] = df['ecoInterest'].fillna(df['ecoInterest'].mean(skipna=True))

Changes `scentLover` replacing `NonUser`, `Low`, `Medium`, `High` with a scale going from 0 to 1. We the replace `nan` with the avg.

In [56]:
def textScaleToFloat(s:str) -> float:
    if s == 'High': return 1
    if s == 'Medium': return 2 / 3
    if s == 'Low': return 1 / 3
    if s == 'NonUser': return 0
    return None

# replaces the 'normal' values with floats
df['scentLover'] = df['scentLover'].apply(textScaleToFloat)

# replaces the nan with the avg
df['scentLover'] = df['scentLover'].fillna(df['scentLover'].mean(skipna=True))

Handles the closest shop

In [57]:
df['magasin']           = df['closestShop'].apply(lambda e: 1 if e =='1. Magasin de Proximit�' else -1)
df['moyenneSurface']    = df['closestShop'].apply(lambda e: 1 if e =='2. Moyenne Surface' else -1)
df['superMarket']       = df['closestShop'].apply(lambda e: 1 if e =='3. SuperMarket' else -1)
df['hyperMarket']       = df['closestShop'].apply(lambda e: 1 if e =='4. HyperMarket' else -1)
df['drive']             = df['closestShop'].apply(lambda e: 1 if e =='5. Drive' else -1)
df['hardDiscount']      = df['closestShop'].apply(lambda e: 1 if e =='6. Hard Discounter' else -1)

# removes extra column
df = df.drop(columns=['closestShop'])

In [58]:
df.to_csv('./DS_CentraleSupelec_ST42021/clean.csv')  

# Using consumeractions data set

In order to have more information about customers, we are going to add features, using the data set "consumer actions"


In [59]:
#Let's get the csv consumer_actions
df_consumer_actions = pd.read_csv(r"DS_CentraleSupelec_ST42021/DS_CentraleSupelec_consumeractions.csv")

#Let's check the possible values for 'event'
df_consumer_actions.event.unique()



array(['Product Reviewed', 'Email Clicked', 'Email Opened',
       'Request Coupon Add To Basket', 'Coupon Redemption',
       'Request Coupon Print', 'Search Site'], dtype=object)

In [60]:
df_consumer_actions.brandName.unique()

array(['Fairy PEPS Platinum+ Tout-en-1', nan, 'Ariel', 'Lenor', 'Febreze',
       'Dash 2 in 1', 'Head & Shoulders', 'Dash', 'Minor Brands',
       'Mr. Propre Gomme Magique',
       'Balai Swiffer avec lingettes sèches et humides',
       'Head&Shoulders Anti-Démangeaisons',
       'Head & Shoulders Citrus Fresh',
       'Lingettes humides Swiffer pour le sol',
       "Febreze Plaisir d'Air Fleur de Vanille",
       'Ariel Simply Fraîcheur Marseille et Aloe Vera',
       'Dash Allin1 Pods Caresse Aérienne', 'Satin Care', 'Swiffer',
       'Couches Pampers® Baby-Dry™', 'Mr Proper', 'Oral-B Power',
       'Antikal Classic Spray', 'Jolly', 'Oral-B Manual', 'Fixodent',
       'Always', 'Aussie', 'Tampax', 'Antikal', 'Herbal Essences',
       'ZzzQuil™ SOMMEIL',
       'Mr. Propre Ultra Power Liquide Multi-Surfaces',
       'Lenor 0% Colorant Aloe Vera', 'Fairy PEPS Tout-en-1 Plus',
       "Dash 2en1 Liquide La Collection Envolée d'Air",
       'Ariel Allin1 Pods Alpine', 'Febreze Textile 

In [61]:
### First step : we will determine if a user has had an action related to the product 'Fairy PEPS'. We have seen in the list of brands that Fairy PEPS can be written 'Fairy PEPS Platinum+ Tout-en-1' or 'Fairy PEPS Tout-en-1 Plus'

In [62]:
#Let's create target text
target_brand1 = 'Fairy PEPS Platinum+ Tout-en-1'
target_brand2 = 'Fairy PEPS Tout-en-1 Plus'

In [63]:
#We want to test if the brandName related to each action is Fairy PEPS
bool1 = df_consumer_actions['brandName']== target_brand1
bool2 = df_consumer_actions['brandName']== target_brand2
bool3 = df_consumer_actions['brandName2']== target_brand1
bool4 = df_consumer_actions['brandName2']== target_brand2

df_consumer_actions['brandName_bool'] = bool1 | bool2
df_consumer_actions['brandName2_bool'] = bool3 | bool4 

#Let's create a new column for each row telling us if the brandname 1 or 2 of the email was FairyPEPS
df_consumer_actions['brand'] = df_consumer_actions['brandName_bool'] | df_consumer_actions['brandName2_bool']
#We want it in the 1 or -1 format
df_consumer_actions['brand'] = df_consumer_actions.apply(lambda row: 1 if row['brand'] else -1, axis=1)




In [64]:
#We want to evaluate to level of interest of each user about the emails, coupons or products of P&G depending on their action

def levelOfInterestAboutMarketing(action: str) -> float:
    if action == 'Email Opened': return 1/5
    if action == 'Email Clicked': return 2/5
    if action == 'Search Site': return 3/5
    if action == 'Product Reviewed': return 4/5
    if action == 'Request Coupon Print': return 1
    if action == 'Request Coupon Add To Basket': return 1
    if action == 'Coupon Redemption': return 1
    
    
    
    return None

df_consumer_actions['level_of_interest_about_marketing'] = df_consumer_actions['event'].apply(levelOfInterestAboutMarketing)
df_consumer_actions

Unnamed: 0,userId,event,originalTimestamp,brandName,brandName2,subjectLine,url,keyword,couponRedemptionDate,brandName_bool,brandName2_bool,brand,level_of_interest_about_marketing
0,use_bw7Dd2MM9RLPaulrFsQ1TzdmFsi,Product Reviewed,2021-05-13 22:57:44.182000+00:00,Fairy PEPS Platinum+ Tout-en-1,,Fairy PEPS Platinum+ Tout-en-1 | Envie de Plus,https://www.enviedeplus.com/marques/fairy/fair...,,,True,False,1,0.8
1,use_bw7Dd2MM9RLPaulrFsQ1TzdmFsi,Email Clicked,2021-05-06 20:36:14+00:00,,,Lenor vous fait une fleur <%${user['FirstName'...,https://www.enviedeplus.com/50-euros-chez-truf...,,,False,False,-1,0.4
2,use_bw7Dd2MM9RLPaulrFsQ1TzdmFsi,Email Opened,2020-09-17 19:35:13+00:00,,,Ariel Pods x Whirlpool = 💝,,,,False,False,-1,0.2
3,use_bw7Dd2MM9RLPaulrFsQ1TzdmFsi,Email Opened,2020-10-23 06:18:35+00:00,,,"Bye-bye la grisaille, bonjour la couleur 🍂",,,,False,False,-1,0.2
4,use_bw7Dd2MM9RLPaulrFsQ1TzdmFsi,Email Opened,2021-05-10 19:50:41+00:00,,,Il faut qu’on parle <%${user['FirstName']}%>,,,,False,False,-1,0.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1172500,use_sLe3AqB7h7jlypX6JmC1TzdphFJ,Email Opened,2020-12-01 12:19:38+00:00,,,Jusqu'à 85 € de réductions* avec les Jours en ...,,,,False,False,-1,0.2
1172501,use_fLS7y8W7sAXSCohYlWi1TzdXPjQ,Email Opened,2020-08-04 07:46:36+00:00,,,Envie de... faire une bonne action ? 🤲,,,,False,False,-1,0.2
1172502,use_YyWKhnJXM3bUmsc66TA1TzdcFXm,Email Opened,2020-11-05 15:49:48+00:00,,,Votre surprise pour une peau nette 🎁,,,,False,False,-1,0.2
1172503,use_8th6Vk4n1QZDcXwBfU41TzdfDDF,Email Opened,2020-12-04 10:02:03+00:00,,,🖤 Profitez de vos marques préférées avec le Bl...,,,,False,False,-1,0.2


In [65]:
#As each user appears several time, we have to group by userId the rows and apply aggregate functions. It allows us to get a new information : the number of actions done by an user. It requires two groupby and a merge of the two results.
df_consumer_actions_cleaned_one = df_consumer_actions.groupby(['userId'], sort=False, as_index=False)["brand",'level_of_interest_about_marketing'].max()
df_consumer_actions_cleaned_two = df_consumer_actions.groupby(['userId'], sort=False, as_index=False)['event'].count()


  df_consumer_actions_cleaned_one = df_consumer_actions.groupby(['userId'], sort=False, as_index=False)["brand",'level_of_interest_about_marketing'].max()


In [66]:

df_consumer_actions_cleaned = df_consumer_actions_cleaned_one.merge(df_consumer_actions_cleaned_two, left_on ='userId', right_on='userId',  how='left')

In [67]:
#The number of action is an integer : we have to scale it from 0 to 1
minV = df_consumer_actions_cleaned['event'].min()
maxV = df_consumer_actions_cleaned['event'].max()
df_consumer_actions_cleaned['event'] = df_consumer_actions_cleaned['event'].apply(lambda e: (e - minV) / (maxV - minV))

In [68]:
#Let's check the results
df_consumer_actions_cleaned

Unnamed: 0,userId,brand,level_of_interest_about_marketing,event
0,use_bw7Dd2MM9RLPaulrFsQ1TzdmFsi,1,0.8,0.003619
1,use_zfO6MmwMKMABup7Fs9E1TzdcfQy,1,1.0,0.036711
2,use_ejDjloCwQD84hJpLAAs1TzdfSXa,1,1.0,0.019131
3,use_CAJDPBtqY7NfMHkKbYQ1TzdcUuk,-1,1.0,0.057394
4,use_azRTZAf4XAfhkCMfvQz1TzdppT4,1,0.8,0.150982
...,...,...,...,...
14741,use_sLe3AqB7h7jlypX6JmC1TzdphFJ,-1,0.2,0.000000
14742,use_fLS7y8W7sAXSCohYlWi1TzdXPjQ,-1,0.2,0.000000
14743,use_YyWKhnJXM3bUmsc66TA1TzdcFXm,-1,0.2,0.000000
14744,use_8th6Vk4n1QZDcXwBfU41TzdfDDF,-1,0.2,0.000000


In [69]:
#the column "brand" should be renamed to be more relevant
df_consumer_actions_cleaned = df_consumer_actions_cleaned.rename(columns = {"brand" : "interested_by_fairypeps_email", "event": "number_of_actions"})


In [70]:
df_consumer_actions_cleaned

Unnamed: 0,userId,interested_by_fairypeps_email,level_of_interest_about_marketing,number_of_actions
0,use_bw7Dd2MM9RLPaulrFsQ1TzdmFsi,1,0.8,0.003619
1,use_zfO6MmwMKMABup7Fs9E1TzdcfQy,1,1.0,0.036711
2,use_ejDjloCwQD84hJpLAAs1TzdfSXa,1,1.0,0.019131
3,use_CAJDPBtqY7NfMHkKbYQ1TzdcUuk,-1,1.0,0.057394
4,use_azRTZAf4XAfhkCMfvQz1TzdppT4,1,0.8,0.150982
...,...,...,...,...
14741,use_sLe3AqB7h7jlypX6JmC1TzdphFJ,-1,0.2,0.000000
14742,use_fLS7y8W7sAXSCohYlWi1TzdXPjQ,-1,0.2,0.000000
14743,use_YyWKhnJXM3bUmsc66TA1TzdcFXm,-1,0.2,0.000000
14744,use_8th6Vk4n1QZDcXwBfU41TzdfDDF,-1,0.2,0.000000


In [71]:
#Let's add the new columns, gotten in consumeractions, to the initial DataFrame 
df_merged = df.merge(df_consumer_actions_cleaned, left_on ='userId', right_on='userId',  how='left')
print(df.shape)
print(df_consumer_actions_cleaned.shape)
print(df_merged.shape)

(12000, 24)
(14746, 4)
(12000, 27)


In [72]:
#Let's check the result
df_merged

Unnamed: 0,userId,age,gender,scentLover,ecoInterest,washDishes,MrPropre,Antikal,Ariel,Dash,...,nbChildren,magasin,moyenneSurface,superMarket,hyperMarket,drive,hardDiscount,interested_by_fairypeps_email,level_of_interest_about_marketing,number_of_actions
0,use_XbA1FTDcCrTMNTHK1851TzjyPMP,0.077922,1,0.000000,0.388626,1,-1,-1,-1,-1,...,0.2,-1,-1,-1,-1,-1,-1,-1.0,1.0,0.002068
1,use_3WHgsMVGSg5MHG2zja91TzdfmY2,0.636364,1,0.378981,0.666667,1,1,1,1,-1,...,0.6,-1,-1,-1,-1,-1,-1,-1.0,1.0,0.022234
2,use_2SisOiR8QwDaHI4svm11TzcvK7V,0.350649,1,0.000000,0.666667,1,-1,-1,1,1,...,0.2,-1,-1,-1,-1,-1,-1,-1.0,1.0,0.009824
3,use_W0T1LmfKaQPJYD1RTWh1Tzdc69H,0.220779,1,0.333333,0.333333,-1,-1,-1,1,1,...,0.4,-1,-1,-1,-1,-1,-1,-1.0,0.4,0.002327
4,use_KwYWZ7UMl4veveOaPGK1TzddtmI,0.259740,1,0.378981,0.000000,-1,1,1,1,1,...,0.0,-1,-1,-1,-1,-1,-1,-1.0,1.0,0.936143
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11995,use_ICSyEvCcRNPpQeBdWcM1TzdczV8,0.454545,1,0.378981,0.388626,1,-1,-1,-1,-1,...,1.0,-1,-1,-1,-1,-1,-1,1.0,1.0,0.043175
11996,use_Q5DNYeSmG7g6VeMV4n71TzeTRRz,0.324675,1,0.333333,0.333333,-1,-1,-1,-1,-1,...,0.0,-1,-1,-1,-1,-1,-1,-1.0,0.2,0.000517
11997,use_rEoce5pc0MlKpndfzNj1Tze3J1L,0.311688,1,0.378981,0.666667,-1,-1,-1,-1,-1,...,0.6,-1,-1,-1,-1,-1,-1,-1.0,0.4,0.009824
11998,use_0AuDL2wOJHy9I16zSC21TzeHRMo,0.272727,-1,0.378981,0.388626,1,-1,-1,-1,-1,...,0.2,-1,-1,-1,-1,-1,-1,-1.0,1.0,0.000259


Finally, we don't need anymore the column userId which is not useful for the algorithms

In [73]:
#We do not need the other columns and the userId
df_merged =  df_merged.drop(columns=['userId'])
df_merged

Unnamed: 0,age,gender,scentLover,ecoInterest,washDishes,MrPropre,Antikal,Ariel,Dash,pods,...,nbChildren,magasin,moyenneSurface,superMarket,hyperMarket,drive,hardDiscount,interested_by_fairypeps_email,level_of_interest_about_marketing,number_of_actions
0,0.077922,1,0.000000,0.388626,1,-1,-1,-1,-1,-1,...,0.2,-1,-1,-1,-1,-1,-1,-1.0,1.0,0.002068
1,0.636364,1,0.378981,0.666667,1,1,1,1,-1,1,...,0.6,-1,-1,-1,-1,-1,-1,-1.0,1.0,0.022234
2,0.350649,1,0.000000,0.666667,1,-1,-1,1,1,-1,...,0.2,-1,-1,-1,-1,-1,-1,-1.0,1.0,0.009824
3,0.220779,1,0.333333,0.333333,-1,-1,-1,1,1,1,...,0.4,-1,-1,-1,-1,-1,-1,-1.0,0.4,0.002327
4,0.259740,1,0.378981,0.000000,-1,1,1,1,1,-1,...,0.0,-1,-1,-1,-1,-1,-1,-1.0,1.0,0.936143
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11995,0.454545,1,0.378981,0.388626,1,-1,-1,-1,-1,1,...,1.0,-1,-1,-1,-1,-1,-1,1.0,1.0,0.043175
11996,0.324675,1,0.333333,0.333333,-1,-1,-1,-1,-1,-1,...,0.0,-1,-1,-1,-1,-1,-1,-1.0,0.2,0.000517
11997,0.311688,1,0.378981,0.666667,-1,-1,-1,-1,-1,-1,...,0.6,-1,-1,-1,-1,-1,-1,-1.0,0.4,0.009824
11998,0.272727,-1,0.378981,0.388626,1,-1,-1,-1,-1,-1,...,0.2,-1,-1,-1,-1,-1,-1,-1.0,1.0,0.000259


In [74]:
#Reindexing of the columns to have washDishes on the last column

df_merged = df_merged.reindex(columns=[
'userId',
'age',
'gender',
'scentLover',
'ecoInterest',
'MrPropre',
'Antikal',
'Ariel',
'Dash',
'pods',
'powder',
'liquid',
'electricToothbrush',
'likesPets',
'hasPet',
'daysSinceActivity',
'nbChildren',
'magasin',
'moyenneSurface',
'superMarket',
'hyperMarket',
'drive',
'hardDiscount',
'interested_by_fairypeps_email',
'level_of_interest_about_marketing',
'number_of_actions',
'washDishes',])

df_train = df_merged

In [75]:
#The final result is..
df_train

Unnamed: 0,userId,age,gender,scentLover,ecoInterest,MrPropre,Antikal,Ariel,Dash,pods,...,magasin,moyenneSurface,superMarket,hyperMarket,drive,hardDiscount,interested_by_fairypeps_email,level_of_interest_about_marketing,number_of_actions,washDishes
0,,0.077922,1,0.000000,0.388626,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1.0,1.0,0.002068,1
1,,0.636364,1,0.378981,0.666667,1,1,1,-1,1,...,-1,-1,-1,-1,-1,-1,-1.0,1.0,0.022234,1
2,,0.350649,1,0.000000,0.666667,-1,-1,1,1,-1,...,-1,-1,-1,-1,-1,-1,-1.0,1.0,0.009824,1
3,,0.220779,1,0.333333,0.333333,-1,-1,1,1,1,...,-1,-1,-1,-1,-1,-1,-1.0,0.4,0.002327,-1
4,,0.259740,1,0.378981,0.000000,1,1,1,1,-1,...,-1,-1,-1,-1,-1,-1,-1.0,1.0,0.936143,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11995,,0.454545,1,0.378981,0.388626,-1,-1,-1,-1,1,...,-1,-1,-1,-1,-1,-1,1.0,1.0,0.043175,1
11996,,0.324675,1,0.333333,0.333333,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1.0,0.2,0.000517,-1
11997,,0.311688,1,0.378981,0.666667,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1.0,0.4,0.009824,-1
11998,,0.272727,-1,0.378981,0.388626,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1.0,1.0,0.000259,1
