# EI ST4

## Imports

In [1]:
from datetime import datetime, timezone

import pandas as pd

In [2]:
# Load the training split; the path is relative to the notebook's working directory.
# NOTE(review): the preview of this frame shows an `Unnamed: 0` column, which suggests the
# CSV's first column is a saved row index - consider `index_col=0` if it is not a feature.
df = pd.read_csv(
    filepath_or_buffer=r"DS_CentraleSupelec_ST42021/DS_CentraleSupelec_train.csv",
)

## Cleaning up the dataframe

In [3]:
# Preview the raw frame; the saved output shows only the first and last rows.
df

Unnamed: 0,userId,age,gender,registrationDate,languageCode,countryCode,emailContactable,postalContactable,postalCode,numberChildren,...,detergentType,MrPropreTrier,AntikalTrier,ArielTrier,DashTrier,scentLover,petOwner,ecoInterest,closestShop,washDishes
0,use_XbA1FTDcCrTMNTHK1851TzjyPMP,24,F,2019-10-14T17:33:37Z,fr,FRA,True,True,68540,1,...,Liquid,,,,,NonUser,Yes,,,Auto
1,use_3WHgsMVGSg5MHG2zja91TzdfmY2,67,F,2017-07-25T07:00:31Z,fr,FRA,True,True,17390,3,...,Pods,Known Trier,Known Trier,Known Trier,,,No,High,,Auto
2,use_2SisOiR8QwDaHI4svm11TzcvK7V,45,F,2015-06-25T00:00:00Z,fr,FRA,True,True,30200,1,...,Liquid,,,Known Trier,Known Trier,NonUser,Yes,High,,Auto
3,use_W0T1LmfKaQPJYD1RTWh1Tzdc69H,35,F,2018-01-31T07:24:39Z,fr,FRA,True,True,61400,2,...,Pods,,,Known Trier,Known Trier,Low,Yes,Medium,,Hand
4,use_KwYWZ7UMl4veveOaPGK1TzddtmI,38,F,2016-01-26T11:42:17Z,fr,FRA,True,True,30620,0,...,Liquid,Known Trier,Known Trier,Known Trier,Known Trier,,Yes,Low,,Hand
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11995,use_ICSyEvCcRNPpQeBdWcM1TzdczV8,53,F,2015-03-11T20:03:16Z,fr,FRA,True,True,77240,4+,...,Pods,,,,,,Yes,,,Auto
11996,use_Q5DNYeSmG7g6VeMV4n71TzeTRRz,43,F,2015-10-10T19:16:19Z,fr,FRA,True,True,45120,0,...,Liquid,,,,,Low,Yes,Medium,,Hand
11997,use_rEoce5pc0MlKpndfzNj1Tze3J1L,42,F,2015-05-08T02:59:34Z,fr,FRA,True,True,35170,3,...,Liquid,,,,,,Yes,High,,Hand
11998,use_0AuDL2wOJHy9I16zSC21TzeHRMo,39,M,2019-05-20T11:59:02Z,fr,FRA,True,True,45160,1,...,Liquid,,,,,,Yes,,,Auto


We can remove the `languageCode` and `countryCode` columns, as they are constant: every row is `fr`/`FR` and `FRA` respectively (checked in the next cell)

In [4]:
# True only when every row has a French language code AND the country code 'FRA',
# i.e. when neither column distinguishes any user from another.
bool(
    df['languageCode'].isin(['fr', 'FR']).all()
    and df['countryCode'].eq('FRA').all()
)

True

In [5]:
# Remove the two locale columns from the working frame.
df = df.drop(['languageCode', 'countryCode'], axis='columns')

We will also remove the `userId`, `registrationDate`, `reactivationValue`, `emailContactable` and `postalContactable` columns, as they are irrelevant

In [6]:
# Remove the columns that the notebook treats as irrelevant.
df = df.drop(
    [
        'userId',
        'registrationDate',
        'reactivationValue',
        'emailContactable',
        'postalContactable',
    ],
    axis='columns',
)

We are going to replace the `washDishes` `STRING` column with an `ownsDishwasher` `INT` column

In [7]:
# Encode `washDishes` as a 0/1 flag: 1 when the value is exactly 'Auto', 0 for anything
# else (missing values included). The comparison is vectorised rather than a row-wise
# `apply`, which avoids one Python-level call per row; values and dtype are unchanged.
df['ownsDishwasher'] = df['washDishes'].eq('Auto').astype('int64')
df = df.drop(columns=['washDishes'])

In the `MrPropreTrier`, `AntikalTrier`, `ArielTrier` and `DashTrier` columns, we will replace `Known Trier` with `1` and `NaN` with `0`, and drop the `Trier` suffix from the column names

In [8]:
# Turn each `<Brand>Trier` column into a 0/1 `<Brand>` column: 1 when the value is exactly
# 'Known Trier', 0 otherwise (missing values included). The comparison is vectorised
# rather than a row-wise `apply`, which avoids one Python-level call per row.
for trier_column in ['MrPropreTrier', 'AntikalTrier', 'ArielTrier', 'DashTrier']:
    # Strip the trailing 'Trier' to get the brand name used for the new column.
    brand = trier_column[:-len('Trier')]
    df[brand] = df[trier_column].eq('Known Trier').astype('int64')
    df = df.drop(columns=[trier_column])

We will replace `detergentType` by `liquidDetergent`

In [9]:
# Encode `detergentType` as a 0/1 flag: 1 when the value is exactly 'Liquid', 0 for any
# other type or a missing value. Vectorised comparison instead of a row-wise `apply`
# (one Python-level call per row); values and dtype are unchanged.
df['liquidDetergent'] = df['detergentType'].eq('Liquid').astype('int64')
df = df.drop(columns=['detergentType'])

Replacing `toothBrushType` with `electricToothbrush`

In [10]:
# Encode `toothBrushType` as a 0/1 flag: 1 when the value is exactly 'Electric', 0 for any
# other type or a missing value. Vectorised comparison instead of a row-wise `apply`
# (one Python-level call per row); values and dtype are unchanged.
df['electricToothbrush'] = df['toothBrushType'].eq('Electric').astype('int64')
df = df.drop(columns=['toothBrushType'])

Replacing `petOwner` with `hasPet`

In [11]:
# Encode `petOwner` as a 0/1 flag: 1 when the value is exactly 'Yes', 0 otherwise.
# Note that a missing answer is therefore indistinguishable from 'No' afterwards.
# Vectorised comparison instead of a row-wise `apply` (one Python-level call per row).
df['hasPet'] = df['petOwner'].eq('Yes').astype('int64')
df = df.drop(columns=['petOwner'])

Transforms an `ISO 8601` time string into the number of days elapsed since that date

In [17]:
def timeStringToDelta(timeString: str, now: "datetime | None" = None) -> int:
    """Return the number of whole calendar days between a date string and `now`.

    Only the first 10 characters of `timeString` are used, so any string that starts
    with a `YYYY-MM-DD` date (such as an ISO 8601 timestamp) is accepted; the time of
    day in the string is ignored.

    Args:
        timeString: Text beginning with a `YYYY-MM-DD` date.
        now: Reference moment. Defaults to the current UTC time. Pass a fixed value to
            make the result reproducible between runs. A timezone-aware value is
            converted to UTC first; a naive value is used as given.

    Returns:
        `now`'s date minus the parsed date, in days (negative for future dates).

    Raises:
        ValueError: If the first 10 characters are not a valid `YYYY-MM-DD` date.
    """
    if now is None:
        # `datetime.utcnow()` is deprecated and returns a naive value; ask for UTC explicitly.
        now = datetime.now(timezone.utc)
    elif now.tzinfo is not None:
        now = now.astimezone(timezone.utc)

    activity_date = datetime.strptime(timeString[:10], "%Y-%m-%d").date()
    # Subtracting calendar dates avoids `.timestamp()` on naive datetimes, which would be
    # interpreted in the machine's local timezone and can be skewed by DST changes.
    return (now.date() - activity_date).days

Replaces `lastActivityDate` with `daysSinceActivity`: an `integer` representing the number of elapsed days since the last activity

In [18]:
# Replace the `lastActivityDate` string with the number of days elapsed since that date.
# The result was previously stored under an empty column name (''); it now uses the name
# given in the description of this step.
# NOTE(review): a previously saved output of this notebook shows this column as
# `durationSinceLastActivity` - update any later cell that still uses that name.
df['daysSinceActivity'] = df['lastActivityDate'].map(timeStringToDelta)
df = df.drop(columns=['lastActivityDate'])

In [19]:
# Preview the frame after the cleaning steps above.
df

Unnamed: 0,age,gender,postalCode,numberChildren,scentLover,ecoInterest,closestShop,ownsDishwasher,MrPropre,Antikal,Ariel,Dash,liquidDetergent,electricToothbrush,hasPet,durationSinceLastActivity
0,24,F,68540,1,NonUser,,,1,0,0,0,0,1,0,1,307
1,67,F,17390,3,,High,,1,1,1,1,0,0,0,0,27
2,45,F,30200,1,NonUser,High,,1,0,0,1,1,1,0,1,129
3,35,F,61400,2,Low,Medium,,0,0,0,1,1,0,0,1,339
4,38,F,30620,0,,Low,,0,1,1,1,1,1,1,1,157
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11995,53,F,77240,4+,,,,1,0,0,0,0,0,0,1,9
11996,43,F,45120,0,Low,Medium,,0,0,0,0,0,1,1,1,166
11997,42,F,35170,3,,High,,0,0,0,0,0,1,0,1,32
11998,39,M,45160,1,,,,1,0,0,0,0,1,1,1,259


In [16]:
# Scratch check: the first 10 characters of this ISO 8601 timestamp are its YYYY-MM-DD
# date part, which is the slice `timeStringToDelta` parses.
d = '2021-05-06T20:18:39.453599Z'
d[:10]

'2021-05-06'