# EI ST4

## Imports

In [None]:
import pandas as pd
from datetime import datetime
import math

In [None]:
df = pd.read_csv(r"DS_CentraleSupelec_ST42021/DS_CentraleSupelec_train.csv")

## Cleaning up the dataframe

In [None]:
df

Get unique count for each variable

In [None]:
df.nunique()

Check variable data type

In [None]:
df.dtypes

We can remove the `languageCode` and `countryCode` columns as they are constant

In [None]:
# Sanity check: True only when every row is French-language ('fr'/'FR') and has country code 'FRA'.
bool(
    df['languageCode'].isin(['fr', 'FR']).all()
    and df['countryCode'].eq('FRA').all()
)

In [None]:
# Constant columns carry no information for a model.
df = df.drop(['languageCode', 'countryCode'], axis='columns')

We will also remove `registrationDate`, `reactivationValue`, `emailContactable` and `postalContactable` as they are irrelevant, and `postalCode` as it would be too hard to analyse.

In [None]:
# Columns judged irrelevant (or, for postalCode, too hard to analyse) are removed in one pass.
df = df.drop(
    [
        'registrationDate',
        'reactivationValue',
        'emailContactable',
        'postalContactable',
        'postalCode',
    ],
    axis='columns',
)

We are going to replace the `washDishes` `STRING` column with an `INT` column

In [None]:
# Encode the target: +1 for 'Auto', -1 for every other value (missing values included).
df["washDishes"] = [1 if mode == 'Auto' else -1 for mode in df["washDishes"]]

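In the `MrPropreTrier`, `AntikalTrier`, `ArielTrier` and `DashTrier` columns we will replace `Known Trier` with `1` and `NaN` with `-1`. The resulting columns are named after the brand alone (`MrPropre`, `Antikal`, `Ariel`, `Dash`).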

In [None]:
# Each '<Brand>Trier' column becomes a '<Brand>' indicator: +1 for 'Known Trier', -1 otherwise.
for c in ['MrPropreTrier', 'AntikalTrier', 'ArielTrier', 'DashTrier']:
    brand = c[:-len('Trier')]
    df[brand] = [1 if status == 'Known Trier' else -1 for status in df[c]]
    # The raw column is fully encoded, so it can go.
    df = df.drop(columns=[c])

We will replace `detergentType` by `liquid`, `pods` and `powder` 

In [None]:
# One indicator column per detergent form: +1 when the form's label occurs in
# `detergentType`, -1 otherwise (a missing `detergentType` gives -1 for every form).
for column, label in (('pods', 'Pods'), ('powder', 'Powder'), ('liquid', 'Liquid')):
    df[column] = [
        1 if pd.notna(entry) and label in entry else -1
        for entry in df['detergentType']
    ]

# The source column is now fully encoded.
df = df.drop(columns=['detergentType'])

Replacing `toothBrushType` with `electricToothbrush`

In [None]:
# +1 for users with an 'Electric' toothbrush, -1 for anything else; the raw column is then dropped.
df = df.assign(
    electricToothbrush=df['toothBrushType'].apply(lambda kind: 1 if kind == 'Electric' else -1)
).drop(columns=['toothBrushType'])

Replacing `petOwner` with two columns: `likesPets`, a score between `0` and `1` graded from the answer, and `hasPet`, which is `1` if the first 3 characters of the answer are 'Yes' or 'Oui' and `-1` otherwise

In [None]:
def likesPets(s:str) -> float:
    """Grade a `petOwner` answer on a 0-to-1 "likes animals" scale.

    Returns the score attached to the answer, or None when the answer is
    missing or not one of the known English/French phrasings.
    """
    graded_answers = (
        (1, ('Yes, we love our furry friends', "Oui, j'adore nos petites boules de poils !")),
        (0.9, ('Yes',)),
        (0.8, ("Non, j'aime les animaux, mais je n'en ai pas pour le moment.", 'No, we love animals but no pets here for now')),
        (0.5, ('No - maybe future',)),
        (0.3, ('No',)),
        (0, ('Des animaux dans la maison ? Non merci !', "No, we'd never have pets in the house")),
    )
    for score, answers in graded_answers:
        if s in answers:
            return score
    return None

# Affinity score for animals (stays missing where the answer is absent or unrecognised).
df['likesPets'] = df['petOwner'].apply(likesPets)

# Ownership flag: +1 when the answer starts with 'Yes' or 'Oui', -1 otherwise.
df['hasPet'] = [
    1 if str(answer).startswith(('Yes', 'Oui')) else -1
    for answer in df['petOwner']
]

# Both features are derived, so the raw answer is no longer needed.
df = df.drop(columns=['petOwner'])

Transforms an `ISO 8601` time string into the number of days elapsed since that date

In [None]:
def timeStringToDelta(timeString: str, reference: "datetime | None" = None) -> "int | None":
    """Return the number of whole days between an ISO 8601 timestamp and a reference moment.

    Only the leading ``YYYY-MM-DD`` part of ``timeString`` is parsed; the time
    of day and any offset are ignored.

    Args:
        timeString: ISO 8601 date/datetime string. Missing values (None/NaN) are accepted.
        reference: naive datetime to measure from. Defaults to the current UTC
            time. Pass a fixed value to make the derived feature reproducible.

    Returns:
        Elapsed days truncated toward zero, or None when ``timeString`` is missing.
    """
    if pd.isna(timeString):
        return None
    if reference is None:
        # datetime.utcnow() is deprecated; this yields the same naive-UTC value.
        from datetime import timezone
        reference = datetime.now(timezone.utc).replace(tzinfo=None)
    parsed = datetime.strptime(timeString[:10], "%Y-%m-%d")
    # Subtract naive datetimes directly: going through .timestamp() would interpret
    # both values in the machine's local zone and make the result DST-dependent.
    return int((reference - parsed).total_seconds() / 3600 / 24)

Replaces `lastActivityDate` with `daysSinceActivity`: the number of elapsed days since the last activity, normalized to the range `0`–`1`

In [None]:
# Days elapsed since each user's last recorded activity (missing when the date is missing).
df['daysSinceActivity'] = df['lastActivityDate'].apply(timeStringToDelta)

# Min-max normalisation to [0, 1]. min()/max() skip NaN, so missing entries stay missing here.
minV = df['daysSinceActivity'].min()
maxV = df['daysSinceActivity'].max()
df['daysSinceActivity'] = (df['daysSinceActivity'] - minV) / (maxV - minV)

# Impute missing entries with the column mean. fillna returns a new Series,
# so the result has to be assigned back or the imputation is lost.
df['daysSinceActivity'] = df['daysSinceActivity'].fillna(df['daysSinceActivity'].mean(skipna=True))

# The raw date column is no longer needed.
df = df.drop(columns=['lastActivityDate'])

Replaces `age` with a normalized column

In [None]:
# Min-max normalisation of `age` to [0, 1]. min()/max() skip NaN, so missing ages stay missing here.
minV = df['age'].min()
maxV = df['age'].max()
df['age'] = (df['age'] - minV) / (maxV - minV)

# Impute missing ages with the column mean. fillna returns a new Series,
# so the result has to be assigned back or the imputation is lost.
df['age'] = df['age'].fillna(df['age'].mean(skipna=True))

The possible values for `numberChildren` are `'1'`, `'2'`, `'3'`, `'4'`, `'4+'` and `'NaN'`. We will create a column `nbChildren` of type `float`, scaled from `0` to `1`, where `'NaN'` is first mapped to `None` and then replaced with the column average.

In [None]:
def childrenMagik(children: str) -> float:
    """Map a `numberChildren` label to a score between 0 and 1.

    '0' gives 0, '1'..'4' give 1/5..4/5 and '4+' gives 1. Any other value
    (including a missing one) gives None.
    """
    scale = (
        ('0', 0),
        ('1', 1 / 5),
        ('2', 2 / 5),
        ('3', 3 / 5),
        ('4', 4 / 5),
        ('4+', 1),
    )
    for label, score in scale:
        if children == label:
            return score
    return None

# Encode the child count, impute missing entries with the mean of the encoded
# values, then drop the raw labels.
encoded_children = df['numberChildren'].apply(childrenMagik)
df['nbChildren'] = encoded_children.fillna(encoded_children.mean(skipna=True))

df = df.drop(columns=['numberChildren'])

Replaces `F` with `1` and every other value (`M` or missing) with `-1` in the `gender` column

In [None]:
# +1 for "F"; every other value, including a missing one, becomes -1.
df['gender'] = [1 if value == "F" else -1 for value in df['gender']]

Changes `ecoInterest`, replacing `Low`, `Medium`, `High` and `Very high` with a scale going from `0` to `1`. We then replace `nan` with the avg.

In [None]:
def ecoInterestToFloat(s: str) -> float:
    """Map an `ecoInterest` label to a score between 0 and 1.

    'Low' -> 0, 'Medium' -> 1/3, 'High' -> 2/3, 'Very high' -> 1.
    Any other value (including a missing one) gives None.
    """
    if s == 'Very high': return 1
    if s == 'High': return 2 / 3
    if s == 'Medium': return 1 / 3
    if s == 'Low': return 0
    return None

# Kept so existing references to the generic name still resolve. This cell itself
# calls the dedicated name, so a later redefinition of the generic name cannot
# change the ecoInterest scale.
textScaleToFloat = ecoInterestToFloat

# replaces the labels with floats
df['ecoInterest'] = df['ecoInterest'].apply(ecoInterestToFloat)

# replaces the nan with the avg
df['ecoInterest'] = df['ecoInterest'].fillna(df['ecoInterest'].mean(skipna=True))

Changes `scentLover`, replacing `NonUser`, `Low`, `Medium` and `High` with a scale going from `0` to `1`. We then replace `nan` with the avg.

In [None]:
def scentLoverToFloat(s: str) -> float:
    """Map a `scentLover` label to a score between 0 and 1.

    'NonUser' -> 0, 'Low' -> 1/3, 'Medium' -> 2/3, 'High' -> 1.
    Any other value (including a missing one) gives None.
    """
    if s == 'High': return 1
    if s == 'Medium': return 2 / 3
    if s == 'Low': return 1 / 3
    if s == 'NonUser': return 0
    return None

# Kept so existing references to the generic name still resolve. This cell itself
# calls the dedicated name, so it does not depend on which definition of the
# generic name ran last.
textScaleToFloat = scentLoverToFloat

# replaces the labels with floats
df['scentLover'] = df['scentLover'].apply(scentLoverToFloat)

# replaces the nan with the avg
df['scentLover'] = df['scentLover'].fillna(df['scentLover'].mean(skipna=True))

Handles the closest shop

In [None]:
# One indicator column per shop category: +1 when `closestShop` carries that
# category's label, -1 otherwise (a missing `closestShop` gives -1 everywhere).
#
# Labels are matched by prefix rather than by equality. The first label ends in a
# character that does not survive encoding round-trips (presumably the "e acute"
# of "Proximite" -- confirm against the raw file), so only the stable part is compared.
shop_label_prefixes = {
    'magasin': '1. Magasin de Proximit',
    'moyenneSurface': '2. Moyenne Surface',
    'superMarket': '3. SuperMarket',
    'hyperMarket': '4. HyperMarket',
    'drive': '5. Drive',
    'hardDiscount': '6. Hard Discounter',
}

for shop_column, label_prefix in shop_label_prefixes.items():
    # `prefix=label_prefix` binds the current value; a plain closure would see only the last one.
    df[shop_column] = df['closestShop'].apply(
        lambda e, prefix=label_prefix: 1 if isinstance(e, str) and e.startswith(prefix) else -1
    )

# removes extra column
df = df.drop(columns=['closestShop'])

In [None]:
# Persist the cleaned training frame next to the raw data.
# NOTE(review): with pandas' default settings the row index is written as an extra
# unnamed first column -- confirm whether downstream readers expect it.
df.to_csv('./DS_CentraleSupelec_ST42021/clean.csv')  

# Using consumeractions data set

In order to have more information about customers, we are going to add features using the "consumer actions" data set.


In [None]:
# Load the consumer-actions data set.
df_consumer_actions = pd.read_csv(r"DS_CentraleSupelec_ST42021/DS_CentraleSupelec_consumeractions.csv")

# Distinct values taken by the 'event' column.
df_consumer_actions['event'].unique()



In [None]:
df_consumer_actions.brandName.unique()

In [None]:
### First step : we will determine if a user has had an action related to the product 'Fairy PEPS'. We have seen in the list of brands that Fairy PEPS can be written 'Fairy PEPS Platinum+ Tout-en-1' or 'Fairy PEPS Tout-en-1 Plus'

In [None]:
# The two exact brand labels under which 'Fairy PEPS' appears in the brand columns.
target_brand1, target_brand2 = (
    'Fairy PEPS Platinum+ Tout-en-1',
    'Fairy PEPS Tout-en-1 Plus',
)

In [None]:
# For every action row, flag whether each brand column names a Fairy PEPS product.
fairy_peps_labels = [target_brand1, target_brand2]
df_consumer_actions['brandName_bool'] = df_consumer_actions['brandName'].isin(fairy_peps_labels)
df_consumer_actions['brandName2_bool'] = df_consumer_actions['brandName2'].isin(fairy_peps_labels)

# 'brand' is +1 when either brand column matched and -1 otherwise.
df_consumer_actions['brand'] = (
    df_consumer_actions['brandName_bool'] | df_consumer_actions['brandName2_bool']
).map({True: 1, False: -1})




In [None]:
# Score how engaged a user is with the emails, coupons or products, based on the kind of action.

def levelOfInterestAboutMarketing(action: str) -> float:
    """Map an `event` label to an engagement score between 1/5 and 1.

    Opening an email scores lowest (1/5) and every coupon-related event scores
    highest (1). Any label not listed (including a missing one) gives None.
    """
    ladder = (
        (('Email Opened',), 1/5),
        (('Email Clicked',), 2/5),
        (('Search Site',), 3/5),
        (('Product Reviewed',), 4/5),
        (('Request Coupon Print', 'Request Coupon Add To Basket', 'Coupon Redemption'), 1),
    )
    for events, score in ladder:
        if action in events:
            return score
    return None

# Score every action row, then display the augmented frame.
df_consumer_actions['level_of_interest_about_marketing'] = (
    df_consumer_actions['event'].map(levelOfInterestAboutMarketing)
)
df_consumer_actions

In [None]:
# Each user appears on several rows, so the rows are aggregated per userId:
#   - the maximum of 'brand' and of 'level_of_interest_about_marketing' for the user;
#   - the number of non-null 'event' entries, i.e. how many actions the user performed.
# Several columns must be selected with a list ([[...]]): the bare tuple form
# groupby(...)["a", "b"] is deprecated and raises in pandas >= 2.0.
actions_by_user = df_consumer_actions.groupby(['userId'], sort=False, as_index=False)
df_consumer_actions_cleaned_one = actions_by_user[['brand', 'level_of_interest_about_marketing']].max()
df_consumer_actions_cleaned_two = actions_by_user['event'].count()


In [None]:

# Attach the per-user action count to the per-user maxima, joining on userId.
df_consumer_actions_cleaned = pd.merge(
    df_consumer_actions_cleaned_one,
    df_consumer_actions_cleaned_two,
    how='left',
    left_on='userId',
    right_on='userId',
)

In [None]:
# The action count is an integer; rescale it linearly (min-max) so that it runs from 0 to 1.
minV = df_consumer_actions_cleaned['event'].min()
maxV = df_consumer_actions_cleaned['event'].max()
df_consumer_actions_cleaned['event'] = (
    df_consumer_actions_cleaned['event'].sub(minV).div(maxV - minV)
)

In [None]:
#Let's check the results
df_consumer_actions_cleaned

In [None]:
# Give the aggregated columns names that say what they measure.
df_consumer_actions_cleaned = df_consumer_actions_cleaned.rename(
    columns={
        "brand": "interested_by_fairypeps_email",
        "event": "number_of_actions",
    }
)


In [None]:
df_consumer_actions_cleaned

In [None]:
# Left-join the per-user action features onto the cleaned training frame, so every
# training row is kept (users without any recorded action get missing values).
df_merged = pd.merge(
    df,
    df_consumer_actions_cleaned,
    how='left',
    left_on='userId',
    right_on='userId',
)

# Shapes of both inputs and of the result, to check the join at a glance.
for frame in (df, df_consumer_actions_cleaned, df_merged):
    print(frame.shape)

In [None]:
#Let's check the result
df_merged

Finally, we no longer need the `userId` column, which is not useful for the algorithms

In [None]:
# userId only served as the join key and is not a feature: drop it, then display the frame.
df_merged = df_merged.drop('userId', axis='columns')
df_merged

In [None]:
# Put the columns in a fixed order, with the target `washDishes` last.
# reindex keeps only the labels listed here and creates an all-NaN column for any
# label that is missing from the frame. 'userId' is therefore left out: it has
# already been dropped, and listing it would bring it back as an empty column.
df_merged = df_merged.reindex(columns=[
'age',
'gender',
'scentLover',
'ecoInterest',
'MrPropre',
'Antikal',
'Ariel',
'Dash',
'pods',
'powder',
'liquid',
'electricToothbrush',
'likesPets',
'hasPet',
'daysSinceActivity',
'nbChildren',
'magasin',
'moyenneSurface',
'superMarket',
'hyperMarket',
'drive',
'hardDiscount',
'interested_by_fairypeps_email',
'level_of_interest_about_marketing',
'number_of_actions',
'washDishes',])

df_train = df_merged

In [None]:
#The final result is..
df_train