# EI ST4

## Imports

In [None]:
import pandas as pd
from datetime import datetime
import math

In [None]:
# Load `DS_CentraleSupelec_test.csv` into the working DataFrame used by every cell below.
df_test = pd.read_csv(r"DS_CentraleSupelec_ST42021/DS_CentraleSupelec_test.csv")

## Cleaning up the dataframe

In [None]:
# Display the freshly loaded frame for a first visual inspection.
df_test

Get unique count for each variable

In [None]:
# Number of distinct values per column; columns with a single value carry no information.
df_test.nunique()

Check variable data type

In [None]:
# dtype pandas assigned to each column when reading the CSV.
df_test.dtypes

We can remove the `languageCode` and `countryCode` columns as they are constant (the check below confirms it).

In [None]:
# True only when no row deviates: every language code is 'fr'/'FR' and every
# country code is 'FRA'. `bool(...)` keeps the displayed value a plain True/False.
language_is_french = df_test.languageCode.isin(['fr', 'FR']).all()
bool(language_is_french and df_test.countryCode.eq('FRA').all())

In [None]:
# These two columns were judged constant by the check above, so they are removed.
constant_columns = ['languageCode', 'countryCode']
df_test = df_test.drop(constant_columns, axis='columns')

We will also remove `userId`, `registrationDate`, `reactivationValue`, `emailContactable` and `postalContactable` as they are irrelevant, and `postalCode` as it would be too hard to analyse.

In [None]:
# Columns considered irrelevant for the analysis, plus `postalCode`.
irrelevant_columns = [
    'userId',
    'registrationDate',
    'reactivationValue',
    'emailContactable',
    'postalContactable',
    'postalCode',
]
df_test = df_test.drop(irrelevant_columns, axis='columns')

In the `MrPropreTrier`, `AntikalTrier`, `ArielTrier` and `DashTrier` columns we replace `Known Trier` with `1` and every other value (including `NaN`) with `-1`. The resulting columns are named without the `Trier` suffix.

In [None]:
# Encode each "<Brand>Trier" column as a numeric "<Brand>" column:
# 1 where the value is 'Known Trier', -1 for anything else (missing values included).
trier_suffix = 'Trier'
for c in ['MrPropreTrier', 'AntikalTrier', 'ArielTrier', 'DashTrier']:
    # Compare the single column directly instead of calling DataFrame.apply(axis=1),
    # which builds a full row object for every record just to read one field.
    df_test[c[:-len(trier_suffix)]] = df_test[c].eq('Known Trier').map({True: 1, False: -1})
    # The textual source column is no longer needed.
    df_test = df_test.drop(columns=[c])

We will replace `detergentType` by `liquid`, `pods` and `powder` 

In [None]:
# One flag column per detergent kind, created in the order pods, powder, liquid.
# A flag is 1 when the kind's name appears in `detergentType`, -1 otherwise
# (missing values give -1).
for detergent_kind in ('Pods', 'Powder', 'Liquid'):
    df_test[detergent_kind.lower()] = df_test['detergentType'].apply(
        # `kind=detergent_kind` freezes the current loop value inside the lambda
        lambda value, kind=detergent_kind: 1 if pd.notna(value) and kind in value else -1
    )

# The source column has been fully encoded and can go.
df_test = df_test.drop(columns=['detergentType'])

Replacing `toothBrushType` with `electricToothbrush`

In [None]:
# 1 when the toothbrush type is exactly 'Electric', -1 otherwise (missing values included).
# The column is compared directly rather than through a row-wise DataFrame.apply(axis=1),
# which materialises every row only to read a single field.
df_test['electricToothbrush'] = df_test['toothBrushType'].eq('Electric').map({True: 1, False: -1})

# The textual source column is no longer needed.
df_test = df_test.drop(columns=['toothBrushType'])

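Replacing `petOwner` with two columns: `likesPets`, a score between `0` and `1` derived from the exact answer, and `hasPet`, which is `1` if the first 3 characters of the answer are 'Yes' or 'Oui' and `-1` otherwise.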

In [None]:
# Known `petOwner` answers grouped by the affinity score they map to,
# from the most to the least pet-friendly.
_PET_AFFINITY_SCALE = (
    (('Yes, we love our furry friends', "Oui, j'adore nos petites boules de poils !"), 1),
    (('Yes',), 0.9),
    (("Non, j'aime les animaux, mais je n'en ai pas pour le moment.", 'No, we love animals but no pets here for now'), 0.8),
    (('No - maybe future',), 0.5),
    (('No',), 0.3),
    (('Des animaux dans la maison ? Non merci !', "No, we'd never have pets in the house"), 0),
)


def likesPets(s:str) -> float:
    """Turn a `petOwner` answer into a pet-affinity score.

    Args:
        s: the raw answer (one of the known English or French sentences).

    Returns:
        The score attached to the answer, between 0 (never wants pets) and
        1 (loves its pets), or None when the answer is not one of the known
        sentences (for example a missing value).
    """
    for answers, score in _PET_AFFINITY_SCALE:
        if s in answers:
            return score
    return None

# Raw survey answers, read once and reused for both derived columns.
pet_answers = df_test['petOwner']

# Affinity score for animals, as computed by `likesPets`.
df_test['likesPets'] = pet_answers.apply(likesPets)

# Pet ownership flag: answers starting with 'Yes' or 'Oui' give 1, anything else -1.
df_test['hasPet'] = pet_answers.apply(
    lambda answer: 1 if str(answer).startswith(('Yes', 'Oui')) else -1
)

# The raw answers are fully encoded, so the source column is dropped.
df_test = df_test.drop(columns=['petOwner'])

Transforms an `ISO 8601` time string into the number of days elapsed since that date (only the `YYYY-MM-DD` part of the string is used).

In [None]:
def timeStringToDelta(timeString: str, now: "datetime | None" = None) -> "int | None":
    """Number of whole days elapsed between a date string and `now`.

    Only the first 10 characters (`YYYY-MM-DD`) of `timeString` are parsed, so
    the date is taken at midnight and any time or offset suffix is ignored.

    Args:
        timeString: an ISO 8601 date or date-time string, or a missing value.
        now: naive datetime used as the reference instant. Defaults to the
            current UTC time; pass a fixed value to get reproducible results.

    Returns:
        The elapsed days truncated toward zero, or None when `timeString`
        is missing (NaN / None).
    """
    if pd.isna(timeString):
        return None

    if now is None:
        # Local import: the notebook's import cell only brings in `datetime`.
        from datetime import timezone
        # `datetime.utcnow()` is deprecated; this yields the same naive UTC value.
        now = datetime.now(timezone.utc).replace(tzinfo=None)

    start = datetime.strptime(timeString[:10], "%Y-%m-%d")
    # Subtract the naive datetimes directly. Going through `.timestamp()` would
    # interpret both as local time and let a DST change shift the day count.
    return int((now - start).total_seconds() / 3600 / 24)

Replaces `lastActivityDate` with `daysSinceActivity`: the number of elapsed days since the last activity, min-max normalized to the range `0` to `1`

In [None]:
# Days elapsed since the last recorded activity (missing dates give NaN).
df_test['daysSinceActivity'] = df_test['lastActivityDate'].apply(timeStringToDelta)

# Min-max normalise the new column; NaN rows stay NaN at this point.
minV = df_test['daysSinceActivity'].min()
maxV = df_test['daysSinceActivity'].max()
df_test['daysSinceActivity'] = (df_test['daysSinceActivity'] - minV) / (maxV - minV)

# Replace NaN by the column mean. `fillna` returns a new Series, so the result
# must be assigned back; the previous bare call discarded it and left the NaNs in.
df_test['daysSinceActivity'] = df_test['daysSinceActivity'].fillna(
    df_test['daysSinceActivity'].mean(skipna=True)
)

# Remove the source column now that it has been encoded.
df_test = df_test.drop(columns=['lastActivityDate'])

Replaces `age` with a normalized column

In [None]:
# Min-max normalise `age`; NaN rows stay NaN at this point.
minV = df_test['age'].min()
maxV = df_test['age'].max()
df_test['age'] = (df_test['age'] - minV) / (maxV - minV)

# Replace NaN by the column mean. `fillna` returns a new Series, so the result
# must be assigned back; the previous bare call discarded it and left the NaNs in.
df_test['age'] = df_test['age'].fillna(df_test['age'].mean(skipna=True))

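The values handled for `numberChildren` are `'0'`, `'1'`, `'2'`, `'3'`, `'4'`, `'4+'` and `NaN`. We create a column `nbChildren` of type `float` where the count is scaled to the range `0` to `1` in steps of `1/5` (`'4+'` maps to `1`); `NaN` and any other value are first mapped to `None`, then replaced by the column average.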

In [None]:
def childrenMagik(children: str) -> float:
    """Scale a `numberChildren` label to a number between 0 and 1.

    Args:
        children: one of '0', '1', '2', '3', '4' or '4+'.

    Returns:
        The count divided by 5, with '4+' counted as 5, or None for any
        other value (for example a missing one).
    """
    scale = {
        '0': 0,
        '1': 1 / 5,
        '2': 2 / 5,
        '3': 3 / 5,
        '4': 4 / 5,
        '4+': 1,
    }
    # Only string labels can match; NaN and other non-string values give None.
    if isinstance(children, str):
        return scale.get(children)
    return None

# Scaled number of children; labels `childrenMagik` does not know become NaN.
children_scaled = df_test['numberChildren'].apply(childrenMagik)

# Missing entries take the average of the known ones.
df_test['nbChildren'] = children_scaled.fillna(children_scaled.mean(skipna=True))

# The textual source column is no longer needed.
df_test = df_test.drop(columns=['numberChildren'])

Replaces `F` with `1` and `M` with `-1` in the `gender` column

In [None]:
# 1 for "F", -1 for every other value (so "M" and missing values both give -1).
df_test['gender'] = df_test['gender'].eq("F").map({True: 1, False: -1})

Changes `ecoInterest`, replacing `Low`, `Medium`, `High` and `Very high` with a scale going from `0` to `1`. We then replace `nan` with the average.

In [None]:
def textScaleToFloat(s:str) -> float:
    """Convert an `ecoInterest` level into a number between 0 and 1.

    Note: a later cell of this notebook defines another function with the
    same name for a different scale, which replaces this one once it has run.

    Args:
        s: 'Low', 'Medium', 'High' or 'Very high'.

    Returns:
        0, 1/3, 2/3 or 1 respectively, or None for any other value
        (for example a missing one).
    """
    levels = {
        'Very high': 1,
        'High': 2 / 3,
        'Medium': 1 / 3,
        'Low': 0,
    }
    # Only string labels can match; NaN and other non-string values give None.
    if isinstance(s, str):
        return levels.get(s)
    return None

# Numeric version of the textual levels; unknown or missing levels become NaN.
eco_scores = df_test['ecoInterest'].apply(textScaleToFloat)

# Missing entries take the average of the known ones.
df_test['ecoInterest'] = eco_scores.fillna(eco_scores.mean(skipna=True))

Changes `scentLover`, replacing `NonUser`, `Low`, `Medium` and `High` with a scale going from `0` to `1`. We then replace `nan` with the average.

In [None]:
def textScaleToFloat(s:str) -> float:
    """Convert a `scentLover` level into a number between 0 and 1.

    Note: this replaces the function of the same name defined in an earlier
    cell for the `ecoInterest` scale.

    Args:
        s: 'NonUser', 'Low', 'Medium' or 'High'.

    Returns:
        0, 1/3, 2/3 or 1 respectively, or None for any other value
        (for example a missing one).
    """
    levels = {
        'High': 1,
        'Medium': 2 / 3,
        'Low': 1 / 3,
        'NonUser': 0,
    }
    # Only string labels can match; NaN and other non-string values give None.
    if isinstance(s, str):
        return levels.get(s)
    return None

# Numeric version of the textual levels; unknown or missing levels become NaN.
scent_scores = df_test['scentLover'].apply(textScaleToFloat)

# Missing entries take the average of the known ones.
df_test['scentLover'] = scent_scores.fillna(scent_scores.mean(skipna=True))

Handles the closest shop

In [None]:
# One flag column per shop category: 1 when `closestShop` is that category,
# -1 otherwise (missing values give -1 in every column).
closest_shop = df_test['closestShop']

# The first label ends with an accented character that the previous literal
# held as U+FFFD (the Unicode replacement character). An exact comparison only
# works for one particular encoding of the data, so this label is matched on
# its unambiguous prefix instead.
df_test['magasin'] = closest_shop.apply(
    lambda shop: 1 if isinstance(shop, str) and shop.startswith('1. Magasin de Proximit') else -1
)

# The remaining labels are plain ASCII and are matched exactly.
shop_labels = {
    'moyenneSurface': '2. Moyenne Surface',
    'superMarket': '3. SuperMarket',
    'hyperMarket': '4. HyperMarket',
    'drive': '5. Drive',
    'hardDiscount': '6. Hard Discounter',
}
for shop_column, shop_label in shop_labels.items():
    df_test[shop_column] = closest_shop.eq(shop_label).map({True: 1, False: -1})

# The source column has been fully encoded and can go.
df_test = df_test.drop(columns=['closestShop'])

In [None]:
# Persist the cleaned frame next to the raw data.
# NOTE(review): no `index=` argument is passed, so pandas' default applies and the
# row index should be written as an extra leading column. Confirm that is intended.
df_test.to_csv('./DS_CentraleSupelec_ST42021/clean.csv')  

In order to have more information about customers, we are going to add features, using the data set "consumer actions"


In [None]:
# NOTE(review): this binds a second name to the same DataFrame object, not a copy.
# In-place changes made through `df_test_test` will also be visible through
# `df_test`. Confirm that this aliasing is intended.
df_test_test = df_test