# **This is the collaborative code for the group project QTEM Data Challenge**

# ____ __________________ _________ _______ __________ ______________ 

## 1. Install and import packages

In [15]:
from pandas_profiling import ProfileReport
import numpy as np
import scipy as sp
from scipy import linalg
from scipy import optimize
from scipy import interpolate
import sympy as smp
from scipy.special import erfinv
import pandas as pd
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
from statsmodels.discrete.discrete_model import Logit, Probit, MNLogit
import statsmodels.formula.api as smf
import warnings
warnings.filterwarnings('ignore')

## 2. Importing data and defining datasets

In [16]:
%%time
df_Calls = pd.read_csv("b. CARTIER_CALLS.csv")
df_Clienteling = pd.read_csv("c. CARTIER_CLIENTELING.csv")
df_Livechat = pd.read_csv("d. CARTIER_LIVECHAT.csv")
df_PrevSales = pd.read_csv("e. CARTIER_PREVIOUS_SALES.csv")
df_Sales = pd.read_csv("f. CARTIER_SALES.csv")
df_Wishlist = pd.read_csv("g. CARTIER_WISHLIST.csv")

Wall time: 13.7 s


## Merging datasets

In [17]:
# Merged sales and previous sales dataset with extra column [in_salesdataset] to indicate to which original dataset 
# the data belong

df_Sales['in_salesdataset']=1
df_PrevSales['in_salesdataset']=0

# Rename column from Sale for the concat of datasets

df_Sales.rename(columns = {'articleA':'ArticleA'},inplace = True)

df_AllSales = pd.concat([df_Sales, df_PrevSales])
df_AllSales = df_AllSales.sort_values(by='ClientID', ascending= True)

#to export a dataframe to excel (for siqi): 

#df_BLABLABLA.to_csv('df_BLABLABLA.csv')

In [18]:
# Filter AllSales to drop sales that happened before 2010

df_AllSalesFilt=df_AllSales.drop(df_AllSales[(df_AllSales['TransactionDate_FYYYY']<2010)].index)

In [19]:
df_AllSalesPlot=df_AllSales[df_AllSales.nb_days_since_last_sale>1460]
print(df_AllSalesPlot)

                                  InvoiceHeader   Channel TransactionDate  \
122626                                      NaN  Boutique      2019-12-14   
134321                                      NaN  Boutique      2019-12-14   
125277                                      NaN  Boutique      2019-12-14   
130122                                      NaN  Boutique      2020-01-12   
127757                                      NaN  Boutique      2017-01-07   
...                                         ...       ...             ...   
170014  303707220911003-GEMINI-/BIC/AZRIRBARP00  Boutique      2022-09-11   
450007  303707220911003-GEMINI-/BIC/AZRIRBARP00  Boutique      2022-09-11   
863207  303705200821005-GEMINI-/BIC/AZRIRBARP00  Boutique      2020-08-21   
427661  320403220911002-GEMINI-/BIC/AZRIRBARP00  Boutique      2022-09-11   
424837  321502220911006-GEMINI-/BIC/AZRIRBARP00  Boutique      2022-09-11   

        TransactionDate_FYYYY TransactionCategory            ClientID  \
12

In [20]:
# Filter out AllSales to drop transaction that arent Sales

df_AllSalesFilt=df_AllSalesFilt.drop(df_AllSalesFilt[(df_AllSalesFilt['TransactionCategory']!='Sale')].index)

In [21]:
print(df_AllSalesFilt.TransactionDate_FYYYY.unique())
print(df_AllSalesFilt.shape[0])
print(df_AllSales.shape[0])

[2020 2023 2022 2019 2017 2021 2016 2013 2018 2015 2010 2012 2014 2011]
1569771
2173083


In [22]:
df_AllSales = df_AllSalesFilt

## Renaming and uniformizing some columns

In [23]:
# First let's compare all our column names
datasets = [df_Calls,
df_Clienteling,
df_Livechat,
df_Wishlist,]

# Renaming

df_Clienteling.rename(columns = {'clientID':'ClientID'}, inplace = True)
df_Livechat.rename(columns = {'cLientid':'ClientID'}, inplace = True)
df_Wishlist.rename(columns = {'clientID':'ClientID'}, inplace = True)


#print(list(df_AllSales),"\n\n", list(df_Calls),"\n\n", list(df_Clienteling),"\n\n", list(df_Livechat),"\n\n", list(df_Wishlist))

## Defining a sample dataset

In [24]:
# Define a sample dataset to work with for speed purposes if necessary (3% of size)

#df_AllSalesSample = df_AllSales.head(int(round((len(df_allsales)/(100)),0)))
# How to get a random sample of the dataset ??
#df_AllSalesSample=df_AllSales.sample(frac=0.03)
#print(df_AllSalesSample.head(2))

## 2.1 Cleaning the data

In [25]:
# Remove duplicates
df_AllSalesWithDuplicates = df_AllSales
total_rows_before = df_AllSalesWithDuplicates.shape[0]
total_rows_before

# We have 1 111 279 rows in All_Sales dataset with duplicate data

1569771

In [26]:
# Remove duplicates

df_AllSales.drop_duplicates(inplace=True)

total_rows_after = df_AllSales.shape[0]
duplicates_count = total_rows_before - total_rows_after
duplicates_count

71

In [27]:
# Drop irrelevant columns

df_AllSales = df_AllSales.drop(['Hier_Lev_3_txtA', 'Hier_Lev_4_txtA', 'Hier_Lev_5_txtA'], axis=1)

In [28]:
# Missing values

df_AllSales.isna().sum()

InvoiceHeader                 5582
Channel                          0
TransactionDate                  0
TransactionDate_FYYYY            0
TransactionCategory              0
ClientID                         0
AgeAtTransaction            638287
Gender                        1813
PersonBirthDate             528936
WeddingDate                1407946
SpokenLanguage                1334
WrittenLanguage               1334
FirstSalesDate                 445
FirstTransactionDate           245
ProductCategory               1177
ProductSubCategory            1177
ProductFunction               1177
Turnover                     74074
quantity                         0
seq_sales_trs                    0
nb_days_since_last_sale     458147
PurchasedMarketA                 0
PurchasedRegionA                 0
ResidencyRegionA                 0
ResidencyMarketA                 0
BoutiqueNameA               513011
ResidencyCountryA                6
nationalityA               1358025
ArticleA            

In [36]:
# Replace Nans in n_days_since_last_sale by 0 if Transaction date is the same as first sale date

#df.Z=df.apply(lambda row: 0 if row.Z == "NA" and row.X == row.Y else row.Z, axis=1)

df_AllSales.nb_days_since_last_sale=df_AllSales.apply(lambda row: 0 if pd.isnull(row.nb_days_since_last_sale) == True and row.TransactionDate == row.FirstSalesDate else row.nb_days_since_last_sale, axis=1)



In [37]:
df_AllSales[df_AllSales.nb_days_since_last_sale == 0][['nb_days_since_last_sale','FirstSalesDate','TransactionDate']]

Unnamed: 0,nb_days_since_last_sale,FirstSalesDate,TransactionDate
7646,0.0,2016-12-10,2016-12-10
129793,0.0,2020-01-08,2020-01-08
19664,0.0,2015-09-26,2015-09-26
18604,0.0,2015-09-26,2015-09-26
113964,0.0,2019-03-05,2019-03-05
...,...,...,...
98020,0.0,2017-08-27,2017-08-27
357224,0.0,2022-07-18,2022-07-18
1468692,0.0,2022-06-22,2022-06-22
92232,0.0,2020-07-22,2020-07-22


In [38]:
df_AllSales.isna().sum()

InvoiceHeader                 5582
Channel                          0
TransactionDate                  0
TransactionDate_FYYYY            0
TransactionCategory              0
ClientID                         0
AgeAtTransaction            638287
Gender                        1813
PersonBirthDate             528936
WeddingDate                1407946
SpokenLanguage                1334
WrittenLanguage               1334
FirstSalesDate                 445
FirstTransactionDate           245
ProductCategory               1177
ProductSubCategory            1177
ProductFunction               1177
Turnover                     74074
quantity                         0
seq_sales_trs                    0
nb_days_since_last_sale      17554
PurchasedMarketA                 0
PurchasedRegionA                 0
ResidencyRegionA                 0
ResidencyMarketA                 0
BoutiqueNameA               513011
ResidencyCountryA                6
nationalityA               1358025
ArticleA            

In [39]:
filta = df_AllSales['nb_days_since_last_sale'] > 1460
df_AllSales[filta]['nb_days_since_last_sale'].value_counts().sum()

101384

In [13]:
# Changin DataTypes

df_AllSales['Turnover'] = pd.to_numeric(df_AllSales.Turnover, errors='coerce')
df_AllSales.Turnover.apply(np.round).astype('Int64')
#df_AllSales['AgeAtTransaction'] = df_AllSales['AgeAtTransaction'].astype(int)

122626       66
134321       81
9874        447
125277      480
22358      8205
           ... 
1468692    6368
17119      5525
92232      4709
350821     4023
252156      273
Name: Turnover, Length: 2169246, dtype: Int64

## 3. Quick look at our data

In [14]:
# To see the columns we have 

# Run this line if you wish to be able to visualize all the columns whenever you print out the dataframe

pd.set_option('max_columns', None)

print(list(df_AllSales))
#print(list(anything else you want to see))

df_AllSales.describe()
df_AllSales.info()
df_AllSales.isna().sum()

['InvoiceHeader', 'Channel', 'TransactionDate', 'TransactionDate_FYYYY', 'TransactionCategory', 'ClientID', 'AgeAtTransaction', 'Gender', 'PersonBirthDate', 'WeddingDate', 'SpokenLanguage', 'WrittenLanguage', 'FirstSalesDate', 'FirstTransactionDate', 'ProductCategory', 'ProductSubCategory', 'ProductFunction', 'Turnover', 'quantity', 'seq_sales_trs', 'nb_days_since_last_sale', 'PurchasedMarketA', 'PurchasedRegionA', 'ResidencyRegionA', 'ResidencyMarketA', 'BoutiqueNameA', 'ResidencyCountryA', 'nationalityA', 'ArticleA', 'ProductCollectionA', 'in_salesdataset', 'NationalityA']
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2169246 entries, 122626 to 252156
Data columns (total 32 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   InvoiceHeader            object 
 1   Channel                  object 
 2   TransactionDate          object 
 3   TransactionDate_FYYYY    int64  
 4   TransactionCategory      object 
 5   ClientID                 objec

InvoiceHeader                 6542
Channel                          0
TransactionDate                  0
TransactionDate_FYYYY            0
TransactionCategory              0
ClientID                         0
AgeAtTransaction            921604
Gender                        2549
PersonBirthDate             764663
WeddingDate                1948156
SpokenLanguage                2632
WrittenLanguage               2632
FirstSalesDate                1054
FirstTransactionDate           407
ProductCategory              35554
ProductSubCategory           35554
ProductFunction              35554
Turnover                     80314
quantity                         0
seq_sales_trs                    0
nb_days_since_last_sale          0
PurchasedMarketA                 0
PurchasedRegionA                 0
ResidencyRegionA                 0
ResidencyMarketA                 0
BoutiqueNameA               617078
ResidencyCountryA                6
nationalityA               1876584
ArticleA            

## 4. Generating the report

In [15]:
### We can generate a profile to obtain description of our datasets
### This can take some time if running report on large datasets
### We can run the report on our sample dataset

# Profile for Sales Sample

#Profile_AllSalesSample = ProfileReport(df_Allsales, title="Pandas Profiling Report Sales")

# Visualizing the report

#Report to iframe (will open report in notebook)

#Profile_AllSalesSample.to_notebook_iframe()

## Report to HTML (Will open up a new tab)
#Profile_AllSalesSample.to_file(output_file='All Sales Sample report')

## 5. Next we would like to create new columns in [Allsales] to identify wether clientID is present in [Calls], [LiveChat], [Clienteling] 

In [45]:
%%time
# #Defining =1 when ClientID is present in the corresponding datasets
df_AllSales = df_AllSales.assign(Calls=df_AllSales.ClientID.isin(df_Calls.ClientID).astype(int))
df_AllSales = df_AllSales.assign(Clienteling=df_AllSales.ClientID.isin(df_Clienteling.ClientID).astype(int))
df_AllSales = df_AllSales.assign(Livechat=df_AllSales.ClientID.isin(df_Livechat.ClientID).astype(int))
df_AllSales = df_AllSales.assign(Wishlist=df_AllSales.ClientID.isin(df_Wishlist.ClientID).astype(int))


print(df_AllSales.Calls.unique(),
      df_AllSales.Clienteling.unique(),
      df_AllSales.Livechat.unique(),
      df_AllSales.Wishlist.unique())

[0 1] [0 1] [0 1] [0 1]
Wall time: 4.46 s


In [20]:
## Useful if were going to filter on Sales dataset for nb_days > 1460 and then apply filter to corresponding clients in prev_sales

df_Sales = df_Sales.assign(Returning=df_Sales.ClientID.isin(df_PrevSales.ClientID).astype(int))

print(df_Sales.Returning.unique())

[1]


# 6. Create variable "repurchase_long" & "repurchage_exact" that will indicates if client repurchased after 4 years or more or after 4 years +/- 2 months

In [40]:
## Code
df_AllSales['repurchase_long']='0'
df_AllSales['repurchase_long'] = np.where(df_AllSales['nb_days_since_last_sale']>=1400 ,1,0)

print(df_AllSales['repurchase_long'].value_counts())

0    1461306
1     108394
Name: repurchase_long, dtype: int64


In [41]:

df_AllSales['repurchase_exact4']='0'
df_AllSales['repurchase_exact4'] = np.where((df_AllSales['nb_days_since_last_sale']>=1400) & (df_AllSales['nb_days_since_last_sale']<=1520) ,1,0)
print(df_AllSales['repurchase_exact4'].value_counts())

0    1556686
1      13014
Name: repurchase_exact4, dtype: int64


In [46]:
# Export new All_Sales dataframe to start new notebook for regressions

df_AllSales.to_csv('df_AllSalesLogit.csv')


## What kind of question can we answer with this data? (Brainstorm needed)
### Were the purchases made right after communication? 
#### Compare date of purchase with date of latest interaction
### Which type of communication caused the most repurchases ? 
#### Can we use turnover to evaluate which type of communication made the most revenues?

 
## Clienteling:
### What kind of activity category is most effective
### Make a ranking by activity category and type
### Any link between activity status and repurchase / sale amount? 

## Wishlist:
### Do clients with a wishlist repurchase after 4 years? 
### Any link between last modified date, created date and purchase date?

## All Sales
### Make a ranking of product categories repurchased after 4 years in order of magnitude
### Make another table with a column to identify if product repurchased is in same category, subcategory,  of first product bought
### Whats up with wedding dates / anniversary dates and repurchase?

## Calls


## Livechat



In [None]:
# Code

# 7. Where we're heading

# What kind of variables can we construct out of our dataset to include in the logistic regression

In [17]:
## New Column: Difference between wedding date and date of purchase
## Plot collections \ prod category with repurchase = 0, 1 ?
## Plot most of our variables before we include them in our regression 
## Create dummy variables allowing us to reduce dataset
        ## Create a number of products purchased by ClientID
## How can we use turnover to deduce meaningful insight
## Clean data for missing value in nb_days_Since_last_Sale NaNs... we can double check this if client is in both datasets

In [33]:
df_AllSales.describe()

Unnamed: 0,TransactionDate_FYYYY,AgeAtTransaction,Turnover,quantity,seq_sales_trs,nb_days_since_last_sale,PurchasedMarketA,PurchasedRegionA,ResidencyRegionA,ResidencyMarketA,BoutiqueNameA,ResidencyCountryA,nationalityA,ArticleA,ProductCollectionA,in_salesdataset,NationalityA,repurchase_long,repurchase_exact4
count,1111208.0,699135.0,1068670.0,1111208.0,1111208.0,816072.0,1111208.0,1111208.0,1111208.0,1111208.0,775260.0,1111202.0,211675.0,1110055.0,1108838.0,1111208.0,421226.0,1111208.0,1111208.0
mean,2020.199,39.575933,4119.382,1.01984,5.529539,570.784758,2.503876,2.174152,2.157427,2.682223,64.372781,53.59783,50.372151,206573.4,59.60033,0.3494269,62.491038,0.08085255,0.009498672
std,3.101523,11.062383,37139.46,1.923368,13.43516,874.900452,3.058985,1.120226,1.190743,3.918234,33.047327,46.98384,46.476591,211977.1,28.66219,0.4767892,41.059827,0.2726087,0.09699719
min,2010.0,18.0,-13705.43,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,0.0,2.0,0.0,0.0
25%,2019.0,31.0,273.2958,1.0,1.0,83.0,1.0,1.0,1.0,1.0,34.0,1.0,9.0,100901.0,48.0,0.0,39.0,0.0,0.0
50%,2022.0,38.0,1441.67,1.0,2.0,251.0,2.0,2.0,2.0,2.0,70.0,56.0,44.0,121111.0,57.0,0.0,44.0,0.0,0.0
75%,2022.0,46.0,4041.087,1.0,4.0,672.0,3.0,3.0,3.0,3.0,93.0,110.0,99.0,259367.0,78.0,1.0,99.0,0.0,0.0
max,2023.0,99.0,10867000.0,954.0,336.0,13441.0,34.0,9.0,11.0,32.0,119.0,176.0,191.0,784525.0,105.0,1.0,191.0,1.0,1.0


In [19]:
# Ideas for future

# Investigate variables deeper
# Link between Sale / Repair / Return ? If clients who ask for a return or repair end up rebuying, should Cartier contact
                                                                                                                #them?
    # Create 0-1 variable indicating if client got a repair \ return
    # Plot number of days since last sale
    
# Multiple sorting
# Reducing dataset
# Convert floats to int, datatypes, etc
# Create column that counts sales per client
# Create a column that computes average turnover per client
# As discussed by amber, some clients dont have n_days_since_last_sale >= 1440 but if we sum their n_days_since_last_

## Plotting and Statistics: 

In [None]:
# This filters the dataset

df_AllSalesPlot=df_AllSales[df_AllSales.nb_days_since_last_sale>1460]
print(df_AllSalesRepurchase)

## Logistic Regression

In [42]:
## Data Preparation

#data1=df_AllSales.drop(df_AllSales[(df_AllSales['TransactionCategory']!='Sale')].index
data1 = df_AllSales

In [126]:
#Collect all the needed raw data 
data2=data1[['ClientID','Gender','Channel','AgeAtTransaction','nb_days_since_last_sale','Turnover',
             'TransactionDate','WeddingDate','PersonBirthDate','Calls','Livechat','Clienteling','Wishlist']]
data2=pd.DataFrame(data2)

In [127]:
data2['nb_days_since_last_sale'].value_counts()

0.0        843178
1.0         16810
2.0         12668
3.0         10905
4.0         10271
            ...  
6926.0          1
6927.0          1
6934.0          1
6938.0          1
17486.0         1
Name: nb_days_since_last_sale, Length: 7440, dtype: int64

### Dummy variables creation

In [22]:
# 

#df_LogRegSales = df_AllSales['ClientID', 'Gender','age', 'Channel', 'Calls', 'Livechat', 'Clienteling', 'Wishlist', 'Wedding', 'Birthday']

# y = repurchase [0,1]

# X = ['Gender', 'Channel','Calls', 'Livechat', 'Clienteling', 'Wishlist', 'Wedding', '']

In [None]:
# Process Repurchase as dummy

dummy_repurchase=pd.get_dummies(data2['repurchase'],prefix='repurchase')
data2=data2.join(dummy_repurchase['repurchase_1'])

In [None]:
#Pre-process "Channel" as the second dummy indepedent variable
dummy_Channel=pd.get_dummies(data2['Channel'],prefix='Channel')
data2['Channel_Boutique']=dummy_Channel['Channel_Boutique']
data2['Channel_Web']=dummy_Channel['Channel_Web']
#print(data2)

In [None]:
#Pre-process "Age" as the third indepedent variable through decentralization, #what is the purpose of this intercept?
'''
data2['AgeAtTransaction'].describe()
k=4 
data2['Age_bins'] = pd.cut(data2['AgeAtTransaction'],k,labels=False)
#add constant
data2['intercept']=1.0
data2=data2.dropna(axis=0)
data2.head()
'''

In [None]:
#Recode all variables to calculate date intervals for Logit model (Might need to make it an integer)
# Wedding Date Variable
data2['WeddingYear'] = data2['WeddingDate'].dt.year
data2['WeddingMonth'] = data2['WeddingDate'].dt.month
data2['WeddingDay'] = data2['WeddingDate'].dt.day
# Transaction Date Var
data2['TrxYear'] = data2['TransactionDate'].dt.year
data2['TrxMonth'] = data2['TransactionDate'].dt.month
data2['TrxDay'] = data2['TransactionDate'].dt.day
# PersonBirth Date Var
data2['BirthYear'] = data2['PersonBirthDate'].dt.year
data2['BirthMonth'] = data2['PersonBirthDate'].dt.month
data2['BirthDay'] = data2['PersonBirthDate'].dt.day

In [44]:
df_onlyAllSales = df_AllSales.drop(df_AllSales[(df_AllSales['TransactionCategory']!='Sale')].index)
df_onlyAllSales['repurchase'].value_counts()

KeyError: 'repurchase'

In [None]:
data1=data1.dropna(subset=['nb_days_since_last_sale'])

In [40]:
data1['repurchase'].value_counts()

1    1119493
0     110029
Name: repurchase, dtype: int64

In [44]:
#filt = df_onlyAllSales[]
df_onlyAllSales['nb_days_since_last_sale'].value_counts()

1.0       16810
2.0       12668
0.0       12552
3.0       10905
4.0       10271
          ...  
6786.0        1
6787.0        1
6788.0        1
6789.0        1
8190.0        1
Name: nb_days_since_last_sale, Length: 7261, dtype: int64

In [70]:
#filt_nbnans = df_onlyAllSales['nb_days_since_last_sale'].isna()
df_onlyAllSales['nb_days_since_last_sale'].isna().value_counts()

False    1229522
True      491896
Name: nb_days_since_last_sale, dtype: int64

In [84]:
df_onlyAllSales[filt_nbnans == True]['InvoiceHeader'].value_counts().head(200)

120699000060061-GEMINI-/BIC/AZRIRBAPE00    26
120699000056756-GEMINI-/BIC/AZRIRBAPE00    23
XJP2540293-GEMINI-/BIC/AZRIRBAJP00         19
XJP104062-GEMINI-/BIC/AZRIRBAJP00          17
303710210206005-GEMINI-/BIC/AZRIRBARP00    16
                                           ..
000000201229382-GEMINI-/BIC/AZRIRBARP00     7
000000018839873-GEMINI-/BIC/AZRIRBARP00     7
000000203903255-GEMINI-/BIC/AZRIRBAAP00     7
000000202050477-GEMINI-/BIC/AZRIRBARP00     7
000000202309831-GEMINI-/BIC/AZRIRBARP00     7
Name: InvoiceHeader, Length: 200, dtype: int64

In [77]:
df_nbSalesNans = df_onlyAllSales[filt_nbnans == True]

In [83]:
df_nbSalesNans['ClientID'].value_counts()
df_nbSalesNans.sort_values('ClientID')


df_nbSalesNans[df_nbSalesNans.ClientID == "0011i00001OZyBhAAL"]

Unnamed: 0,InvoiceHeader,Channel,TransactionDate,TransactionDate_FYYYY,TransactionCategory,ClientID,AgeAtTransaction,Gender,PersonBirthDate,WeddingDate,SpokenLanguage,WrittenLanguage,FirstSalesDate,FirstTransactionDate,ProductCategory,ProductSubCategory,ProductFunction,Turnover,quantity,seq_sales_trs,nb_days_since_last_sale,PurchasedMarketA,PurchasedRegionA,ResidencyRegionA,ResidencyMarketA,BoutiqueNameA,ResidencyCountryA,nationalityA,ArticleA,Hier_Lev_3_txtA,Hier_Lev_4_txtA,Hier_Lev_5_txtA,ProductCollectionA,in_salesdataset,NationalityA,Calls,Clienteling,Livechat,Wishlist,repurchase
18802,120699000060061-GEMINI-/BIC/AZRIRBAPE00,Boutique,2022-02-18,2022,Sale,0011i00001OZyBhAAL,,Male,,,Chinese,Chinese,2022-02-18,2022-02-18,JEWELRY,BIJOUX,RING,1483.33,1,1,,4,4,6,16,53.0,112.0,,108490.0,5.0,157.0,539.0,57.0,0,44.0,0,0,0,0,0
18801,120699000060061-GEMINI-/BIC/AZRIRBAPE00,Boutique,2022-02-18,2022,Sale,0011i00001OZyBhAAL,,Male,,,Chinese,Chinese,2022-02-18,2022-02-18,JEWELRY,NJ,NECK./PEND.,4875.0,1,1,,4,4,6,16,53.0,112.0,,529671.0,61.0,36.0,242.0,13.0,0,44.0,0,0,0,0,0
1234,120699000060061-GEMINI-/BIC/AZRIRBAPE00,Boutique,2022-02-18,2022,Sale,0011i00001OZyBhAAL,,Male,,,Chinese,Chinese,2022-02-18,2022-02-18,JEWELRY,BIJOUX,RING,1108.33,1,1,,4,4,6,16,53.0,112.0,,100899.0,5.0,157.0,945.0,104.0,0,44.0,0,0,0,0,0
12994,120699000060061-GEMINI-/BIC/AZRIRBAPE00,Boutique,2022-02-18,2022,Sale,0011i00001OZyBhAAL,,Male,,,Chinese,Chinese,2022-02-18,2022-02-18,JEWELRY,BIJOUX,NECK./PEND.,2958.33,1,1,,4,4,6,16,53.0,112.0,,123153.0,5.0,43.0,137.0,74.0,0,44.0,0,0,0,0,0
4466,120699000060061-GEMINI-/BIC/AZRIRBAPE00,Boutique,2022-02-18,2022,Sale,0011i00001OZyBhAAL,,Male,,,Chinese,Chinese,2022-02-18,2022-02-18,JEWELRY,BIJOUX,RING,958.33,1,1,,4,4,6,16,53.0,112.0,,108634.0,5.0,157.0,539.0,57.0,0,44.0,0,0,0,0,0
17384,120699000060061-GEMINI-/BIC/AZRIRBAPE00,Boutique,2022-02-18,2022,Sale,0011i00001OZyBhAAL,,Male,,,Chinese,Chinese,2022-02-18,2022-02-18,JEWELRY,BIJOUX,BRACELET,5625.0,1,1,,4,4,6,16,53.0,112.0,,121084.0,5.0,157.0,539.0,57.0,0,44.0,0,0,0,0,0
117134,120699000060061-GEMINI-/BIC/AZRIRBAPE00,Boutique,2022-02-18,2022,Sale,0011i00001OZyBhAAL,,Male,,,Chinese,Chinese,2022-02-18,2022-02-18,JEWELRY,BIJOUX,RING,958.33,1,1,,4,4,6,16,53.0,112.0,,108622.0,5.0,157.0,539.0,57.0,0,44.0,0,0,0,0,0
132171,120699000060061-GEMINI-/BIC/AZRIRBAPE00,Boutique,2022-02-18,2022,Sale,0011i00001OZyBhAAL,,Male,,,Chinese,Chinese,2022-02-18,2022-02-18,JEWELRY,BIJOUX,NECK./PEND.,3583.33,1,1,,4,4,6,16,53.0,112.0,,122809.0,5.0,157.0,539.0,57.0,0,44.0,0,0,0,0,0
120067,120699000060061-GEMINI-/BIC/AZRIRBAPE00,Boutique,2022-02-18,2022,Sale,0011i00001OZyBhAAL,,Male,,,Chinese,Chinese,2022-02-18,2022-02-18,JEWELRY,BIJOUX,RING,1041.67,1,1,,4,4,6,16,53.0,112.0,,110942.0,5.0,157.0,945.0,104.0,0,44.0,0,0,0,0,0
21332,120699000060061-GEMINI-/BIC/AZRIRBAPE00,Boutique,2022-02-18,2022,Sale,0011i00001OZyBhAAL,,Male,,,Chinese,Chinese,2022-02-18,2022-02-18,JEWELRY,BIJOUX,RING,1483.33,1,1,,4,4,6,16,53.0,112.0,,108487.0,5.0,157.0,539.0,57.0,0,44.0,0,0,0,0,0


In [1]:
ClientID_grp = df_nbSalesNans.groupby(['ClientID'])

NameError: name 'df_nbSalesNans' is not defined

In [179]:
df_AllSales.Gender.unique()

array(['Male', 'Female', 'Couple', nan, 'Unknown'], dtype=object)

In [None]:
# Notes from meeting

# figure out why our logit model is wrong
# client model vs transaction model


# better plotting of interesting variables to build a story
# 