In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from math import *
from datetime import datetime, timedelta, date
from dateutil import relativedelta as rd

In [2]:
# Load the raw orders export: semicolon-delimited, comma used as decimal mark,
# order date parsed straight into datetime64
orders = pd.read_csv(
    "orders_20190822.csv",
    sep=";",
    decimal=",",
    parse_dates=["o_date"],
)

In [3]:
# Column names as they come from the file
orders.columns.tolist()

['id_o', 'user_id', 'price', 'o_date']

In [4]:
# Shorten the raw column names; 'price' keeps its name
orders = orders.rename(
    columns={
        'id_o': 'id',
        'user_id': 'user',
        'o_date': 'date',
    }
)
orders.columns.tolist()

['id', 'user', 'price', 'date']

In [5]:
# Print row count, per-column dtypes and memory usage of the loaded frame
orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2002804 entries, 0 to 2002803
Data columns (total 4 columns):
id       int64
user     int64
price    float64
date     datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(2)
memory usage: 61.1 MB


In [6]:
# Most recent order date in the dataset
orders.date.max()

Timestamp('2017-12-31 00:00:00')

In [7]:
# Largest single order value
orders.price.max()

819096.6

In [8]:
# Earliest order date in the dataset
orders.date.min()

Timestamp('2016-01-01 00:00:00')

In [9]:
# Smallest single order value (can be negative in the raw data)
orders.price.min()

-184.8

In [10]:
# Make sure 'date' is datetime64 (read_csv already parses it, so this is a safeguard),
# then drop fake orders: anything priced at 99 or less, or at 300000 or more.
# The mask is written as a negation on purpose: rows with a missing price fail
# both comparisons and are therefore kept.
orders['date'] = pd.to_datetime(orders['date'])
orders = orders.loc[~((orders['price'] <= 99) | (orders['price'] >= 300000))]

In [11]:
# Two-column view of the cleaned orders: order value and order date
price_date = orders.loc[:, ["price", "date"]]
price_date.head(5)

Unnamed: 0,price,date
0,539.0,2016-01-01
1,153.3,2016-01-01
3,752.5,2016-01-01
4,4410.0,2016-01-01
5,595.0,2016-01-01


In [12]:
# Forecast configuration
period_pred = (  # first and last day of the period (used as inclusive bounds)
    datetime(2017, 12, 1),
    datetime(2017, 12, 31),
)
days_lost = timedelta(days=180)  # a user with no orders for this many days is considered lost
CC_pred = 0  # planned turnover
order_price_mean = orders.price.mean()  # mean order value over the cleaned orders

In [13]:
# Actual turnover: sum of order values dated within the period (bounds inclusive)
FACT = orders.loc[
    orders['date'].between(period_pred[0], period_pred[1]),
    'price',
].sum()
FACT

322926463.3

In [14]:
# Active users who are not considered lost: at least 3 orders and a latest order
# more recent than `days_lost` before the start of the period.
# Per-user aggregates are broadcast back onto the rows with groupby().transform(),
# which is vectorized. groupby().filter() with a Python lambda calls the lambda once
# per user and is very slow on a frame of this size; the selected rows (and their
# order) are the same.
# NOTE(review): the order count and latest date are computed over all of `orders`,
# which appears to include the period itself - confirm this look-ahead is intended.
active_users = orders[
    (orders.groupby('user')['id'].transform('count') >= 3)
    & (orders.groupby('user')['date'].transform('max') > period_pred[0] - days_lost)
]

In [None]:
# Users who were active but stopped buying: at least 3 orders, with the latest one
# on or before `days_lost` ahead of the start of the period.
# Uses vectorized groupby().transform() masks instead of groupby().filter() with a
# per-user Python lambda; the selected rows (and their order) are the same, but the
# computation is far faster on a large frame.
lost_users = orders[
    (orders.groupby('user')['id'].transform('count') >= 3)
    & (orders.groupby('user')['date'].transform('max') <= period_pred[0] - days_lost)
]

In [None]:
# Users who made only 1 order.
# The per-user order count is broadcast to every row with groupby().transform(),
# avoiding a Python-level lambda call per user; the resulting rows are identical
# to what groupby().filter() would return.
one_users = orders[orders.groupby('user')['id'].transform('count') == 1]

In [None]:
# Users who made only 2 orders.
# The per-user order count is broadcast to every row with groupby().transform(),
# avoiding a Python-level lambda call per user; the resulting rows are identical
# to what groupby().filter() would return.
two_users = orders[orders.groupby('user')['id'].transform('count') == 2]

In [None]:
# Sanity check: the four user segments together must account for every order row
sum(len(segment) for segment in (one_users, two_users, active_users, lost_users)) == len(orders)