In [1]:
import pandas as pd
import numpy as np

### Load in Customer Data

In [245]:
customers = pd.read_csv('data/MFG-customers.csv')
customers = customers.loc[:, ['CustomerID', 'City', 'State', 'Country']]
customers['Location'] = customers['State'].fillna(customers['Country']).apply(lambda x: str(x).upper())
customers['City'] = customers['City'].apply(lambda x: str(x).upper())
customers.drop(columns=['State', 'Country'], inplace=True)

### Load in Accounts Data

In [238]:
accounts = pd.read_csv('data/MFG-accounts.csv', usecols=['Account_ID', 'Revenue_Class'])

### Load in Invoices Data

In [78]:
invoices = pd.read_csv('data/MFG-invoices.csv', parse_dates=True)
invoices['Date'] = invoices['Date'].apply(pd.Timestamp)
invoices['Month'] = invoices['Date'].apply(lambda x: x.month)
invoices['Year'] = invoices['Date'].apply(lambda x: x.year)
invoices['Day'] = invoices['Date'].apply(lambda x: x.day)


### Load in Industry + Press Release Data

In [83]:
industry = pd.read_csv('data/MFG-industry.csv')
press_release = pd.read_csv('data/Press_Release.csv')

### Create Target Variable

In [222]:
n_days = 90
min_diff_days = 3
repeat=[]
for invoice in invoices.iterrows():
    repeat.append((
        (invoice[1]['CustID'] == invoices['CustID']) 
        & (invoice[1]['Date'] < invoices['Date']) 
        & (invoices['Date'] - invoice[1]['Date'] < pd.Timedelta(n_days, 'd'))
        & (invoices['Date'] - invoice[1]['Date'] > pd.Timedelta(min_diff_days, 'd'))
    ).any())  

### Feature Engineering

In [223]:
first_invoice_date = invoices.groupby('CustID')['Date'].min()
customers['First_Invoice'] = first_invoice_date[customers['CustomerID']].values

# days_since_last = invoices.groupby('CustID').apply(lambda x: np.diff(pd.Series.sort_values(x['Date']).values))


### Merge Data

In [246]:
X = invoices.merge(customers, how='left', left_on=['CustID'], right_on=['CustomerID'])
X = X.merge(accounts, how='left', on=['Account_ID'])
X.drop(columns=['CustID', 'CustomerID'], inplace=True)
y = pd.Series(repeat, dtype=int)

In [247]:
cutoff_date = X['Date'].max() - pd.Timedelta(n_days, 'd')

Since we don't have enough data after this date, we can't include it in our training set.

In [248]:
y = y[X['Date'] < cutoff_date]
X = X[X['Date'] < cutoff_date]

### Split Data

In [249]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [250]:
X_train, X_test, y_train, y_test = train_test_split(X, y)


In [260]:
def clean_nan(X):
    X['Item_ID'] = X['Item_ID'].astype(str)
    X['Revenue_Class'] = X['Revenue_Class'].astype(str)
    X.dropna(inplace=True)
    return X

In [261]:
clean_nan(X_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,Date,Qty,Unit_Price,Item_ID,Account_ID,Amount,Month,Year,Day,City,Location,Revenue_Class
8343,2017-11-08,1,250,RP-RTV-1,200-4042,-250,11,2017,8,ATTLEBORO,MA,Molding
1770,2016-01-20,1,80,RP-FDM-1,200-4011,-80,1,2016,20,HIGLEY,AZ,3D Printing
8256,2017-10-30,1,115,RP-SLS-1,200-4050,-115,10,2017,30,MESA,AZ,3D Printing
3398,2016-07-29,12,20,RP-OBJ-1,200-4055,-240,7,2016,29,PHOENIX,AZ,3D Printing
1644,2016-01-06,100,86,RP-1859701,200-4041,-8600.00,1,2016,6,DUSSELDORF,GERMANY,Molding
671,2015-09-24,2,30,RP-SLA-1,200-4040,-60,9,2015,24,MILWAUKEE,WI,3D Printing
8101,2017-10-19,250,1.32,RP-C10049,200-4041,-330,10,2017,19,IRVINE,CA,Molding
11995,2018-11-30,2,30,RP-SLA-1,200-4040,-60,11,2018,30,PHOENIX,AZ,3D Printing
12094,2018-11-30,1,70,RP-SLA-1,200-4040,-70,11,2018,30,MESA,AZ,3D Printing
5694,2017-03-06,1,115,RP-OBJ-1,200-4055,-115,3,2017,6,TEMPE,AZ,3D Printing


In [263]:
encoder = OneHotEncoder(categories=['Item_ID', 'Month', 'Year', 
                                    'Location', 'Revenue_Class',
                                    'City', 'Item_ID', 'Account_ID'],
                        handle_unknown='ignore')
encoder.fit_transform(X_train)

Automatic pdb calling has been turned OFF


ValueError: Shape mismatch: if n_values is an array, it has to be of shape (n_features,).

In [220]:
X['Revenue_Class'].value_counts(dropna=False)

3D Printing      9904
Molding          1025
Freight           691
Scanning          536
Other             163
Consulting         74
Machine Sales       2
NaN                 1
Name: Revenue_Class, dtype: int64