In [1]:
# load in packages
from itertools import combinations

from test_results import test_results, score
import numpy as np
import pandas as pd


from sklearn.ensemble import GradientBoostingClassifier
from sklearn.utils import resample

# read the training set from disk, then preview its first rows
train_data = pd.read_csv(filepath_or_buffer='./data/training.csv')
train_data.head(n=5)

Unnamed: 0,ID,Promotion,purchase,V1,V2,V3,V4,V5,V6,V7
0,1,No,0,2,30.443518,-1.165083,1,1,3,2
1,3,No,0,3,32.15935,-0.645617,2,3,2,2
2,4,No,0,2,30.431659,0.133583,1,1,4,2
3,5,No,0,0,26.588914,-0.212728,2,1,4,2
4,8,Yes,0,3,28.044332,-0.385883,1,1,2,2


In [2]:
# number of rows for every (Promotion, purchase) combination
(
    train_data
    .groupby(by=['Promotion', 'purchase'])
    .size()
)

Promotion  purchase
No         0           41851
           1             319
Yes        0           41643
           1             721
dtype: int64

In [3]:
# overall class balance of the target column
train_data['purchase'].value_counts()

0    83494
1     1040
Name: purchase, dtype: int64

In [4]:
def upsample(df, column, random_state=42):
    """Balance a binary target by up-sampling the minority class.

    Up-sample Minority Class approach from Elite Data Science:
    https://elitedatascience.com/imbalanced-classes

    Rows where ``df[column] == 0`` are treated as the majority class and rows
    where ``df[column] == 1`` as the minority class. Minority rows are drawn
    with replacement until there are as many of them as majority rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the binary target column.
    column : str
        Name of the target column holding the values 0 and 1.
    random_state : int, optional
        Seed forwarded to ``resample``. The default (42) is the value that
        was previously hard-coded, so existing calls give the same sample.

    Returns
    -------
    pandas.DataFrame
        All majority rows followed by the re-sampled minority rows. The index
        is not reset, so repeated minority rows share index labels.

    Raises
    ------
    ValueError
        If ``df`` has no rows for one of the two classes.
    """
    # Separate majority and minority classes
    df_majority = df[df[column] == 0]
    df_minority = df[df[column] == 1]

    # Fail early with a clear message instead of an opaque lookup/sampling
    # error further down when one class is missing entirely.
    if df_majority.empty or df_minority.empty:
        raise ValueError(
            "column {!r} must contain rows for both class 0 and class 1".format(column)
        )

    # Upsample minority class up to the size of the majority class
    df_minority_upsampled = resample(df_minority,
                                     replace=True,
                                     n_samples=len(df_majority),
                                     random_state=random_state)

    # Combine majority class with upsampled minority class
    return pd.concat([df_majority, df_minority_upsampled])

# build the balanced training frame and confirm both classes are now equal in size
df_upsampled = upsample(df=train_data, column='purchase')
df_upsampled['purchase'].value_counts()

1    83494
0    83494
Name: purchase, dtype: int64

In [5]:
# Feature columns: everything after the first three columns of train_data
# (the printed list shows exactly which names this selects).
labels = train_data.columns[3:].tolist()

print("labels: {}".format(labels))


# separate features from target variable
X_upsampled = df_upsampled[labels].values
y_upsampled = df_upsampled['purchase'].values

# initialize the model; a fixed seed keeps the fit reproducible when the
# notebook is re-run from a fresh kernel
model = GradientBoostingClassifier(random_state=42)

# fit the model (kept as an assignment so the cell does not echo the estimator)
model = model.fit(X_upsampled, y_upsampled)

labels: ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7']


In [6]:
def promotion_strategy(df):
    '''
    Decide, per individual, whether to send the promotion.

    INPUT
    df - a dataframe with *only* the columns V1 - V7 (same as train_data)

    OUTPUT
    promotion - np.array with the values 'Yes' or 'No' indicating whether
                or not an individual should receive a promotion;
                its length is df.shape[0]

    Ex:
    INPUT: df

    V1   V2   V3     V4   V5   V6   V7
    2    30   -1.1   1    1    3    2
    3    32   -0.6   2    3    2    2
    2    30   0.13   1    1    4    2

    OUTPUT: promotion

    array(['Yes', 'Yes', 'No'])
    indicating the first two users would receive the promotion and
    the last should not.

    Relies on the module-level ``model`` fitted in an earlier cell.
    '''

    # The model was fitted on a plain ndarray (``.values``), so hand it an
    # ndarray here too; passing the DataFrame itself makes newer scikit-learn
    # versions warn about feature names that were absent at fit time.
    features = np.asarray(df)

    # Predicted class 1 (purchase) -> send the promotion, class 0 -> do not.
    predicted = model.predict(features)
    promotion = pd.Series(predicted).map({0: 'No', 1: 'Yes'}).values

    return promotion

In [7]:
# Evaluate the strategy with the imported test_results helper; per the output
# recorded below, it prints the irr and nir metrics and returns them as a tuple.
test_results(promotion_strategy)

Nice job!  See how well your strategy worked on our test data below!

Your irr with this strategy is 0.02.

Your nir with this strategy is 441.40.
Approximately, the highest scores obtained at Udacity were: irr of 0.1 and an nir of 300.

 How did you do?


(0.02028405881158077, 441.4000000000001)