<h4>Required Libs</h4>

In [1]:
import pandas as pd 
import numpy as np
import datetime
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import mean_absolute_error, mean_squared_error , r2_score ,accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import joblib

<h4>Data cleaning and processing methods</h4>

In [2]:
# Method to process dd/mm/yyyy hh:mm date format to standard date as dd-mm-yyyy
def processDateTime(x):
    splitDate = list(map(int, x.split(' ')[0].split('/'))) 
    return datetime.date(splitDate[2], splitDate[0], splitDate[1])

<h4>Data Cleaning and processing</h4>

In [3]:
# Reading CSV
df = pd.read_csv('data.csv',encoding = 'unicode_escape')
# Replacing empty CustomerID with nan
df['CustomerID'].replace('', np.nan, inplace=True)
# Replacing empty Description with nan
df['Description'].replace('', np.nan, inplace=True)
# droping all rows with NA
df.dropna()
# formatting date
df['InvoiceDate'] = df['InvoiceDate'].apply(lambda x : processDateTime(x))
# Displaying df
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01,3.39,17850.0,United Kingdom


<h5>Data fomatting methods</h5>

In [20]:
## Method to get the dataframe groupedBy customerId and format the data
def formatData(df):
    
  customer_eligibility_data = [] 
 
  for name, group in df:
    invoiceDates = list(set(group['InvoiceDate']))
    purchase_streek = 0

    for prevDate, currentDate in zip(invoiceDates, invoiceDates[1:]):
       invoice_date_diff = currentDate - prevDate 
       if(invoice_date_diff.days < 15):
            purchase_streek += 1

    customer_eligibility_data.append({ 
        "customerId" : name, 
        "purchase_by_invoice" : len(set(group['InvoiceNo'])), 
        "purchase_total_amount" : abs(sum(group['UnitPrice'])),
        "purchase_streek" : purchase_streek,
        "purchase_total_product_quantity" : abs(sum(group['Quantity'])),
        "is_eligible_customer" : 1 if (purchase_streek > 0 and abs(sum(group['UnitPrice'])) > 150) else 0,
        "offer_eligibility" : purchase_streek * 2 if (purchase_streek > 0) else 0
    })

## and abs(sum(group['UnitPrice'])) > 150

  return pd.DataFrame(columns=['customerId','purchase_by_invoice','purchase_total_amount','purchase_streek','purchase_total_product_quantity','is_eligible_customer','offer_eligibility'], data=customer_eligibility_data)

<h5>Data Formatting</h5>

In [41]:
df_grouped_by_customer_id = df.groupby('CustomerID')

df_formatted = pd.read_csv('formatted_data.csv',encoding = 'unicode_escape')

df_purchase_count_based_offer_eligibity = formatData(df_grouped_by_customer_id)

df_purchase_count_based_offer_eligibity = pd.concat([df_purchase_count_based_offer_eligibity, df_formatted])

display(df_purchase_count_based_offer_eligibity)

Unnamed: 0,customerId,purchase_by_invoice,purchase_total_amount,purchase_streek,purchase_total_product_quantity,is_eligible_customer,offer_eligibility
0,12346.0,2,2.08,0,0,0,0
1,12347.0,7,481.21,3,2458,1,6
2,12348.0,4,178.71,1,2341,1,2
3,12349.0,1,605.10,0,631,0,0
4,12350.0,1,65.30,0,197,0,0
...,...,...,...,...,...,...,...
993,cuts1994,29,1413.00,199,26,1,398
994,cuts1995,27,995.00,199,22,1,398
995,cuts1996,15,885.00,200,25,1,400
996,cuts1997,21,1187.00,200,15,1,400


<h5>Segregating Feature and Target parameters</h5>

In [42]:
## Learning parameters
feature_variables = df_purchase_count_based_offer_eligibity[['purchase_by_invoice','purchase_total_amount','purchase_streek', 'purchase_total_product_quantity','is_eligible_customer']]
## offer_eligibility as target parameter
target_variables = df_purchase_count_based_offer_eligibity[['offer_eligibility']]

<h5>Using chi2 test to select k best features for training model</h5>

In [9]:
best_features= SelectKBest(score_func=chi2, k=2)
fit= best_features.fit(feature_variables,target_variables)

df_scores= pd.DataFrame(fit.scores_)
df_columns= pd.DataFrame(feature_variables.columns)

features_scores= pd.concat([df_columns, df_scores], axis=1)
features_scores.columns= ['Features', 'Score']
features_scores.sort_values(by = 'Score')

Unnamed: 0,Features,Score
4,is_eligible_customer,1680.701
0,purchase_by_invoice,58936.18
2,purchase_streek,89191.39
1,purchase_total_amount,5606084.0
3,purchase_total_product_quantity,15307350.0


<h5>Methods to Train / Generate / Predict and Determine Accuracy of classifiers (Decision Tree Classifier)</h5>

In [71]:
# Method to perform training with giniIndex.
def train_using_gini(X_train, X_test, y_train):
    # Creating the classifier object
    clf_gini = DecisionTreeClassifier(criterion = "gini", random_state = 100, max_depth=3, min_samples_leaf=5)
    # Performing training
    clf_gini.fit(X_train, y_train)
    # returning model
    return clf_gini
      
# Method to perform training with entropy and generate classifier model
def train_using_entropy(X_train, X_test, y_train):
    # Decision tree with entropy
    clf_entropy = DecisionTreeClassifier(criterion = "entropy", random_state = 100, max_depth = 3, min_samples_leaf = 5)
    # Performing training
    clf_entropy.fit(X_train, y_train)
    # returning model
    return clf_entropy

# Method to perform training with random forest classifier and generate model
def train_using_random_forest(x_train, x_test, y_train):
    model = RandomForestRegressor(n_estimators=500, random_state=42, min_samples_split=2, min_samples_leaf=1, max_depth=10, bootstrap=True)
    model.fit(x_train, y_train)
    return model

# Method to calculate accuracy of decision tree classifiers
def cal_accuracy_of_dt_classifier(classifier_name,y_test, y_pred):   
    #print("Confusion Matrix: ", confusion_matrix(y_test, y_pred))
    print(f"\n========================= {classifier_name} ==================================\n")
    print (f"Accuracy of {classifier_name} ", accuracy_score(y_test,y_pred)*100)  
    #print("Report : ", classification_report(y_test, y_pred))

# Method to calculate accuracy of random forest classifiers
def cal_accuracy_of_rf_classifier(classifier_name, x_test, x_train, y_test, y_pred, model):
    print(f"\n============================= {classifier_name} ==============================\n")
    print(f"Mean Absolute Error of {classifier_name}:", round(mean_absolute_error(y_test, y_pred), 4))
    print(f"Mean Squared Error of {classifier_name}:", round(mean_squared_error(y_test, y_pred), 4))
    print(f"Root Mean Squared Error of {classifier_name}:", round(np.sqrt(mean_squared_error(y_test, y_pred)), 4))
    print(f"(R^2) Score of {classifier_name}:", round(r2_score(y_test, y_pred), 4))
    print(f'Train Score of {classifier_name}: {model.score(x_train, y_train) * 100:.2f}% and Test Score : {model.score(x_test, y_test) * 100:.2f}% using Random Tree Regressor.')

# Method to make predictions
def prediction(X_test, clf_object):
    # Predicton on test with giniIndex
    y_pred = clf_object.predict(X_test)
    # returning predicted outputs
    return y_pred 

<h5>Training and generating models and determining accuracy</h5>

In [68]:
''' 
         Decision tree classfier with Gini and Entropy
'''

## Spliting dataset into training and test data with test data size as 0.4 and training data size as 0.6
X_train,X_test,y_train,y_test=train_test_split(feature_variables,target_variables,test_size=0.8,random_state=100)

## Generating classifier model with gini
clf_gini = train_using_gini(X_train, X_test, y_train)
# Prediction using gini
y_pred_gini = prediction(X_test, clf_gini)
# calculating efficieny of classifier generated with gini
cal_accuracy_of_dt_classifier("Gini classifier",y_test, y_pred_gini)
## Saving decision tree classifier gini 
joblib.dump(clf_gini, filename="decision_tree_gini_classifier.joblib")

## Generating classifier model with entropy
clf_entropy = train_using_entropy(X_train, X_test, y_train)
# Prediction using entropy
y_pred_entropy = prediction(X_test, clf_entropy)
# calculating efficieny of classifier generate with entropy
cal_accuracy_of_dt_classifier("Entropy classifier",y_test, y_pred_entropy)
## Saving decision tree classifier entropy
joblib.dump(clf_entropy, filename="decision_tree_entropy_classifier.joblib")



Accuracy of Gini classifier  69.71601489757914


Accuracy of Entropy classifier  71.34543761638734


['decision_tree_entropy_classifier.joblib']

<h5>Custom data prediction test</h5>

In [69]:
customer_eligibility_test_data = [
    {
        "customerId" : "12345", 
        "purchase_by_invoice" : 50, 
        "purchase_total_amount" : 5000,
        "purchase_streek" : 100,
        "purchase_total_product_quantity" : 150,
        "is_eligible_customer" : 1
    }
]

test_df = pd.DataFrame(columns=['customerId','purchase_by_invoice','purchase_total_amount','purchase_streek','purchase_total_product_quantity','is_eligible_customer'], data=customer_eligibility_test_data)

test_df = test_df.drop(columns=['customerId'])

#test_df = scale.transform(test_df.values)

prediction(test_df, clf_entropy)

array([234])

In [None]:
df = df_grouped_by_customer_id.get_group(18283.0)
price_spend_by_date = {}

for x in list(set(df['InvoiceDate'])):
   price_spend_by_date[x] = sum(list(df.loc[df['InvoiceDate'] == x, 'UnitPrice']))

price_spend_by_date





{datetime.date(2011, 1, 6): 100.94999999999996,
 datetime.date(2011, 11, 30): 85.07999999999997,
 datetime.date(2011, 4, 21): 82.82,
 datetime.date(2011, 12, 6): 65.37999999999997,
 datetime.date(2011, 9, 5): 65.89,
 datetime.date(2011, 7, 14): 92.75999999999999,
 datetime.date(2011, 5, 23): 75.23999999999998,
 datetime.date(2011, 11, 10): 105.65,
 datetime.date(2011, 1, 23): 83.25000000000001,
 datetime.date(2011, 6, 14): 49.029999999999994,
 datetime.date(2011, 11, 23): 128.02999999999992,
 datetime.date(2011, 6, 23): 132.82,
 datetime.date(2011, 10, 27): 83.48,
 datetime.date(2011, 2, 28): 70.55000000000003}

In [72]:
## Spliting dataset into training and test data with test data size as 0.4 and training data size as 0.6
x_train, x_test, y_train, y_test = train_test_split(feature_variables.values, target_variables.values, test_size=0.5, random_state=0)

## Standardizing input
scale = StandardScaler()
x_train = scale.fit_transform(x_train)
x_test = scale.transform(x_test)

joblib.dump(scale, 'std_scaler.bin', compress=True)

## Generating classifier model with random forest
clf_rf = train_using_random_forest(x_train, x_test, y_train)
# Prediction using random forest
y_pred_rf = prediction(x_test, clf_rf)

# calculating efficieny of classifier generate with random forest
cal_accuracy_of_rf_classifier("Random Forest classifier", x_test, x_train, y_test, y_pred_rf, clf_rf)
## Saving decision tree classifier entropy 
joblib.dump(clf_rf, filename="random_forest_classifier.joblib")

  model.fit(x_train, y_train)




Mean Absolute Error of Random Forest classifier: 0.0698
Mean Squared Error of Random Forest classifier: 0.0543
Root Mean Squared Error of Random Forest classifier: 0.2329
(R^2) Score of Random Forest classifier: 1.0
Train Score of Random Forest classifier: 100.00% and Test Score : 100.00% using Random Tree Regressor.


['random_forest_classifier.joblib']

In [84]:
scale = joblib.load('std_scaler.bin')

customer_eligibility_test_data = [
    {
        "customerId" : "12345", 
        "purchase_by_invoice" : 18, 
        "purchase_total_amount" : 400,
        "purchase_streek" : 23,
        "purchase_total_product_quantity" : 10,
        "is_eligible_customer" : 1
    }
]

test_df = pd.DataFrame(columns=['customerId','purchase_by_invoice','purchase_total_amount','purchase_streek','purchase_total_product_quantity','is_eligible_customer'], data=customer_eligibility_test_data)

test_df = test_df.drop(columns=['customerId'])

test_df = scale.transform(test_df.values)

prediction(test_df, clf_rf)

array([46.188])