<h4>Required Libs</h4>

In [42]:
import pandas as pd 
import numpy as np
import datetime
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import joblib

<h4>Data cleaning and processing methods</h4>

In [43]:
# Method to process dd/mm/yyyy hh:mm date format to standard date as dd-mm-yyyy
def processDateTime(x):
    splitDate = list(map(int, x.split(' ')[0].split('/'))) 
    return datetime.date(splitDate[2], splitDate[0], splitDate[1])

<h4>Data Cleaning and processing</h4>

In [44]:
# Reading CSV
df = pd.read_csv('data.csv',encoding = 'unicode_escape')
# Replacing empty CustomerID with nan
df['CustomerID'].replace('', np.nan, inplace=True)
# Replacing empty Description with nan
df['Description'].replace('', np.nan, inplace=True)
# droping all rows with NA
df.dropna()
# formatting date
df['InvoiceDate'] = df['InvoiceDate'].apply(lambda x : processDateTime(x))
# Displaying df
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01,3.39,17850.0,United Kingdom


<h5>Data fomatting methods</h5>

In [45]:
## Method to get the dataframe groupedBy customerId and format the data
def formatData(df):
    
  customer_eligibility_data = [] 
 
  for name, group in df:
    invoiceDates = list(set(group['InvoiceDate']))
    purchase_streek = 0

    for prevDate, currentDate in zip(invoiceDates, invoiceDates[1:]):
       invoice_date_diff = currentDate - prevDate 
       if(invoice_date_diff.days < 15):
            purchase_streek += 1

    customer_eligibility_data.append({ 
        "customerId" : name, 
        "purchase_by_invoice" : len(set(group['InvoiceNo'])), 
        "purchase_total_amount" : abs(sum(group['UnitPrice'])),
        "purchase_streek" : purchase_streek,
        "purchase_total_product_quantity" : abs(sum(group['Quantity'])),
        "is_eligible_customer" : 1 if (purchase_streek > 0 and abs(sum(group['UnitPrice'])) > 150) else 0,
        "offer_eligibility" : purchase_streek * 2 if (purchase_streek > 0 and abs(sum(group['UnitPrice'])) > 150) else 0
    })

  return pd.DataFrame(columns=['customerId','purchase_by_invoice','purchase_total_amount','purchase_streek','purchase_total_product_quantity','is_eligible_customer','offer_eligibility'], data=customer_eligibility_data)

<h5>Data Formatting</h5>

In [46]:
df_grouped_by_customer_id = df.groupby('CustomerID')

df_purchase_count_based_offer_eligibity = formatData(df_grouped_by_customer_id)

display(df_purchase_count_based_offer_eligibity)

Unnamed: 0,customerId,purchase_by_invoice,purchase_total_amount,purchase_streek,purchase_total_product_quantity,is_eligible_customer,offer_eligibility
0,12346.0,2,2.08,0,0,0,0
1,12347.0,7,481.21,4,2458,1,8
2,12348.0,4,178.71,2,2341,1,4
3,12349.0,1,605.10,0,631,0,0
4,12350.0,1,65.30,0,197,0,0
...,...,...,...,...,...,...,...
4367,18280.0,1,47.65,0,45,0,0
4368,18281.0,1,39.36,0,54,0,0
4369,18282.0,3,62.68,2,98,0,0
4370,18283.0,16,1220.93,6,1397,1,12


<h5>Using chi2 test to select k best features for training model</h5>

In [47]:
X = df_purchase_count_based_offer_eligibity.iloc[:,1:5]   
Y = df_purchase_count_based_offer_eligibity.iloc[:,-1]

best_features= SelectKBest(score_func=chi2, k=2)
fit= best_features.fit(X,Y)

df_scores= pd.DataFrame(fit.scores_)
df_columns= pd.DataFrame(X.columns)

features_scores= pd.concat([df_columns, df_scores], axis=1)
features_scores.columns= ['Features', 'Score']
features_scores.sort_values(by = 'Score')

Unnamed: 0,Features,Score
2,purchase_streek,27404.76
0,purchase_by_invoice,71019.99
1,purchase_total_amount,7575166.0
3,purchase_total_product_quantity,61603570.0


<h5>Segregating Feature and Target parameters</h5>

In [48]:
## Learning parameters
feature_variables = df_purchase_count_based_offer_eligibity[['purchase_by_invoice','purchase_total_amount','purchase_streek', 'purchase_total_product_quantity','is_eligible_customer']]
## offer_eligibility as target parameter
target_variables = df_purchase_count_based_offer_eligibity[['offer_eligibility']]

<h5>Training and generating classifiers (Decision Tree Classifier)</h5>

In [49]:
## Spliting dataset into training and test data with test data size as 0.4 and training data size as 0.6
X_train,X_test,y_train,y_test=train_test_split(feature_variables,target_variables,test_size=0.4,random_state=100)

# Function to perform training with giniIndex.
def train_using_gini(X_train, X_test, y_train):
  
    # Creating the classifier object
    clf_gini = DecisionTreeClassifier(criterion = "gini",
            random_state = 100,max_depth=3, min_samples_leaf=5)
  
    # Performing training
    clf_gini.fit(X_train, y_train)

    return clf_gini
      
# Function to perform training with entropy and generate classifier model
def train_using_entropy(X_train, X_test, y_train):
  
    # Decision tree with entropy
    clf_entropy = DecisionTreeClassifier(
            criterion = "entropy", random_state = 100,
            max_depth = 3, min_samples_leaf = 5)
  
    # Performing training
    clf_entropy.fit(X_train, y_train)

    return clf_entropy
      
## Generating classifier model with gini
clf_gini = train_using_gini(X_train, X_test, y_train)

## Generating classifier model with entropy
clf_entropy = train_using_entropy(X_train, X_test, y_train)


<h5>Calculating prediction accuracy with generated classifier</h5>

In [50]:
# Function to calculate accuracy
def cal_accuracy(classifier_name,y_test, y_pred):   
    #print("Confusion Matrix: ", confusion_matrix(y_test, y_pred))
    print (f"Accuracy of {classifier_name} ", accuracy_score(y_test,y_pred)*100)  
    #print("Report : ", classification_report(y_test, y_pred))

# Function to make predictions
def prediction(X_test, clf_object):
    # Predicton on test with giniIndex
    y_pred = clf_object.predict(X_test)
    # returning predicted outputs
    return y_pred

# Prediction using gini
y_pred_gini = prediction(X_test, clf_gini)
# calculating efficieny of classifier generated with gini
cal_accuracy("Gini classifier",y_test, y_pred_gini)


# Prediction using entropy
y_pred_entropy = prediction(X_test, clf_entropy)
# calculating efficieny of classifier generate with entropy
cal_accuracy("Entropy classifier",y_test, y_pred_entropy)



Accuracy of Gini classifier  85.70611778158947
Accuracy of Entropy classifier  88.33619210977702


<h5>Save Trained Model</h5>

In [51]:
## Saving decision tree classifier gini 
joblib.dump(clf_gini, filename="decision_tree_gini_classifier.joblib")
## Saving decision tree classifier entropy
joblib.dump(clf_entropy, filename="decision_tree_entropy_classifier.joblib")

['decision_tree_entropy_classifier.joblib']

<h5>Custom data prediction test</h5>

In [10]:
customer_eligibility_test_data = [
    {
        "customerId" : "12345", 
        "purchase_by_invoice" : 2, 
        "purchase_total_amount" : 170,
        "purchase_streek" : 2,
        "purchase_total_product_quantity" : 10,
        "is_eligible_customer" : 1
    },
    {
        "customerId" : "12345", 
        "purchase_by_invoice" : 2, 
        "purchase_total_amount" : 170,
        "purchase_streek" : 2,
        "purchase_total_product_quantity" : 10,
        "is_eligible_customer" : 1
    },
    {
        "customerId" : "12345", 
        "purchase_by_invoice" : 2, 
        "purchase_total_amount" : 170,
        "purchase_streek" : 2,
        "purchase_total_product_quantity" : 10,
        "is_eligible_customer" : 1
    },
    {
        "customerId" : "12345", 
        "purchase_by_invoice" : 2, 
        "purchase_total_amount" : 170,
        "purchase_streek" : 2,
        "purchase_total_product_quantity" : 10,
        "is_eligible_customer" : 1
    }
]

test_df = pd.DataFrame(columns=['customerId','purchase_by_invoice','purchase_total_amount','purchase_streek','purchase_total_product_quantity','is_eligible_customer'], data=customer_eligibility_test_data)

test_df = test_df.drop(columns=['customerId'])

prediction(test_df, clf_entropy)

array([4, 4, 4, 4])

In [22]:
df = df_grouped_by_customer_id.get_group(18283.0)
price_spend_by_date = {}

for x in list(set(df['InvoiceDate'])):
   price_spend_by_date[x] = sum(list(df.loc[df['InvoiceDate'] == x, 'UnitPrice']))

price_spend_by_date





{datetime.date(2011, 1, 6): 100.94999999999996,
 datetime.date(2011, 4, 21): 82.82,
 datetime.date(2011, 6, 14): 49.029999999999994,
 datetime.date(2011, 7, 14): 92.75999999999999,
 datetime.date(2011, 11, 10): 105.65,
 datetime.date(2011, 11, 23): 128.02999999999992,
 datetime.date(2011, 2, 28): 70.55000000000003,
 datetime.date(2011, 9, 5): 65.89,
 datetime.date(2011, 11, 30): 85.07999999999997,
 datetime.date(2011, 1, 23): 83.25000000000001,
 datetime.date(2011, 10, 27): 83.48,
 datetime.date(2011, 12, 6): 65.37999999999997,
 datetime.date(2011, 6, 23): 132.82,
 datetime.date(2011, 5, 23): 75.23999999999998}

<h5>Training and generating classifoers (Random Forest classifier)</h5>

In [35]:
## Learning parameters
X = df_purchase_count_based_offer_eligibity[['purchase_by_invoice','purchase_total_amount','purchase_streek', 'purchase_total_product_quantity','is_eligible_customer']]
## offer_eligibility as target parameter
Y = df_purchase_count_based_offer_eligibity[['offer_eligibility']]

X = df_purchase_count_based_offer_eligibity.iloc[:,1:5].values   
Y = df_purchase_count_based_offer_eligibity.iloc[:,-1].values

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

scale = StandardScaler()
x_train = scale.fit_transform(x_train)
x_test = scale.transform(x_test)

model = RandomForestRegressor(n_estimators=500, random_state=42, min_samples_split=2, min_samples_leaf=1, max_depth=10, bootstrap=True)
model.fit(x_train, y_train)
predict = model.predict(x_test)
print(predict)

[6. 0. 0. ... 0. 0. 0.]


[6 0 0 ... 0 0 0]


In [37]:
print("Mean Absolute Error:", round(metrics.mean_absolute_error(y_test, predict), 4))
print("Mean Squared Error:", round(metrics.mean_squared_error(y_test, predict), 4))
print("Root Mean Squared Error:", round(np.sqrt(metrics.mean_squared_error(y_test, predict)), 4))
print("(R^2) Score:", round(metrics.r2_score(y_test, predict), 4))
print(f'Train Score : {model.score(x_train, y_train) * 100:.2f}% and Test Score : {model.score(x_test, y_test) * 100:.2f}% using Random Tree Regressor.')
errors = abs(predict - y_test)
mape = 100 * (errors / y_test)
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.') 

Mean Absolute Error: 0.0785
Mean Squared Error: 1.8843
Root Mean Squared Error: 1.3727
(R^2) Score: 0.9706
Train Score : 99.88% and Test Score : 97.06% using Random Tree Regressor.
Accuracy: nan %.


  mape = 100 * (errors / y_test)
  mape = 100 * (errors / y_test)
