<h4>Required Libs</h4>

In [1]:
import pandas as pd 
import numpy as np
from datetime import datetime 
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import mean_absolute_error, mean_squared_error , r2_score ,accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import joblib
import json

<h4>Data Cleaning and processing</h4>

In [2]:
# Reading CSV
df = pd.read_csv('raw_data.csv',encoding = 'unicode_escape')
# Replacing empty CustomerID with nan
df['CustomerID'].replace('', np.nan, inplace=True)
# Replacing empty Description with nan
df['Description'].replace('', np.nan, inplace=True)
# droping all rows with NA
df.dropna()
# formatting date
df['InvoiceDate'] = df['InvoiceDate'].apply(lambda x : datetime.strptime(x, '%Y-%m-%d %H:%M:%S').date())
# Displaying df
df.head()

  df = pd.read_csv('raw_data.csv',encoding = 'unicode_escape')


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,discount
0,0,0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6.0,2022-04-16,2.55,17850.0,United Kingdom,0.7
1,1,1,536365,71053,WHITE METAL LANTERN,6.0,2022-07-10,3.39,17850.0,United Kingdom,1.0
2,2,2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8.0,2023-04-26,2.75,17850.0,United Kingdom,0.5
3,3,3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6.0,2022-08-22,3.39,17850.0,United Kingdom,0.8
4,4,4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6.0,2022-04-03,3.39,17850.0,United Kingdom,0.9


<h5>Data fomatting methods</h5>

In [3]:
def formatData(df):
    
  customer_eligibility_data = [] 
 
  for name, group in df:
    invoiceDates = list(set(group['InvoiceDate']))
    purchase_streek = 0

    for prevDate, currentDate in zip(invoiceDates, invoiceDates[1:]):
       invoice_date_diff = currentDate - prevDate 
       if(invoice_date_diff.days < 30):
            purchase_streek += 1

    purchase_total_amount = int(abs(sum(group['UnitPrice'])))


    customer_eligibility_data.append({ 
        "customerId" : name, 
        "description" : list(set(group['Description']))[0],
        "purchase_by_invoice" : len(set(group['InvoiceNo'])), 
        "purchase_total_amount" : purchase_total_amount, # add int
        "purchase_streek" : purchase_streek,
        "next_streek_purchase_amount" : (1 + purchase_streek) * 100,
        "remaining_amount" : abs(((1+purchase_streek) * 100) - purchase_total_amount),
        "is_eligible_customer" : 1 if (purchase_streek > 0 and int(abs(sum(group['UnitPrice'])) > 150)) else 0,
        "offer_eligibility" : sum(group['discount']) if(int(abs(sum(group['UnitPrice']))) < 150 and sum(group['discount']) > 10) else (purchase_streek * 2)
    })

  customer_eligibility_data += json.loads(pd.read_csv('combiner_data.csv').to_json(orient="records"))
  
  col = ['customerId','description','purchase_by_invoice','purchase_total_amount',
         'purchase_streek','next_streek_purchase_amount','remaining_amount','is_eligible_customer','offer_eligibility'
        ]

  df = pd.DataFrame(columns=col, data=customer_eligibility_data)
    
  return df

<h5>Data Formatting</h5>

In [4]:
df_grouped_by_customer_id = df.groupby('CustomerID')

df_purchase_count_based_offer_eligibity = formatData(df_grouped_by_customer_id)

display(df_purchase_count_based_offer_eligibity)

Unnamed: 0,customerId,description,purchase_by_invoice,purchase_total_amount,purchase_streek,next_streek_purchase_amount,remaining_amount,is_eligible_customer,offer_eligibility
0,12346.0,MEDIUM CERAMIC TOP STORAGE JAR,2,2,1,200,198,0,2.0
1,12347.0,BOOM BOX SPEAKER BOYS,7,481,83,8400,7919,1,166.0
2,12348.0,SWEETIES STICKERS,4,178,15,1600,1422,1,30.0
3,12349.0,POSTAGE,1,605,38,3900,3295,1,76.0
4,12350.0,POSTAGE,1,65,6,700,635,0,12.0
...,...,...,...,...,...,...,...,...,...
14372,11001.0,DOORMAT NEW ENGLAND,4,959,18,1900,941,1,36.0
14373,11002.0,JAM MAKING SET WITH JARS,3,290,3,400,110,1,6.0
14374,11003.0,RED COAT RACK PARIS FASHION,7,1504,15,1600,96,1,30.0
14375,11004.0,YELLOW COAT RACK PARIS FASHION,2,995,18,1900,905,1,36.0


<h5>Segregating Feature and Target parameters</h5>

In [5]:
## Learning parameters
feature_variables = df_purchase_count_based_offer_eligibity[['purchase_total_amount','purchase_streek','is_eligible_customer']]
## offer_eligibility as target parameter
target_variables = df_purchase_count_based_offer_eligibity[['offer_eligibility']]

<h5>Methods to Train / Generate / Predict and Determine Accuracy of classifiers (Decision Tree Classifier)</h5>

In [6]:
# Method to perform training with giniIndex.
def train_using_gini(X_train, X_test, y_train):
    # Creating the classifier object
    clf_gini = DecisionTreeClassifier(criterion = "gini", random_state = 100, max_depth=3, min_samples_leaf=5)
    # Performing training
    clf_gini.fit(X_train, y_train)
    # returning model
    return clf_gini
      
# Method to perform training with entropy and generate classifier model
def train_using_entropy(X_train, X_test, y_train):
    # Decision tree with entropy
    clf_entropy = DecisionTreeClassifier(criterion = "entropy", random_state = 100, max_depth = 3, min_samples_leaf = 5)
    # Performing training
    clf_entropy.fit(X_train, y_train)
    # returning model
    return clf_entropy

# Method to perform training with random forest classifier and generate model
def train_using_random_forest(x_train, x_test, y_train):
    model = RandomForestRegressor(n_estimators=500, random_state=42, min_samples_split=2, min_samples_leaf=1, max_depth=10, bootstrap=True)
    model.fit(x_train, y_train)
    return model

# Method to calculate accuracy of decision tree classifiers
def cal_accuracy_of_dt_classifier(classifier_name,y_test, y_pred):   
    #print("Confusion Matrix: ", confusion_matrix(y_test, y_pred))
    print(f"\n========================= {classifier_name} ==================================\n")
    print (f"Accuracy of {classifier_name} ", accuracy_score(y_test,y_pred)*100)  
    #print("Report : ", classification_report(y_test, y_pred))

# Method to calculate accuracy of random forest classifiers
def cal_accuracy_of_rf_classifier(classifier_name, x_test, x_train, y_test, y_pred, model):
    print(f"\n============================= {classifier_name} ==============================\n")
    print(f"Mean Absolute Error of {classifier_name}:", round(mean_absolute_error(y_test, y_pred), 4))
    print(f"Mean Squared Error of {classifier_name}:", round(mean_squared_error(y_test, y_pred), 4))
    print(f"Root Mean Squared Error of {classifier_name}:", round(np.sqrt(mean_squared_error(y_test, y_pred)), 4))
    print(f"(R^2) Score of {classifier_name}:", round(r2_score(y_test, y_pred), 4))
    print(f'Train Score of {classifier_name}: {model.score(x_train, y_train) * 100:.2f}% and Test Score : {model.score(x_test, y_test) * 100:.2f}% using Random Tree Regressor.')

# Method to make predictions
def prediction(X_test, clf_object):
    # Predicton on test with giniIndex
    y_pred = clf_object.predict(X_test)
    # returning predicted outputs
    return y_pred 

In [7]:
## Spliting dataset into training and test data with test data size as 0.4 and training data size as 0.6
x_train, x_test, y_train, y_test = train_test_split(feature_variables, target_variables, test_size=0.5, random_state=0)
# create classifier
clr_gini = train_using_gini(x_train, x_test, y_train)
y_pred = prediction(x_test, clr_gini)
cal_accuracy_of_dt_classifier("Gini classifier", y_test, y_pred)


ValueError: Unknown label type: 'continuous'

<h5>Custom data prediction test</h5>

In [8]:
customer_eligibility_test_data = [
    {
        "customerId" : "12345", 
        "purchase_by_invoice" : 50, 
        "purchase_total_amount" : 5000,
        "purchase_streek" : 100,
        "purchase_total_product_quantity" : 150,
        "is_eligible_customer" : 1
    }
]

test_df = pd.DataFrame(columns=['customerId','purchase_by_invoice','purchase_total_amount','purchase_streek','purchase_total_product_quantity','is_eligible_customer'], data=customer_eligibility_test_data)

test_df = test_df.drop(columns=['customerId'])

#test_df = scale.transform(test_df.values)

prediction(test_df, clf_entropy)

NameError: name 'clf_entropy' is not defined

In [9]:
## Spliting dataset into training and test data with test data size as 0.4 and training data size as 0.6
x_train, x_test, y_train, y_test = train_test_split(feature_variables.values, target_variables.values, test_size=0.5, random_state=0)

## Standardizing input
scale = StandardScaler()
x_train = scale.fit_transform(x_train)
x_test = scale.transform(x_test)

joblib.dump(scale, 'std_scaler.bin', compress=True)

## Generating classifier model with random forest
clf_rf = train_using_random_forest(x_train, x_test, y_train)
# Prediction using random forest
y_pred_rf = prediction(x_test, clf_rf)

# calculating efficieny of classifier generate with random forest
cal_accuracy_of_rf_classifier("Random Forest classifier", x_test, x_train, y_test, y_pred_rf, clf_rf)
## Saving decision tree classifier entropy 
joblib.dump(clf_rf, filename="random_forest_classifier.joblib")

  model.fit(x_train, y_train)




Mean Absolute Error of Random Forest classifier: 0.3172
Mean Squared Error of Random Forest classifier: 1.63
Root Mean Squared Error of Random Forest classifier: 1.2767
(R^2) Score of Random Forest classifier: 0.9994
Train Score of Random Forest classifier: 99.97% and Test Score : 99.94% using Random Tree Regressor.


['random_forest_classifier.joblib']

In [10]:
scale = joblib.load('std_scaler.bin')

customer_eligibility_test_data = [
    {
        "customerId" : "12345", 
        "purchase_by_invoice" : 18, 
        "purchase_total_amount" : 400,
        "purchase_streek" : 23,
        "purchase_total_product_quantity" : 10,
        "is_eligible_customer" : 1
    }
]

test_df = pd.DataFrame(columns=['customerId','purchase_by_invoice','purchase_total_amount','purchase_streek','purchase_total_product_quantity','is_eligible_customer'], data=customer_eligibility_test_data)

test_df = test_df.drop(columns=['customerId'])

test_df = scale.transform(test_df.values)

prediction(test_df, clf_rf)

ValueError: X has 5 features, but StandardScaler is expecting 3 features as input.