In [1]:

import os, types
import pandas as pd
from botocore.client import Config
import ibm_boto3

def __iter__(self): return 0

# @hidden_cell
# The following code accesses a file in your IBM Cloud Object Storage. It includes your credentials.
# You might want to remove those credentials before you share the notebook.

if os.environ.get('RUNTIME_ENV_LOCATION_TYPE') == 'external':
    endpoint_9e53905096384939a3b02eee24cc81a3 = 'https://s3.us.cloud-object-storage.appdomain.cloud'
else:
    endpoint_9e53905096384939a3b02eee24cc81a3 = 'https://s3.private.us.cloud-object-storage.appdomain.cloud'

client_9e53905096384939a3b02eee24cc81a3 = ibm_boto3.client(service_name='s3',
    ibm_api_key_id='Your api key',
    ibm_auth_endpoint="https://iam.cloud.ibm.com/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url=endpoint_9e53905096384939a3b02eee24cc81a3)

body = client_9e53905096384939a3b02eee24cc81a3.get_object(Bucket='jupiterenergypaymentplancampaign-donotdelete-pr-4mtst9zqeqmfqs',Key='product_recommendation.csv.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

product_recommendation_data = pd.read_csv(body)
product_recommendation_data.head()

Index(['username', 'community_id', 'location', 'carbon_credit',
       'carbon_medals', 'comment_products', 'product_id', 'farm_id',
       'plantername', 'product_name', 'category', 'price',
       'carbon_credit_needed', 'number', 'sale_number', 'identifications',
       'carbon_emission', 'donate_amount', 'create_time', 'modify_time',
       'recommend'],
      dtype='object')


In [2]:
import pandas.api.types as tp
import numpy as np

'''
only select province/state from location data
'''
def preprocessing_location(csv_data):
    for i, row in csv_data.iterrows():
        user_location = row['location']
        state = user_location.split('/')[-1]
        csv_data.at[i, 'location'] =  state
    return csv_data

'''
count all category values for a column 
'''
def aggregate_data(csv_data, header):
    res = set()
    for i, row in csv_data.iterrows():
        data = row[header]
        if type(data) == float and pd.isna(data):
            continue
        category = data.split('/')
        for c in category:
            res.add(c)
        row[header] = category
    res = sorted(list(set(res)))
    return res, csv_data

'''
count location value frequency
'''
def aggregate_location(csv_data, header):
    res = {}
    count = 0
    for i, row in csv_data.iterrows():
        data = row[header]
        if data not in res.keys():
            res[data] = count
            count += 1
    return res

'''
only select the first six number for date time
'''
def preprocess_date(csv_data):
    for i, row in csv_data.iterrows():
        data = row['create_time']
        modify_time = row['modify_time']
        data = data[0:8]
        modify_time = modify_time[0:8]
        csv_data.at[i, 'create_time'] = data
        csv_data.at[i, 'modify_time'] = modify_time
    return csv_data

In [3]:

csv_data = preprocessing_location(product_recommendation_data)
location_dictionary = aggregate_location(csv_data, 'location')
community, csv_data = aggregate_data(csv_data, 'community_id')
comment_product, csv_data = aggregate_data(csv_data, 'comment_products')
medals, csv_data = aggregate_data(csv_data, 'carbon_medals')
products, csv_data = aggregate_data(csv_data, 'product_name')
category_dictionary = aggregate_location(csv_data, 'category')
identifications, csv_data = aggregate_data(csv_data, 'identifications')
csv_data = preprocess_date(csv_data)
product_dictionary = aggregate_location(csv_data, 'product_name')

In [4]:
'''
one_hot_encoding algorithm
'''
def one_hot_encoding(list_data, aggregate_data):
    encoding = ""
    if type(list_data) == float and pd.isna(list_data):
        return encoding
    for element in aggregate_data:
        if element in list_data:
            encoding = encoding + '1'
        else:
            encoding = encoding + '0'
    return int(encoding)

'''
apply one-hot-encoding to given column
'''
def apply_encoding(csv_data, header, aggregate):
    for i, row in csv_data.iterrows():
        raw = row[header]
        encoding = one_hot_encoding(raw, aggregate)
        csv_data.at[i, header] = encoding
    return csv_data

'''
apply one-hot-encoding to location column
'''
def apply_location_encoding(csv_data):
    for i, row in csv_data.iterrows():
        location = row['location']
        csv_data.at[i, 'location'] = location_dictionary[location]
    return csv_data

'''
apply one-hot-encoding to category column
'''
def apply_category_encoding(csv_data):
    for i, row in csv_data.iterrows():
        category = row['category']
        csv_data.at[i, 'category'] = category_dictionary[category]
    return csv_data

'''
standardize numerical variable
'''
def standardlize(csv_data, header):
    mean = csv_data[header].mean()
    sd = csv_data[header].std()
    for i, row in csv_data.iterrows():
        rent = row[header]
        standard = (rent - mean) / sd
        csv_data.at[i, header] = standard
    return csv_data

'''
fill missing values with mode
'''
def fill_with_mode(csv_data, community_mode, medals_mode, comment_mode, location_mode):
    for i, row in csv_data.iterrows():
        if row['community_id'] == '':
            csv_data.at[i, 'community_id'] = community_mode
        if row['carbon_medals'] == '':
            csv_data.at[i, 'carbon_medals'] = medals_mode
        if row['comment_products'] == '':
            csv_data.at[i, 'comment_products'] = comment_mode
        if row['location'] == '':
            csv_data.at[i, 'location'] = location_mode
    return csv_data

'''
apply one-hot-encoding to product column
'''
def apply_identification_encoding(csv_data):
    for i, row in csv_data.iterrows():
        category = row['identifications']
        csv_data.at[i, 'identifications'] = identification_dictionary[category]
    return csv_data

'''
apply one-hot-encoding to product column
'''
def apply_product_encoding(csv_data):
    for i, row in csv_data.iterrows():
        category = row['product_name']
        csv_data.at[i, 'product_name'] = product_dictionary[category]
    return csv_data

In [5]:
csv_data = apply_encoding(csv_data, 'community_id', community) 
csv_data = apply_encoding(csv_data, 'carbon_medals', medals)
csv_data = apply_encoding(csv_data, 'comment_products', comment_product)
csv_data = apply_encoding(csv_data, 'identifications', identifications)
csv_data = apply_location_encoding(csv_data)
csv_data = apply_category_encoding(csv_data)
csv_data = apply_product_encoding(csv_data)

In [7]:
community_mode = 0
medals_mode = 0
comment_mode = 0
location_mode = csv_data['location'].mode()
csv_data = fill_with_mode(csv_data, community_mode, medals_mode, comment_mode, location_mode)
csv_data['community_id'] = csv_data['community_id'].astype(int)
csv_data['location'] = csv_data['location'].astype(int)
csv_data['comment_products'] = csv_data['comment_products'].astype(int)
csv_data['carbon_medals'] = csv_data['carbon_medals'].astype(int)
csv_data['create_time'] = csv_data['create_time'].astype(int)
csv_data['modify_time'] = csv_data['modify_time'].astype(int)

csv_data['product_name'] = csv_data['product_name'].astype(int)
csv_data['category'] = csv_data['category'].astype(int)
csv_data['identifications'] = csv_data['identifications'].astype(int)

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
def train_model(csv_data, test_size, random_state):
    Y = csv_data['recommend']
    csv_data = csv_data.drop(['recommend', 'username', 'plantername'], axis = 1)
    X_train, X_test, y_train, y_test = train_test_split(csv_data, Y, test_size = test_size, random_state = random_state)
    model = XGBClassifier(objective = 'binary:logitraw')
    model.fit(X_train, y_train)
    return model, X_train
model, X_train = train_model(csv_data, 0.33, 7)

In [23]:
wml_credentials = {
    "apikey": 'Your api key',
    "url": "https://us-south.ml.cloud.ibm.com"
    }

In [11]:
# create client to access our WML service
from ibm_watson_machine_learning import APIClient
def save_model(model):
    wml_credentials = {
    "apikey": 'Your api key',
    "url": "https://us-south.ml.cloud.ibm.com"
    }
    client = APIClient(wml_credentials)
    software_spec_uid = client.software_specifications.get_id_by_name("default_py3.7")
    space_id = 'Your space id'
    client.set.default_space(space_id)
    metadata = {
                client.repository.ModelMetaNames.NAME: 'Gradient Boosting model to predict product recommendation score',
                client.repository.ModelMetaNames.TYPE: 'scikit-learn_0.23',
                client.repository.ModelMetaNames.SOFTWARE_SPEC_UID: software_spec_uid
    }
    published_model = client.repository.store_model(
        model=model,
        meta_props=metadata)
    return published_model

In [None]:
published_model = save_model(model)

In [31]:
'''
aggregate on product and user purchase relationship
'''
def get_user_farm_dict(gf_rent):
    user_product_dic = {}
    product_dic = {}
    product_sequence = 0
    for i, row in gf_rent.iterrows():
        product = row['product_id']
        user = row['username']
        if user not in user_product_dic.keys():
            user_product_dic[user] = []
        user_product_dic[user].append(product)
        if product not in product_dic.keys():
            product_dic[product] = product_sequence
            product_sequence = product_sequence + 1
    return user_product_dic, product_dic

def get_user_matrix(user_product_dic, gf_order_data, products):
    num_user = len(user_product_dic.keys())
    num_product = len(products.keys())
    user_sequence = 0
    user_dic = {}
    user_product_matrix = np.ndarray(shape = (num_user, num_product), dtype = int)
    user_product_matrix.fill(0)
    for user in user_product_dic.keys():
        for f in user_product_dic[user]:
            product_number = products[f]
            user_product_matrix[user_sequence][product_number] = 1
        user_dic[user] = user_sequence
        user_sequence = user_sequence + 1
    return user_product_matrix, user_dic

'''
calculate cosine similarity
'''
def calculate_similarity(user_product_matrix):
    return sklearn.metrics.pairwise.cosine_similarity(user_product_matrix)

'''
calculate user similarity
'''
def get_similar_user(given_user, user_product_matrix, user_dic):
    user = user_dic[given_user]
    user_matrix = user_product_matrix[user]
    sim_dic = {}
    similarity_score = calculate_similarity(user_product_matrix)[user]
    for i in range(0, len(similarity_score)):
        if i != user:
            score = similarity_score[i]
            if score not in sim_dic:
                sim_dic[score] = []
            sim_dic[score].append(i)
    sorted_similarity_score = sorted(sim_dic.keys(), reverse = True)
    similar_user = []
    for i in range(0, min(3, len(sorted_similarity_score))):
        score = sorted_similarity_score[i]
        for j in range(0, len(sim_dic[score])):
            similar_user.append(sim_dic[score][j])
    return similar_user

'''
recall set of products
'''
def get_recall_set(user_farm_matrix, similar_user, given_user, user_dic):
    product_not_buy = []
    user_number = user_dic[given_user]
    for user in similar_user:
        user_matrix = user_farm_matrix[user]
        for i in range(0, len(user_matrix)):
            purchase = user_matrix[i]
            user_purchase= user_farm_matrix[user_number][i]
            if user_purchase == 0 and purchase == 1:
                product_not_buy.append(i)
    return product_not_buy 
'''
check if the given user has purchase history
'''
def check_user_purchase(gf_order, given_user):
    user_order = gf_order[gf_order['username'] == given_user]
    #need to check if the cell is NA/null/empty string
    if not user_order['order_id'].empty:
        return True
    return False

In [73]:

body = client_9e53905096384939a3b02eee24cc81a3.get_object(Bucket='jupiterenergypaymentplancampaign-donotdelete-pr-4mtst9zqeqmfqs',Key='gf_order.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

gf_order = pd.read_csv(body)
gf_order.head()


Unnamed: 0,order_id,username,product_id,address,money,carbon_credit,create_time,modify_time,remark
0,1234,feho,15,New York/NY,200,121,2021062811,2021062813,
1,5678,kct,52,San Jose/CA,600,200,2021062812,2021062816,
2,8372,beq,55,Paris/France,200,60,2021062812,2021062816,
3,8373,beq,15,Paris/France,100,100,2021062818,2021062820,


In [74]:
import sklearn
user_product_dic, products = get_user_farm_dict(gf_order)
user_product_matrix, user_dic = get_user_matrix(user_product_dic, gf_order, products)
similar_user = get_similar_user('feho', user_product_matrix, user_dic)
product_not_purchase = get_recall_set(user_product_matrix, similar_user, 'feho', user_dic)

pd_index = []
for index in product_not_purchase:
    pd_index.append(gf_order.iloc[index])
product_predict_set = pd.DataFrame(pd_index, columns = gf_order.columns)

In [75]:
def product_id_index(gf_order):
    dic = {}
    for i, row in gf_order.iterrows():
        dic[i] = row['product_id']
    return dic
product_id_index = product_id_index(gf_order)

In [76]:
test_user = {'username': 'aaa', 'community_id': 'c1/c2', 'location': 'New York/NY', 'carbon_credit': '589', 'carbon_medals': 'm1/m2', 'comment_products': 'p1'}
test_user = pd.DataFrame(test_user, index = [0])

def change_index(csv_data1, product_not_purchase) :
    csv_data = csv_data1.rename(mapper = {csv_data1.index.values[0]: product_not_purchase}, axis = 0)
    return csv_data

In [77]:
index = product_not_purchase[0]
csv_data1 = change_index(test_user, index)
product = pd.DataFrame([product_predict_set.loc[index]], columns = gf_order.columns)
product_id = product['product_id']
recommendation = csv_data.loc[csv_data['product_id'] == int(product_id), 'product_id']
df = csv_data[csv_data['product_id'].isin(recommendation)]
df = change_index(df, index)
df = df.drop(['username', 'community_id', 'location', 'carbon_credit', 'carbon_medals', 'comment_products'], axis = 1)
concate_rows = pd.concat([csv_data1, df], axis = 1)

In [79]:
csv_data1 = preprocessing_location(concate_rows)
location_dictionary = aggregate_location(csv_data1, 'location')
community, csv_data1 = aggregate_data(csv_data1, 'community_id')
comment_product, csv_data1 = aggregate_data(csv_data1, 'comment_products')
medals, csv_data1 = aggregate_data(csv_data1, 'carbon_medals')
category_dictionary = aggregate_location(csv_data1, 'category')

In [81]:
csv_data1 = apply_encoding(csv_data1, 'community_id', community) 
csv_data1 = apply_encoding(csv_data1, 'carbon_medals', medals)
csv_data1 = apply_encoding(csv_data1, 'comment_products', comment_product)
csv_data1 = apply_location_encoding(csv_data1)

In [83]:
def int_type(csv_data):
    community_mode = 0
    medals_mode = 0
    comment_mode = 0
    location_mode = csv_data['location'].mode()
    csv_data = fill_with_mode(csv_data, community_mode, medals_mode, comment_mode, location_mode)
    csv_data['community_id'] = csv_data['community_id'].astype(int)
    csv_data['location'] = csv_data['location'].astype(int)
    csv_data['comment_products'] = csv_data['comment_products'].astype(int)
    csv_data['carbon_medals'] = csv_data['carbon_medals'].astype(int)
    csv_data['carbon_credit'] = csv_data['carbon_credit'].astype(int)
    csv_data['create_time'] = csv_data['create_time'].astype(int)
    csv_data['modify_time'] = csv_data['modify_time'].astype(int)

    csv_data['product_name'] = csv_data['product_name'].astype(int)
    csv_data['category'] = csv_data['category'].astype(int)
    csv_data['identifications'] = csv_data['identifications'].astype(int)
    return csv_data

In [84]:
csv_data1 = int_type(csv_data1)

In [85]:
csv_data1 = csv_data1.drop(['username', 'plantername', 'recommend'], axis = 1)

In [86]:
prediction = model.predict_proba(csv_data1)


[[ 1.1496055  -0.14960556]]


In [87]:
def sort_farm(prediction, farm_not_rent):
    farm_score = {}
    for i in range(0, len(prediction)):
        pred_score = prediction[i]
        if pred_score[1] > pred_score[0]:
            farm_score[farm_not_rent[i]] = pred_score[1]
    sorted_farm_score = [key for key, value in sorted(farm_score.items(), key=lambda i: i[1], reverse=True)]
    return sorted_farm_score

sorted_farm_score = sort_farm(prediction, product_not_purchase)
def farm_recommendation(sorted_farm_score):
    count = 0
    recommended_farm = []
    for farm in sorted_farm_score:
        if count < 2:
            recommended_farm.append(product_id_index[farm])
            count += 1
    return recommended_farm
def feature():
    im=pd.DataFrame({'importance':model.feature_importances_})
    im=im.sort_values(by='importance',ascending=False)
    feature_index = im.index[0:3]
    features = [X_train.columns[x] for x in feature_index]
    return features