# Needs-based recommendation system
### Supervised learning problem which accounts for knowledge of financial products and customers' needs.

In [53]:
# imports

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [36]:
# data loading and merging

# Load the client-needs and product workbooks.
clients = pd.read_excel('Needs.xlsx')
products = pd.read_excel('Products.xlsx')

# Pair every client row with every product row (Cartesian product).
# A cross merge yields the same rows and column order as joining on a
# constant helper column, but never adds that helper column to `clients`
# or `products`, so both input frames are left untouched.
df = pd.merge(clients, products, how='cross')

# Inferring labels from data
### A product is suggested to a client (flag = 1, 0 otherwise) if it satisfies the following conditions:
### 1. the income/accumulation propensity of the client and the type of product match;
### 2. the synthetic risk indicator of the product is less than or equal to the risk propensity of the client.

In [37]:
# add the flag variable

# The investment need of the client matches the product type:
# income need with Type 0, or accumulation need with Type 1.
need_matches_type = (
    ((df['IncomeInvestment'] == 1) & (df['Type'] == 0))
    | ((df['AccumulationInvestment'] == 1) & (df['Type'] == 1))
)

# The product risk does not exceed the client's risk propensity.
risk_is_acceptable = df['RiskPropensity'] >= df['Risk']

# Vectorised labelling: 1 where both conditions hold, 0 otherwise.
# This replaces a row-by-row loop and produces the same integer column.
df['flag'] = (need_matches_type & risk_is_acceptable).astype('int64')

# Logistic Regression
### Train set has 80% of the data, test set has 20% of the data.

In [38]:
# data split (train and test sets)

# Features are every column except the two ID columns and the target.
X = df.drop(columns=['ID', 'IDProduct', 'flag'])
y = df['flag']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [39]:
# model optimization (grid search for parameters)

# Candidate values for the regularisation parameter C.
param_grid = {'C': [0.005, 0.01, 0.015, 0.02, 0.025]}

# 5-fold cross-validated search over the candidates, on the training set only.
grid_search = GridSearchCV(
    estimator=LogisticRegression(max_iter=10000),
    param_grid=param_grid,
    cv=5,
)
grid_search.fit(X_train, y_train)

# Keep the winning parameters and the corresponding estimator.
best_params, best_model = grid_search.best_params_, grid_search.best_estimator_

In [40]:
# evaluation of the best model

# Score the grid-search winner on the held-out test set.
y_pred_best = best_model.predict(X_test)
accuracy_best = accuracy_score(y_true=y_test, y_pred=y_pred_best)

print(f"Best Model Parameters: {best_params} \nBest Model Accuracy: {accuracy_best}")

Best Model Parameters: {'C': 0.015} 
Best Model Accuracy: 0.8949090909090909


In [41]:
# model training

# Train the final model with the C selected by the grid search rather than a
# hand-copied literal, so this cell stays in sync if the data or the grid changes.
model = LogisticRegression(C=best_params['C'], max_iter=10000)
model.fit(X_train, y_train)

In [42]:
# model evaluation

# Predict on the held-out test set and report overall accuracy.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

# Per-class precision, recall and F1.
print(classification_report(y_true=y_test, y_pred=y_pred))

Accuracy: 0.8949090909090909
              precision    recall  f1-score   support

           0       0.90      0.98      0.94      8845
           1       0.87      0.54      0.67      2155

    accuracy                           0.89     11000
   macro avg       0.88      0.76      0.80     11000
weighted avg       0.89      0.89      0.89     11000



# Clients evaluation

In [57]:
# model for clients suggestions

def suggestions(age, gender, family_members, financial_education, risk_propensity, income, wealth, income_investment, accumulation_investment):
    """Predict, for a single client, which products the trained model flags.

    The client is described by the nine arguments, which become the columns
    ``Age``, ``Gender``, ``FamilyMembers``, ``FinancialEducation``,
    ``RiskPropensity``, ``Income``, ``Wealth``, ``IncomeInvestment`` and
    ``AccumulationInvestment`` (in that order). The client row is paired with
    every row of the module-level ``products`` frame, and the module-level
    ``model`` predicts a flag for each pair.

    Returns:
        A DataFrame with one row per product and two columns: ``IDProduct``
        (taken from ``products``) and ``flag`` (the model's prediction).

    Notes:
        ``products`` is only read, never modified.
    """
    client = pd.DataFrame({
        'Age': [age],
        'Gender': [gender],
        'FamilyMembers': [family_members],
        'FinancialEducation': [financial_education],
        'RiskPropensity': [risk_propensity],
        'Income': [income],
        'Wealth': [wealth],
        'IncomeInvestment': [income_investment],
        'AccumulationInvestment': [accumulation_investment],
    })

    # Cross merge pairs the single client row with every product without
    # adding a temporary join column to the shared `products` frame.
    data = pd.merge(client, products, how='cross')

    # The model receives the client columns followed by the product columns,
    # with the product identifier removed.
    predicted_flags = model.predict(data.drop(columns='IDProduct'))

    return pd.DataFrame({'IDProduct': products['IDProduct'], 'flag': predicted_flags})

In [58]:
# new client suggestion

age = 24
gender = 0
family_members = 4
financial_education = 0.8
risk_propensity = 0.45
income = 10
wealth = 3
income_investment = 0
accumulation_investment = 1

suggestions(age, gender, family_members, financial_education, risk_propensity, income, wealth, income_investment, accumulation_investment)

Unnamed: 0,IDProduct,flag
0,1,0
1,2,0
2,3,1
3,4,0
4,5,0
5,6,0
6,7,0
7,8,0
8,9,0
9,10,0


# Next steps
### Many other models could be tried; moreover, the following things could be improved:
### 1. the creation of the complete dataframe (it makes all the subsequent steps needlessly long);
### 2. the way flags are created (more features should be involved);
### 3. the choice of variables for the logistic regression (transformations and possibly removals).