In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay,confusion_matrix
import pickle 
import random

pd.set_option('display.max_column', None)
pd.set_option('display.max_rows',100)
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('FinalDataFix.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,age,expenses,financial_status,experience,estimated_period,unwilling_toAccept,scenario,sum_scores,salary
0,0,2,4,2,3,2,2,4,3,97036796
1,1,3,3,2,4,3,4,4,4,84735152
2,2,4,3,4,6,4,1,3,4,5201451
3,3,3,4,4,5,4,1,1,4,22683827
4,4,4,1,1,2,2,4,4,3,3340289


In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,age,expenses,financial_status,experience,estimated_period,unwilling_toAccept,scenario,sum_scores,salary
0,0,2,4,2,3,2,2,4,3,97036796
1,1,3,3,2,4,3,4,4,4,84735152
2,2,4,3,4,6,4,1,3,4,5201451
3,3,3,4,4,5,4,1,1,4,22683827
4,4,4,1,1,2,2,4,4,3,3340289


## **DATA CLEANING AND MANIPULATION**

In [5]:
df.drop(columns='Unnamed: 0',inplace=True)

### **CHANGING INTEGER BASED INPUT INTO STRING DESCRIPTIONS**

#### **AGE**

In [6]:
# df["age"] = (df["age"]
#             .replace(1,"Over 55 years")
#             .replace(2,"45-55 years")
#             .replace(3,"35-44 years")
#             .replace(4,"Under 35 years")
#             )

#### **EXPENSES**

In [7]:
# df["expenses"] = (df["expenses"]
#             .replace(1,"Over 75% of total income")
#             .replace(2,"Between 50– 75% of total income")
#             .replace(3,"Between 25– 50% of total income")
#             .replace(4,"Less than 25% of total income")
#             )

#### **FINANCIAL_STATUS**

In [8]:
# df["financial_status"] = (df["financial_status"]
#             .replace(1,"Less assets than debts")
#             .replace(2,"Assets equal to debts")
#             .replace(3,"More assets than debts")
#             .replace(4,"I will have adequate savings/investments throughout my retirement")
#             )

#### **EXPERIENCE**

In [9]:
# df["experience"] = (df["experience"]
#             .replace(1,"Bank Deposit")
#             .replace(2,"Government Bond or Government Bond Fund")
#             .replace(3,"Debentures or Fixed Income Fund")
#             .replace(4,"Equity Stock or Equity Fund or other High-Risk Assets")
#             )

#### **ESTIMATED_PERIOD**

In [10]:
# df["estimated_period"] = (df["estimated_period"]
#             .replace(1,"Less than 1 year")
#             .replace(2,"1 – 3 years")
#             .replace(3,"3 – 5 years")
#             .replace(4,"More than 5 years")
#             )

#### **UNWILLING_TOACCEPT**

In [11]:
# df["unwilling_toAccept"] = (df["unwilling_toAccept"]
#             .replace(1,"5% or less")
#             .replace(2,"Around 5% - 10%")
#             .replace(3,"Around 10% - 20%")
#             .replace(4,"Over 20%")
#             )

#### **SCENARIO**

In [12]:
# df["scenario"] = (df["scenario"]
#             .replace(1,"Scenario 1: returns of perhaps 2.5%, and 0% loss")
#             .replace(2,"Scenario 2: maximum returns of perhaps 7%, and 1% possible loss")
#             .replace(3,"Scenario 3: maximum returns of perhaps 15%, and 5% possible loss")
#             .replace(4,"Scenario 4: maximum returns of perhaps 25%, and 15% possible loss")
#             )

In [13]:
df.duplicated().any()

False

## **DATA SPLITTING**

In [14]:
#Splitting inference data
inference = df.sample(2)
inference

Unnamed: 0,age,expenses,financial_status,experience,estimated_period,unwilling_toAccept,scenario,sum_scores,salary
653,1,4,4,4,2,4,2,3,85372300
1863,2,3,4,5,1,2,3,3,36496303


In [15]:
#Dropping inferenced data
df = df.drop(inference.index, axis = 0)

In [16]:
X = df.drop('sum_scores',axis=1)
y = df['sum_scores']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [17]:
print(f'X_train shape : ',X_train.shape)
print(f'X_test shape : ',X_test.shape)
print(f'y_train shape : ',y_train.shape)
print(f'y_test shape : ',y_test.shape)

X_train shape :  (7498, 8)
X_test shape :  (2500, 8)
y_train shape :  (7498,)
y_test shape :  (2500,)


## **PREPROCESSING PIPELINE**

In [18]:
#Scaling on salary column
num_pipeline = Pipeline([
    ('scaler', StandardScaler())])

num_tr = num_pipeline.fit_transform(X_train[['salary']])

In [19]:
# # Ordinal encoding for the rest
# age_ord = ['Over 55 years', '45-55 years', '35-44 years', 'Under 35 years']
# expenses_ord = ['Over 75% of total income', 'Between 50– 75% of total income', 'Between 25– 50% of total income', 'Less than 25% of total income']
# financial_ord = ['Less assets than debts', 'Assets equal to debts','More assets than debts', 'I will have adequate savings/investments throughout my retirement']
# experience_ord = ['Bank Deposit', 'Government Bond or Government Bond Fund', 'Debentures or Fixed Income Fund', 'Equity Stock or Equity Fund or other High-Risk Assets']
# estimatedPeriod_ord = ['Less than 1 year', '1 – 3 years', '3 – 5 years', 'More than 5 years']
# unwillingToAccept_ord = ['5% or less', 'Around 5% - 10%', 'Around 10% - 20%', 'Over 20%']
# scenario_ord = ['Scenario 1: returns of perhaps 2.5%, and 0% loss', 'Scenario 2: maximum returns of perhaps 7%, and 1% possible loss', 'Scenario 3: maximum returns of perhaps 15%, and 5% possible loss', 'Scenario 4: maximum returns of perhaps 25%, and 15% possible loss']

# cat_pipeline = Pipeline([
#         ('ordinal_enc', OrdinalEncoder(categories=[age_ord, expenses_ord, financial_ord, experience_ord, estimatedPeriod_ord, unwillingToAccept_ord, scenario_ord]))
#     ])

# cat_tr = cat_pipeline.fit_transform(X_train[['age','expenses','financial_status','experience','estimated_period','unwilling_toAccept','scenario']])

In [20]:
#Creating a column transformer to transform salary columns
full_pipeline = ColumnTransformer([
        # ("cat", cat_pipeline, ['age','expenses','financial_status','experience','estimated_period','unwilling_toAccept','scenario']),
        ("num", num_pipeline, ['salary'])
    ], remainder='passthrough')

train_tr = full_pipeline.fit_transform(X_train)
train_tr

array([[-1.29568119,  1.        ,  4.        , ...,  4.        ,
         3.        ,  4.        ],
       [ 0.41634517,  4.        ,  4.        , ...,  4.        ,
         4.        ,  4.        ],
       [-0.1548749 ,  4.        ,  4.        , ...,  1.        ,
         3.        ,  4.        ],
       ...,
       [-0.67397584,  1.        ,  1.        , ...,  1.        ,
         1.        ,  1.        ],
       [-1.07508538,  1.        ,  1.        , ...,  1.        ,
         1.        ,  1.        ],
       [ 0.83928499,  2.        ,  3.        , ...,  3.        ,
         2.        ,  3.        ]])

In [21]:
test_tr = full_pipeline.transform(X_test)
test_tr

array([[-0.63262708,  1.        ,  2.        , ...,  4.        ,
         1.        ,  1.        ],
       [-0.15248366,  3.        ,  4.        , ...,  4.        ,
         1.        ,  2.        ],
       [ 0.07709473,  3.        ,  2.        , ...,  2.        ,
         3.        ,  1.        ],
       ...,
       [ 0.28767299,  1.        ,  1.        , ...,  1.        ,
         1.        ,  1.        ],
       [-0.42747737,  1.        ,  3.        , ...,  4.        ,
         4.        ,  3.        ],
       [ 1.38195777,  1.        ,  2.        , ...,  2.        ,
         1.        ,  2.        ]])

## **FULL PIPELINE**

In [22]:
#Initiating SVC Model
svc = SVC()

In [23]:
#Initiating all process pipeline
all_process = Pipeline([
    ("preprocessing", full_pipeline),
    ("svc", svc )
])

In [24]:
all_process.fit(X_train, y_train)

In [25]:
train_result = all_process.predict(X_train)
test_result = all_process.predict(X_test)

In [26]:
print("=="*20,"TRAINING","=="*20)
print(classification_report(y_train,train_result,zero_division=0))
print("=="*20,"TEST","=="*20)
print(classification_report(y_test,test_result,zero_division=0))

              precision    recall  f1-score   support

           1       1.00      0.99      0.99       942
           2       0.99      1.00      0.99       749
           3       1.00      1.00      1.00      3415
           4       0.97      1.00      0.99      1663
           5       1.00      0.94      0.97       729

    accuracy                           0.99      7498
   macro avg       0.99      0.98      0.99      7498
weighted avg       0.99      0.99      0.99      7498

              precision    recall  f1-score   support

           1       1.00      0.99      0.99       277
           2       0.99      0.99      0.99       266
           3       1.00      1.00      1.00      1148
           4       0.96      1.00      0.98       545
           5       1.00      0.92      0.96       264

    accuracy                           0.99      2500
   macro avg       0.99      0.98      0.98      2500
weighted avg       0.99      0.99      0.99      2500



## **INFERENCE**

In [27]:
inference = inference[['salary','age','expenses','financial_status','experience','estimated_period','unwilling_toAccept','scenario']]

In [28]:
# full_pipeline.transform(inference)
all_process.predict(inference)

array([3, 3])

In [29]:
# save pipeline
pickle.dump(full_pipeline, open("full_model.pkl", "wb"))
# save model
pickle.dump(svc, open('model.pkl', 'wb'))

#### **TESTING THE SAVED MODEL**

In [30]:
model = pickle.load(open("full_model.pkl", "rb"))

In [31]:
inf = inference.head(1)
inf

Unnamed: 0,salary,age,expenses,financial_status,experience,estimated_period,unwilling_toAccept,scenario
653,85372300,1,4,4,4,2,4,2


In [32]:
# pipeline model predict input data
Category = model.predict(inf).tolist()[0]
Category

AttributeError: 'ColumnTransformer' object has no attribute 'predict'