# Import 

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.pipeline import Pipeline
import warnings

warnings.filterwarnings("ignore")
 

# EDA, Data Cleaning and Data Preprocessing

In [3]:
df = pd.read_csv("salary.csv")
df.head()


df.describe()
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Age                  5000 non-null   int64 
 1   Gender               5000 non-null   object
 2   Education            5000 non-null   object
 3   Marital_Status       5000 non-null   object
 4   Job_Type             5000 non-null   object
 5   Region               5000 non-null   object
 6   Income               5000 non-null   int64 
 7   Expenses             5000 non-null   int64 
 8   Savings              5000 non-null   int64 
 9   Credit_Score         5000 non-null   int64 
 10  Years_of_Experience  5000 non-null   int64 
 11  Loan_Amount          5000 non-null   int64 
 12  Loan_Approval        5000 non-null   int64 
 13  Default_Risk         5000 non-null   int64 
 14  Future_Income        5000 non-null   int64 
dtypes: int64(10), object(5)
memory usage: 586.1+ KB


In [4]:
df2 = df[(df["Income"]<0) |( df["Expenses"]<0) | (df["Future_Income"]<0)]
df.drop(df2.index,inplace=True)


In [5]:
# for col in num_cloumn:
#     plt.figure(figsize=(3,2))
#     sns.histplot(x=df[col],kde=True)

In [6]:
# for col in cat_column:
#     plt.figure(figsize=(3,2))
#     sns.boxplot(x=df[col])

In [7]:
from sklearn.model_selection import train_test_split
X = df.drop(columns=["Future_Income"],axis=1)
y = df["Future_Income"]

target = "Future_Income"
num_columns = X.select_dtypes(include=["int64", "float64"]).columns.to_list()
cat_columns = X.select_dtypes(include=["object"]).columns.to_list()

In [8]:

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)


In [9]:
num_transformation = StandardScaler()
cat_transformation = OneHotEncoder(handle_unknown="ignore")

preprocess_st_on = ColumnTransformer(transformers=[
    ('num',num_transformation,num_columns),
    ('cat',cat_transformation,cat_columns)
])

preprocess_on = ColumnTransformer(transformers=[
    ('cat',cat_transformation,cat_columns)
])

# Simple Model 

## Linear Regression

In [10]:
from sklearn.linear_model import LinearRegression
model_lr = Pipeline(steps=[
    ("preprocessing",preprocess_st_on),
    ("medel",LinearRegression())
])
model_lr.fit(X_train,y_train)

0,1,2
,steps,"[('preprocessing', ...), ('medel', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [11]:
y_pred = model_lr.predict(X_test)
y_pred

array([ 75290.77269896,  54883.81514076,  73024.67477486,  77236.1090331 ,
        69780.86703845,  45118.08360703,  74169.04111676,  25954.30019217,
        72893.48449931,  66992.48046873,  65294.24236546,  24868.41547379,
        63877.5883645 ,  40933.57082662,  50038.72447493,  68232.59911794,
        75733.30454252,  79530.55378857,  56540.26225813,  51893.40526796,
        66912.01643111,  91933.19229816,  74090.54825129,  45326.83156096,
        38466.6212915 ,  50276.45344358,  84864.85929987,  53261.53373309,
        79535.59595957,  77565.64748023,  38359.28206099,  75058.50918026,
        60066.52500568,  71587.73264318,  63979.61007561,  74551.57755821,
        59599.87768832,  50444.27884948,  72821.16621931,  46626.02793225,
        58990.27897278,  84341.4768561 ,  64628.73601365,  62712.9852778 ,
        40914.7155698 ,  38170.93415884,  47188.40364224,  40244.65240417,
        82145.95290668,  69817.25469329,  52459.68016278,  48987.80917875,
        53921.88663289,  

In [12]:
y_test

84      68545
3037    55481
1497    63628
653     83883
877     73192
        ...  
3568    56511
2992    35915
2655    47905
3314    41077
1837    34333
Name: Future_Income, Length: 999, dtype: int64

### R2,Adj R2 and MSE

In [13]:
from sklearn.metrics import r2_score,mean_absolute_error

r2 = r2_score(y_test,y_pred)
r2

0.881875547542098

In [14]:
n = X_test.shape[0]
p = X_test.shape[1]

adj_r2 = 1 - (1 - r2) * ((n - 1) / (n - p - 1))
adj_r2

0.8801949150884286

In [15]:

mae = mean_absolute_error(y_test, y_pred)
print("MAE:", mae)


MAE: 4727.702375733062


## KNN


In [16]:
from sklearn.neighbors import KNeighborsRegressor

model_knn = Pipeline(steps=[
    ('preprocessing',preprocess_st_on),
    ('model', KNeighborsRegressor(n_neighbors=17,leaf_size=10,weights="distance"))
])

model_knn.fit(X_train,y_train)

0,1,2
,steps,"[('preprocessing', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_neighbors,17
,weights,'distance'
,algorithm,'auto'
,leaf_size,10
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [17]:
y_pred_knn = model_knn.predict(X_test)
y_pred_knn 

array([69198.99515059, 54702.9235079 , 65687.5049969 , 73428.42005477,
       64985.20467155, 46627.64018484, 68611.33000972, 35204.99286178,
       67235.74351934, 64746.12347649, 60630.43786733, 35355.22668807,
       63211.61650932, 48758.92058849, 52549.13283149, 70054.43710949,
       69449.3541379 , 79362.47940487, 58203.03062661, 54858.06442804,
       62947.10424858, 78182.46264132, 66702.36168345, 42196.31751961,
       41325.15168565, 49888.19998485, 77945.35259736, 43664.97033621,
       76168.82002876, 70167.43261817, 45857.86539065, 63752.6626139 ,
       55085.43090964, 65826.33029861, 63531.89117977, 68654.63944531,
       57658.36632413, 55456.70128763, 65146.09762977, 42431.83877962,
       57938.96170908, 76905.76461832, 62652.37064171, 64045.82746534,
       40490.08844015, 40377.14563469, 46972.38199358, 38226.6069237 ,
       78997.57154544, 60748.02408187, 50944.47751609, 56199.28984074,
       54906.19018712, 37878.12534469, 47191.1695558 , 56329.96420138,
      

In [18]:
y_test

84      68545
3037    55481
1497    63628
653     83883
877     73192
        ...  
3568    56511
2992    35915
2655    47905
3314    41077
1837    34333
Name: Future_Income, Length: 999, dtype: int64

In [19]:
r2_knn =  r2_score(y_test,y_pred_knn)
r2_knn

0.7957129400166931

In [20]:
adj_r2 = 1 - (1 - r2_knn) * ((n - 1) / (n - p - 1))
adj_r2

0.7928064168055485

In [21]:
mae_knn = mean_absolute_error(y_test,y_pred_knn)
mae_knn

6192.037095486072

## Decision Tree

In [22]:
from sklearn.tree import DecisionTreeRegressor

model_dt = Pipeline(steps=[
    ('preprocess',preprocess_st_on),
    ('model',DecisionTreeRegressor(max_depth=10,min_samples_leaf=8,min_samples_split=2))
])

model_dt.fit(X_train,y_train)

0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,8
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [23]:
y_pred_dt = model_dt.predict(X_test)

In [24]:
r2_dt = r2_score(y_test,y_pred_dt)
r2_dt

0.8482435835906208

In [25]:
adj_r2 = 1 - (1 - r2_dt) * ((n - 1) / (n - p - 1))
adj_r2

0.8460844475847963

## SVM

In [26]:
from sklearn.svm import SVR

model_svr = Pipeline(steps=[
    ('preprocess',preprocess_st_on),
    ('model',SVR(kernel="linear"))
])

model_svr.fit(X_train,y_train)

0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,tol,0.001
,C,1.0
,epsilon,0.1
,shrinking,True
,cache_size,200
,verbose,False


In [27]:
y_pred_svr = model_svr.predict(X_test)

In [28]:
r2_svr = r2_score(y_test,y_pred_svr)
r2_svr

0.48482730183039013

# Cross validation

## for linear regression

In [29]:
from sklearn.model_selection import cross_val_score

score_lr = cross_val_score(model_lr,X,y,cv=5,scoring='r2')

print(score_lr.mean())

0.8947707150931086


## for KNN

In [30]:
score_knn = cross_val_score(model_knn,X,y,cv=5,scoring="r2")
score_knn.mean()

np.float64(0.8067854400437288)

# Hyperparameter Tuning

## Grid Search CV

### For KNN

In [31]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'model__n_neighbors': [3,5,7,9,11,13,17,25],
    'model__leaf_size': [10,20,30,50,60],
    'model__weights': ['uniform','distance']
}

grid = GridSearchCV(model_knn, param_grid, cv=5,scoring="r2")
grid.fit(X, y)

print("Best parameters:", grid.best_params_)
print("Best score:", grid.best_score_)

Best parameters: {'model__leaf_size': 10, 'model__n_neighbors': 17, 'model__weights': 'distance'}
Best score: 0.8067854400437288


### For Decision Tree 

In [32]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

param_grid = {
    'model__max_depth': [10, 15, 20, 25, 30],
    'model__min_samples_split': [2, 5, 10, 15],
    'model__min_samples_leaf': [1, 2, 4, 8]
}

grid = GridSearchCV(model_dt, param_grid, cv=5, scoring='r2')
grid.fit(X_train, y_train)

print("Best parameters:", grid.best_params_)
print("Best R² score:", grid.best_score_)



Best parameters: {'model__max_depth': 10, 'model__min_samples_leaf': 8, 'model__min_samples_split': 15}
Best R² score: 0.8632187625247063


## Random search cv

In [33]:
from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    'model__max_depth': [10, 15, 20, 25, 30],
    'model__min_samples_split': [2, 5, 10, 15],
    'model__min_samples_leaf': [1, 2, 4, 8]
}

random = RandomizedSearchCV(model_dt,param_grid,n_iter=5,cv=5,scoring='r2')

random.fit(X_train,y_train)


result = pd.DataFrame(random.cv_results_)
result[['param_model__min_samples_split','param_model__min_samples_leaf','param_model__max_depth','mean_test_score']]


Unnamed: 0,param_model__min_samples_split,param_model__min_samples_leaf,param_model__max_depth,mean_test_score
0,5,4,25,0.830166
1,2,1,30,0.785823
2,10,4,10,0.853349
3,15,1,25,0.830848
4,2,4,15,0.83186


# Stacking

In [None]:
from sklearn.ensemble import StackingRegressor

base_learner = [
    ('lr',model_lr),
    ('KNN',model_knn),
    ('DT',model_dt),
    ('SVR',model_svr)
]

Meta_learner = LinearRegression()

stacking_rg = StackingRegressor(
    estimators=base_learner,
    final_estimator=Meta_learner,
    cv=5
)

stacking_rg.fit(X_train,y_train)

y_pred_sr = stacking_rg.predict(X_test)

r2_sr = r2_score(y_test,y_pred_sr)

r2_sr

0.8820529374364601

# Bagging

In [None]:
from sklearn.ensemble import RandomForestRegressor

model_rf = Pipeline(steps=[
    ('preprocess',preprocess_st_on),
    ('model',RandomForestRegressor(
    n_estimators=500,
    max_depth=None,
    random_state=42
))
])

model_rf.fit(X_train,y_train)

y_pred_rf = model_rf.predict(X_test)

r2_rf = r2_score(y_test,y_pred_rf)
r2_rf

In [None]:
param_grid_rf = {
    'model__n_estimators': [100, 200, 500],
    'model__max_depth': [None, 10, 20, 30],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'model__max_features': ['auto', 'sqrt', 'log2'],
    'model__bootstrap': [True, False]
}
model_rf_best = RandomizedSearchCV(model_rf,param_grid_rf,cv=5,scoring='r2',n_iter=50,n_jobs=-1)

model_rf_best.fit(X_train,y_train)

print(model_rf_best.best_params_)
print(model_rf_best.best_score_)


{'model__n_estimators': 500, 'model__min_samples_split': 2, 'model__min_samples_leaf': 1, 'model__max_features': 'sqrt', 'model__max_depth': None, 'model__bootstrap': False}
0.8817195331284855


# Boosting

## Ada boost

In [None]:
from sklearn.ensemble import AdaBoostRegressor

model_ada = Pipeline(steps=[
    ('preprocess',preprocess_st_on),
    ('model',AdaBoostRegressor(n_estimators=100,random_state=42))
])
model_ada.fit(X_train,y_train)

y_pred_ada = model_ada.predict(X_test)

r2_ada = r2_score(y_test,y_pred_ada)
r2_ada

0.8717450180671565

## Gradient boost

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

model_GBR = Pipeline(steps=[
    ('preprocess',preprocess_st_on),
    ('model',GradientBoostingRegressor(n_estimators=100,random_state=42))
])
model_GBR.fit(X_train,y_train)

y_pred_GBR = model_GBR.predict(X_test)

r2_GBR = r2_score(y_test,y_pred_GBR)
r2_GBR

0.8816188597684004

In [50]:
import xgboost as xgb

model_xgb = Pipeline(steps=[
    ('preprocess',preprocess_st_on),
    ('model',xgb.XGBRegressor(n_estimators=100,random_state=42))
])
model_xgb.fit(X_train,y_train)

y_pred_xgb = model_xgb.predict(X_test)

r2_xgb = r2_score(y_test,y_pred_xgb)
r2_xgb

0.8548287749290466

# For UI

In [52]:
df.head(4)

Unnamed: 0,Age,Gender,Education,Marital_Status,Job_Type,Region,Income,Expenses,Savings,Credit_Score,Years_of_Experience,Loan_Amount,Loan_Approval,Default_Risk,Future_Income
0,56,Female,Bachelor,Single,Education,North,48353,28561,23863,501,37,415204,0,0,56040
1,69,Other,Bachelor,Married,Unemployed,East,57462,39671,21336,751,51,100862,1,0,68516
2,46,Female,Bachelor,Divorced,Healthcare,North,44219,17505,33720,486,26,258727,0,0,49016
3,32,Male,Master,Single,Construction,South,56306,18301,43973,667,14,202228,1,0,77005


In [40]:
df["Region"].unique()

array(['North', 'East', 'South', 'West'], dtype=object)

In [41]:
df.describe()

Unnamed: 0,Age,Income,Expenses,Savings,Credit_Score,Years_of_Experience,Loan_Amount,Loan_Approval,Default_Risk,Future_Income
count,4995.0,4995.0,4995.0,4995.0,4995.0,4995.0,4995.0,4995.0,4995.0,4995.0
mean,43.588388,49793.998198,27318.354755,24508.568769,575.79019,26.112112,255528.770571,0.336537,0.150951,57263.275075
std,14.915848,15034.822512,11145.919769,11521.267038,159.506088,15.005854,143285.538457,0.472572,0.358037,17771.644788
min,18.0,128.0,75.0,-8616.0,300.0,0.0,5157.0,0.0,0.0,3567.0
25%,31.0,39619.0,18955.0,16163.0,434.0,13.0,130287.0,0.0,0.0,44977.5
50%,43.0,49602.0,25890.0,23379.0,576.0,26.0,259756.0,0.0,0.0,57031.0
75%,56.0,59933.0,34355.0,31888.0,714.0,39.0,380594.5,1.0,0.0,69251.5
max,69.0,99145.0,71591.0,65105.0,849.0,55.0,499915.0,1.0,1.0,124306.0


In [51]:
import joblib
joblib.dump(model_xgb,"model_xgb.pkl")


['model_xgb.pkl']

In [44]:
df.columns.to_list()

['Age',
 'Gender',
 'Education',
 'Marital_Status',
 'Job_Type',
 'Region',
 'Income',
 'Expenses',
 'Savings',
 'Credit_Score',
 'Years_of_Experience',
 'Loan_Amount',
 'Loan_Approval',
 'Default_Risk',
 'Future_Income']