In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [2]:
df = pd.read_csv('users_data.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 2000 non-null   int64  
 1   current_age        2000 non-null   int64  
 2   retirement_age     2000 non-null   int64  
 3   birth_year         2000 non-null   int64  
 4   birth_month        2000 non-null   int64  
 5   gender             2000 non-null   object 
 6   address            2000 non-null   object 
 7   latitude           2000 non-null   float64
 8   longitude          2000 non-null   float64
 9   per_capita_income  2000 non-null   int64  
 10  yearly_income      2000 non-null   int64  
 11  total_debt         2000 non-null   int64  
 12  credit_score       2000 non-null   int64  
 13  num_credit_cards   2000 non-null   int64  
dtypes: float64(2), int64(10), object(2)
memory usage: 218.9+ KB


In [4]:
def Encoder(df):
    encoder = LabelEncoder()
    for col in df.columns:
        if df[col].dtype == 'object':
            if df[col].nunique() <= 5:
                dummies = pd.get_dummies(df[col], prefix=col, dtype=int)
                df = pd.concat([df.drop(columns=[col]), dummies], axis=1)
            else:
                df[col] = encoder.fit_transform(df[col])
    return df

In [5]:
df = Encoder(df)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 2000 non-null   int64  
 1   current_age        2000 non-null   int64  
 2   retirement_age     2000 non-null   int64  
 3   birth_year         2000 non-null   int64  
 4   birth_month        2000 non-null   int64  
 5   address            2000 non-null   int64  
 6   latitude           2000 non-null   float64
 7   longitude          2000 non-null   float64
 8   per_capita_income  2000 non-null   int64  
 9   yearly_income      2000 non-null   int64  
 10  total_debt         2000 non-null   int64  
 11  credit_score       2000 non-null   int64  
 12  num_credit_cards   2000 non-null   int64  
 13  gender_Female      2000 non-null   int64  
 14  gender_Male        2000 non-null   int64  
dtypes: float64(2), int64(13)
memory usage: 234.5 KB


In [7]:
def Scaler(df):
    scaler = MinMaxScaler()
    num_col = df.select_dtypes(include=['float64', 'int64']).columns.drop('yearly_income')
    df[num_col] = scaler.fit_transform(df[num_col])
    return df

In [8]:
df = Scaler(df)

In [9]:
df.head(20)

Unnamed: 0,id,current_age,retirement_age,birth_year,birth_month,address,latitude,longitude,per_capita_income,yearly_income,total_debt,credit_score,num_credit_cards,gender_Female,gender_Male
0,0.412706,0.421687,0.551724,0.571429,0.909091,0.409409,0.329117,0.459004,0.17946,59696,0.247186,0.82973,0.5,1.0,0.0
1,0.873437,0.421687,0.62069,0.571429,1.0,0.303804,0.493056,0.944126,0.232254,77254,0.370642,0.597297,0.5,1.0,0.0
2,0.85943,0.759036,0.586207,0.238095,0.909091,0.745245,0.325893,0.457571,0.139024,33483,0.00038,0.589189,0.5,1.0,0.0
3,0.354177,0.542169,0.448276,0.464286,0.0,0.223223,0.491815,0.941371,1.0,249925,0.391909,0.654054,0.375,1.0,0.0
4,0.582291,0.301205,0.689655,0.690476,0.727273,0.96046,0.418651,0.407428,0.32975,109687,0.356127,0.527027,0.0,0.0,1.0
5,0.034017,0.289157,0.689655,0.702381,0.818182,0.538038,0.512649,0.75832,0.126262,41997,0.0,0.605405,0.25,0.0,1.0
6,0.537769,0.216867,0.586207,0.77381,1.0,0.529029,0.43006,0.81188,0.154819,51500,0.198128,0.518919,0.25,1.0,0.0
7,0.855928,0.096386,0.586207,0.892857,1.0,0.101602,0.610863,0.405224,0.16421,54623,0.222195,0.67027,0.0,0.0,1.0
8,0.558279,0.759036,0.551724,0.238095,0.545455,0.012513,0.482143,0.926714,0.161041,42509,0.005608,0.743243,0.5,1.0,0.0
9,0.876438,0.192771,0.344828,0.809524,0.0,0.862863,0.225446,0.741569,0.114806,38190,0.157404,0.891892,0.0,1.0,0.0


In [10]:
from sklearn.model_selection import train_test_split

x = df.drop('yearly_income', axis=1)
y = df['yearly_income']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42) 

# SVR

In [11]:
from sklearn.svm import SVR

In [12]:
svr = SVR(kernel='rbf', C=0.5)

In [13]:
svr.fit(x_train, y_train)

0,1,2
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,tol,0.001
,C,0.5
,epsilon,0.1
,shrinking,True
,cache_size,200
,verbose,False


In [14]:
y_pred = svr.predict(x_test)

In [15]:
from sklearn.metrics import r2_score, mean_absolute_error

In [16]:
svr_score = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

In [17]:
print(svr_score)
print(mae)

-0.03546368843391545
14323.257160810079


# Linear Regression

In [18]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

In [19]:
lr.fit(x_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [20]:
y_pred = lr.predict(x_test)

In [21]:
lr_score = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

In [22]:
print(lr_score)
print(mae)

0.9521986732868627
2640.966430280642


# Decision Tree

In [23]:
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor(random_state=42)

In [24]:
dt.fit(x_train, y_train)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [25]:
y_pred = dt.predict(x_test)

In [26]:
dt_score = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

In [27]:
print(dt_score)
print(mae)

0.9204070323907539
1999.92


# Random Forest

In [28]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()

In [29]:
rf.fit(x_train, y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [30]:
y_pred = rf.predict(x_test)

In [31]:
rf_score = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

In [32]:
print(rf_score)
print(mae)

0.9610246070326249
1526.212375


In [33]:
from tabulate import tabulate

In [34]:
result = [
    ['CVR', svr_score],
    ['Linear Regression', lr_score],
    ['Decision Tree', dt_score],
    ['Random Forest', rf_score],
]

headers = ['Algorithm', 'r2_score']

table = tabulate(result, headers=headers, tablefmt='grid', floatfmt='.2f')

print(table)

+-------------------+------------+
| Algorithm         |   r2_score |
| CVR               |      -0.04 |
+-------------------+------------+
| Linear Regression |       0.95 |
+-------------------+------------+
| Decision Tree     |       0.92 |
+-------------------+------------+
| Random Forest     |       0.96 |
+-------------------+------------+


In [38]:
from joblib import dump, load

dump(svr, 'income_predict_svr.joblib')
dump(lr, 'income_predict_lr.joblib')
dump(dt, 'income_predict_dt.joblib')
dump(rf, 'income_predict_rf.joblib')

['income_predict_rf.joblib']