In [35]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [36]:
df = pd.read_csv('users_data.csv')

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 2000 non-null   int64  
 1   current_age        2000 non-null   int64  
 2   retirement_age     2000 non-null   int64  
 3   birth_year         2000 non-null   int64  
 4   birth_month        2000 non-null   int64  
 5   gender             2000 non-null   object 
 6   address            2000 non-null   object 
 7   latitude           2000 non-null   float64
 8   longitude          2000 non-null   float64
 9   per_capita_income  2000 non-null   int64  
 10  yearly_income      2000 non-null   int64  
 11  total_debt         2000 non-null   int64  
 12  credit_score       2000 non-null   int64  
 13  num_credit_cards   2000 non-null   int64  
dtypes: float64(2), int64(10), object(2)
memory usage: 218.9+ KB


In [38]:
encoder = LabelEncoder()
scaler = MinMaxScaler()

In [39]:
class DataPreprocessing:
    def __init__(self, df):
        self.df = df.copy()
    
    def fillMissingValues(self):
        for col in self.df.columns:
            if self.df[col].isnull().any():
                if self.df[col].dtype == 'object':
                    self.df[col].fillna(self.df[col].mode()[0], inplace=True)
                else:
                    self.df[col].fillna(self.df[col].mean(), inplace=True)
        return self
    
    def encode(self):
        for col in self.df.columns:
            if self.df[col].dtype == 'object':
                if self.df[col].nunique() <= 5:
                    dummies = pd.get_dummies(self.df[col], prefix=col, dtype=int)
                    self.df = pd.concat([self.df.drop(columns=[col]), dummies], axis=1)
                else:
                    self.df[col] = encoder.fit_transform(self.df[col])
        return self
    
    def scale(self):
        num_cols = self.df.select_dtypes(include=['int64', 'float64']).columns.drop('yearly_income')
        self.df[num_cols] = scaler.fit_transform(self.df[num_cols])
        return self
    
    def log_transformer(self, target='yearly_income'):
        skewness = self.df.skew()
        features_log = skewness[(skewness >= 0.5)].index.tolist()

        if target in features_log:
            features_log.remove(target)
        
        for col in features_log:
            if (self.df[col] > 0).all():
                self.df[col] = np.log1p(self.df[col])
        return self
    
    def getPreprocessed(self):
        return self.df


In [40]:
preprocessing = DataPreprocessing(df)
df = preprocessing.fillMissingValues().encode().scale().log_transformer().getPreprocessed()

In [41]:
df.head(10)

Unnamed: 0,id,current_age,retirement_age,birth_year,birth_month,address,latitude,longitude,per_capita_income,yearly_income,total_debt,credit_score,num_credit_cards,gender_Female,gender_Male
0,0.412706,0.421687,0.551724,0.571429,0.909091,0.409409,0.329117,0.459004,0.17946,59696,0.247186,0.82973,0.5,1.0,0.0
1,0.873437,0.421687,0.62069,0.571429,1.0,0.303804,0.493056,0.944126,0.232254,77254,0.370642,0.597297,0.5,1.0,0.0
2,0.85943,0.759036,0.586207,0.238095,0.909091,0.745245,0.325893,0.457571,0.139024,33483,0.00038,0.589189,0.5,1.0,0.0
3,0.354177,0.542169,0.448276,0.464286,0.0,0.223223,0.491815,0.941371,1.0,249925,0.391909,0.654054,0.375,1.0,0.0
4,0.582291,0.301205,0.689655,0.690476,0.727273,0.96046,0.418651,0.407428,0.32975,109687,0.356127,0.527027,0.0,0.0,1.0
5,0.034017,0.289157,0.689655,0.702381,0.818182,0.538038,0.512649,0.75832,0.126262,41997,0.0,0.605405,0.25,0.0,1.0
6,0.537769,0.216867,0.586207,0.77381,1.0,0.529029,0.43006,0.81188,0.154819,51500,0.198128,0.518919,0.25,1.0,0.0
7,0.855928,0.096386,0.586207,0.892857,1.0,0.101602,0.610863,0.405224,0.16421,54623,0.222195,0.67027,0.0,0.0,1.0
8,0.558279,0.759036,0.551724,0.238095,0.545455,0.012513,0.482143,0.926714,0.161041,42509,0.005608,0.743243,0.5,1.0,0.0
9,0.876438,0.192771,0.344828,0.809524,0.0,0.862863,0.225446,0.741569,0.114806,38190,0.157404,0.891892,0.0,1.0,0.0


# Import algorithms

In [42]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

from sklearn.feature_selection import RFE

# Import metrics

In [43]:
from sklearn.metrics import r2_score, mean_absolute_error

# Train/Test split

In [44]:
from sklearn.model_selection import train_test_split

x = df.drop('yearly_income', axis=1)
y = df['yearly_income']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Linear Regression

In [45]:
lr_model = LinearRegression()

lr_rfe = RFE(lr_model, n_features_to_select=10)
lr_rfe.fit(x_train, y_train)
y_pred = lr_rfe.predict(x_test)

In [46]:
lr_score = r2_score(y_test, y_pred)
lr_mae = mean_absolute_error(y_test, y_pred)

In [47]:
lr_score

0.9523735873418111

In [48]:
lr_mae

2627.8991078762733

# Decision Tree

In [49]:
dt = DecisionTreeRegressor(random_state=42)

dt.fit(x_train, y_train)

y_pred = dt.predict(x_test)

In [50]:
dt_score = r2_score(y_test, y_pred)
dt_mae = mean_absolute_error(y_test, y_pred)

In [51]:
dt_score

0.9204070323907539

In [52]:
dt_mae

1999.92

# Random Forest

In [53]:
rf = RandomForestRegressor(n_estimators=200, random_state=42)

rf.fit(x_train, y_train)

y_pred = rf.predict(x_test)

feat_imp = pd.DataFrame({
    'Feature': x_train.columns,
    'Importance': rf.feature_importances_
}).sort_values(by='Importance', ascending=False)

print(feat_imp)

              Feature  Importance
8   per_capita_income    0.942876
1         current_age    0.011226
3          birth_year    0.010743
10       credit_score    0.008781
9          total_debt    0.004512
7           longitude    0.003827
0                  id    0.003163
4         birth_month    0.003150
11   num_credit_cards    0.002875
2      retirement_age    0.002787
6            latitude    0.002739
5             address    0.002720
12      gender_Female    0.000368
13        gender_Male    0.000233


In [54]:
rf_score = r2_score(y_test, y_pred)
rf_mae = mean_absolute_error(y_test, y_pred)

In [55]:
rf_score

0.9635433322739688

In [56]:
rf_mae

1527.818075

# SVM

In [57]:
svr = SVR(kernel='linear', C=0.5)

svr.fit(x_train, y_train)

y_pred = svr.predict(x_test)

In [58]:
svr_score = r2_score(y_test, y_pred)
svr_mae = mean_absolute_error(y_test, y_pred)

In [59]:
svr_score

-0.035123592912133184

In [60]:
svr_mae

14320.11751686801

# KNN

In [61]:
knn = KNeighborsRegressor(n_neighbors=3)

knn.fit(x_train, y_train)

y_pred = knn.predict(x_test)

In [62]:
knn_score = r2_score(y_test, y_pred)
knn_mae = mean_absolute_error(y_test, y_pred)

In [63]:
knn_score

0.2602860470080225

In [64]:
knn_mae

13309.413333333332

# Tabulate

In [65]:
from tabulate import tabulate

In [66]:
result = [
    ['Linear Regression', lr_score, lr_mae],
    ['Decision Tree', dt_score, dt_mae],
    ['Random Forest', rf_score, rf_mae],
    ['SVM', svr_score, svr_mae],
    ['KNN', knn_score, knn_mae],
]

headers = ['Algorithm', 'r2_score', 'mean absolute error']

table = tabulate(result, headers=headers, tablefmt='grid', floatfmt='.2f')

In [67]:
print(table)

+-------------------+------------+-----------------------+
| Algorithm         |   r2_score |   mean absolute error |
| Linear Regression |       0.95 |               2627.90 |
+-------------------+------------+-----------------------+
| Decision Tree     |       0.92 |               1999.92 |
+-------------------+------------+-----------------------+
| Random Forest     |       0.96 |               1527.82 |
+-------------------+------------+-----------------------+
| SVM               |      -0.04 |              14320.12 |
+-------------------+------------+-----------------------+
| KNN               |       0.26 |              13309.41 |
+-------------------+------------+-----------------------+
