In [71]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split, KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [72]:
df1 = pd.read_csv("../data/playstore_cleaned (2).csv")
df2 = pd.read_csv("../data/playstore_cleaned_recentyear.csv")

In [73]:
df1.columns, df2.columns

(Index(['Unnamed: 0', 'App', 'Category', 'Rating', 'Reviews', 'Size',
        'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Current Ver',
        'Android Ver', 'Day', 'month', 'year'],
       dtype='object'),
 Index(['Unnamed: 0', 'App', 'Category', 'Rating', 'Reviews', 'Size',
        'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Current Ver',
        'Android Ver', 'Day', 'month', 'year'],
       dtype='object'))

In [74]:
#df2 = df2.rename(columns={"Size (KB)": "Size"})

In [75]:
df = pd.concat([df1, df2], ignore_index=True)

In [76]:
df

Unnamed: 0.1,Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Current Ver,Android Ver,Day,month,year
0,0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19000.0,10000,Free,0.0,Everyone,Art & Design,1.0.0,4.0.3 and up,7,1,2018
1,1,Coloring book moana,ART_AND_DESIGN,3.9,967,14000.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,2.0.0,4.0.3 and up,15,1,2018
2,2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8700.0,5000000,Free,0.0,Everyone,Art & Design,1.2.4,4.0.3 and up,1,8,2018
3,3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25000.0,50000000,Free,0.0,Teen,Art & Design,Varies with device,4.2 and up,8,6,2018
4,4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2800.0,100000,Free,0.0,Everyone,Art & Design;Creativity,1.1,4.4 and up,20,6,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11781,11886,Canva,ART_AND_DESIGN,4.7,12000000,40000.0,100000000,Free,0.0,Everyone,Art & Design,Varies with device,5.0 and up,22,8,2020
11782,11887,Sketchbook,ART_AND_DESIGN,4.3,800000,90000.0,50000000,Free,0.0,Everyone,Art & Design,Varies with device,5.0 and up,1,1,2020
11783,11888,MediBang Paint,ART_AND_DESIGN,4.4,500000,60000.0,10000000,Free,0.0,Everyone,Art & Design,25.3,5.0 and up,1,1,2020
11784,11889,ibis Paint X,ART_AND_DESIGN,4.6,3000000,70000.0,100000000,Free,0.0,Everyone,Art & Design,10.0.6,4.4 and up,1,1,2020


In [77]:
df["year"].value_counts()

year
2018    7350
2017    1869
2016     804
2020     534
2015     465
2014     216
2024     125
2013     114
2023      91
2021      65
2022      63
2012      37
2011      20
2025      16
2026       9
2010       4
2027       3
2019       1
Name: count, dtype: int64

In [78]:
df.head()

Unnamed: 0.1,Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Current Ver,Android Ver,Day,month,year
0,0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19000.0,10000,Free,0.0,Everyone,Art & Design,1.0.0,4.0.3 and up,7,1,2018
1,1,Coloring book moana,ART_AND_DESIGN,3.9,967,14000.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,2.0.0,4.0.3 and up,15,1,2018
2,2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8700.0,5000000,Free,0.0,Everyone,Art & Design,1.2.4,4.0.3 and up,1,8,2018
3,3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25000.0,50000000,Free,0.0,Teen,Art & Design,Varies with device,4.2 and up,8,6,2018
4,4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2800.0,100000,Free,0.0,Everyone,Art & Design;Creativity,1.1,4.4 and up,20,6,2018


In [79]:
df["Installs"].to_csv("../data/installs.csv", index=False)

In [80]:
df.drop(axis=1, columns=["Unnamed: 0", "Type", "App", "Genres",
                         "Current Ver", "Android Ver", "Day", "month"],
        inplace=True)

In [81]:
df.head()

Unnamed: 0,Category,Rating,Reviews,Size,Installs,Price,Content Rating,year
0,ART_AND_DESIGN,4.1,159,19000.0,10000,0.0,Everyone,2018
1,ART_AND_DESIGN,3.9,967,14000.0,500000,0.0,Everyone,2018
2,ART_AND_DESIGN,4.7,87510,8700.0,5000000,0.0,Everyone,2018
3,ART_AND_DESIGN,4.5,215644,25000.0,50000000,0.0,Teen,2018
4,ART_AND_DESIGN,4.3,967,2800.0,100000,0.0,Everyone,2018


The App Success Score combines rating and reviews as Rating × log(1 + Reviews) to fairly capture both quality and popularity, with a log transform compressing large review counts and safely handling zeros.


## Log Transformation

The `log(1 + x)` function is used to compress large numerical values and handle zeros safely.
It is especially useful for features like review counts, where values can range from 0 to millions.
By applying `log(1 + x)`, small values remain meaningful, huge values are compressed, and zero values do not cause errors, making the data more suitable for modeling.



In [82]:
df["Score"] = df["Rating"] * np.log(1 + df["Reviews"])

In [83]:
df.drop(axis=1, columns=["Rating", "Reviews"], inplace=True)

In [84]:
df.tail()

Unnamed: 0,Category,Size,Installs,Price,Content Rating,year,Score
11781,ART_AND_DESIGN,40000.0,100000000,0.0,Everyone,2020,76.611961
11782,ART_AND_DESIGN,90000.0,50000000,0.0,Everyone,2020,58.447184
11783,ART_AND_DESIGN,60000.0,10000000,0.0,Everyone,2020,57.738408
11784,ART_AND_DESIGN,70000.0,100000000,0.0,Everyone,2020,68.604967
11785,ART_AND_DESIGN,40000.0,5000000,0.0,Everyone,2020,49.505622


In [85]:
df.shape

(11786, 7)

In [86]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11786 entries, 0 to 11785
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Category        11786 non-null  object 
 1   Size            11786 non-null  float64
 2   Installs        11786 non-null  int64  
 3   Price           11786 non-null  float64
 4   Content Rating  11786 non-null  object 
 5   year            11786 non-null  int64  
 6   Score           11786 non-null  float64
dtypes: float64(3), int64(2), object(2)
memory usage: 644.7+ KB


**An app's Score (its weighted rating) is one of the most important factors related to its success. But it's not a cause of success that you can know beforehand; it is a result of success. Also only already realsed apps will have this info so the user will not be able to give this parameter.**

In [87]:
df.drop(axis=1, columns=["Score"], inplace=True)

In [88]:
df.head()

Unnamed: 0,Category,Size,Installs,Price,Content Rating,year
0,ART_AND_DESIGN,19000.0,10000,0.0,Everyone,2018
1,ART_AND_DESIGN,14000.0,500000,0.0,Everyone,2018
2,ART_AND_DESIGN,8700.0,5000000,0.0,Everyone,2018
3,ART_AND_DESIGN,25000.0,50000000,0.0,Teen,2018
4,ART_AND_DESIGN,2800.0,100000,0.0,Everyone,2018


In [89]:
df.shape

(11786, 6)

In [90]:
df.to_csv("../data/preprocessed_data.csv", index=False)

In [91]:
ord_encoder = OrdinalEncoder(categories=[["Everyone", "Everyone 10+",
                             "Teen", "Mature 17+", "Unrated"]], handle_unknown="use_encoded_value", unknown_value=-1)
oh_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

In [92]:
cat_transformer = ColumnTransformer(transformers=[
     ("ord", ord_encoder, ["Content Rating"]),
     ("ohe", oh_encoder, ["Category"])

], remainder='passthrough')

In [93]:
random_forest_model = RandomForestRegressor(random_state=42)
xgb_model = XGBRegressor(random_state=42)

In [94]:
pipeline_rf = Pipeline(steps=[
    ('cat_preproc', cat_transformer),
    ('model', random_forest_model)
])

pipeline_xgb = Pipeline(steps=[
    ('cat_preproc', cat_transformer),
    ('model', xgb_model)
])

**log transformation is used with install because install falls into a very large and diverse range (skewed) so the model predicts relatively larger value for even very smaller ones as the larger numbers in the target predominates.**

In [95]:
drop_index = df.loc[df["Category"] == "PHOTO_AND_VIDEO", "Category"].index
df.drop(index=drop_index, inplace=True)
#Identified only a single example have this category so dropping it.

In [96]:
X = df.drop(axis=1, columns=["Installs"])
y = df["Installs"].map(np.log1p)

In [97]:
df.dtypes


Category           object
Size              float64
Installs            int64
Price             float64
Content Rating     object
year                int64
dtype: object

In [98]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [99]:
value_counts_category = X_train["Category"].value_counts()
value_counts_category

Category
FAMILY                 1599
GAME                    921
TOOLS                   725
PRODUCTIVITY            418
MEDICAL                 368
BUSINESS                360
COMMUNICATION           337
LIFESTYLE               319
PERSONALIZATION         316
FINANCE                 315
SPORTS                  307
HEALTH_AND_FITNESS      295
PHOTOGRAPHY             289
SOCIAL                  260
NEWS_AND_MAGAZINES      240
TRAVEL_AND_LOCAL        233
SHOPPING                231
BOOKS_AND_REFERENCE     208
DATING                  203
VIDEO_PLAYERS           157
ENTERTAINMENT           150
EDUCATION               135
FOOD_AND_DRINK          122
MAPS_AND_NAVIGATION     109
GAME_ACTION              86
LIBRARIES_AND_DEMO       74
ART_AND_DESIGN           74
GAME_ROLE_PLAYING        71
HOUSE_AND_HOME           66
WEATHER                  66
AUTO_AND_VEHICLES        61
EVENTS                   54
COMICS                   47
PARENTING                45
BEAUTY                   39
MUSIC_AND_A

In [100]:
X_train["Category"].unique()

array(['BOOKS_AND_REFERENCE', 'FAMILY', 'VIDEO_PLAYERS', 'BUSINESS',
       'HOUSE_AND_HOME', 'DATING', 'GAME', 'LIBRARIES_AND_DEMO',
       'FINANCE', 'HEALTH_AND_FITNESS', 'MEDICAL', 'ENTERTAINMENT',
       'TRAVEL_AND_LOCAL', 'GAME_ACTION', 'LIFESTYLE',
       'GAME_ROLE_PLAYING', 'EVENTS', 'PRODUCTIVITY', 'FOOD_AND_DRINK',
       'NEWS_AND_MAGAZINES', 'SPORTS', 'TOOLS', 'GAME_SIMULATION',
       'SOCIAL', 'PHOTOGRAPHY', 'AUTO_AND_VEHICLES', 'SHOPPING',
       'EDUCATION', 'COMMUNICATION', 'PERSONALIZATION', 'GAME_CARD',
       'ART_AND_DESIGN', 'PARENTING', 'MUSIC_AND_AUDIO', 'COMICS',
       'MAPS_AND_NAVIGATION', 'WEATHER', 'GAME_BOARD', 'GAME_STRATEGY',
       'GAME_SPORTS', 'BEAUTY', 'GAME_ADVENTURE', 'GAME_MUSIC',
       'GAME_RACING', 'GAME_PUZZLE', 'GAME_WORD'], dtype=object)

In [101]:
X_train["Content Rating"].unique()

array(['Everyone', 'Teen', 'Mature 17+', 'Everyone 10+',
       'Adults only 18+', 'Unrated'], dtype=object)

In [102]:
infreq_cols = value_counts_category[value_counts_category < 11].index
X_train["Category"] = X_train["Category"].replace(infreq_cols, "GAME")
X_valid["Category"] = X_valid["Category"].replace(infreq_cols, "GAME")
X_train["Category"].value_counts()

Category
FAMILY                 1599
GAME                    956
TOOLS                   725
PRODUCTIVITY            418
MEDICAL                 368
BUSINESS                360
COMMUNICATION           337
LIFESTYLE               319
PERSONALIZATION         316
FINANCE                 315
SPORTS                  307
HEALTH_AND_FITNESS      295
PHOTOGRAPHY             289
SOCIAL                  260
NEWS_AND_MAGAZINES      240
TRAVEL_AND_LOCAL        233
SHOPPING                231
BOOKS_AND_REFERENCE     208
DATING                  203
VIDEO_PLAYERS           157
ENTERTAINMENT           150
EDUCATION               135
FOOD_AND_DRINK          122
MAPS_AND_NAVIGATION     109
GAME_ACTION              86
ART_AND_DESIGN           74
LIBRARIES_AND_DEMO       74
GAME_ROLE_PLAYING        71
HOUSE_AND_HOME           66
WEATHER                  66
AUTO_AND_VEHICLES        61
EVENTS                   54
COMICS                   47
PARENTING                45
BEAUTY                   39
MUSIC_AND_A

Putting
infrequent
element
to
a
single
section
others
to
reduce
the
number
of
unique
elements
elements in Category
col
to
improve
the
training
process
of
model
when
using
OHE.

In [103]:
param_grid_rf = {
    "model__n_estimators": [500, 1000, 1500],
    "model__max_depth": [None, 10, 20, 30],
    "model__min_samples_split": [2, 5, 10],
    "model__min_samples_leaf": [1, 2, 4]
}

grid_search_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=5, verbose=True, n_jobs=-1)

grid_search_rf.fit(X_train, y_train)




Fitting 5 folds for each of 108 candidates, totalling 540 fits


0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'model__max_depth': [None, 10, ...], 'model__min_samples_leaf': [1, 2, ...], 'model__min_samples_split': [2, 5, ...], 'model__n_estimators': [500, 1000, ...]}"
,scoring,
,n_jobs,-1
,refit,True
,cv,5
,verbose,True
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('ord', ...), ('ohe', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,"[['Everyone', 'Everyone 10+', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,1500
,criterion,'squared_error'
,max_depth,20
,min_samples_split,10
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [104]:
param_grid_xgb = {
    "model__n_estimators": [500, 1000, 1500, 2000],
    "model__max_depth": [3, 5, 7, 10, 12],
    "model__learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3]
}

grid_search_xgb = GridSearchCV(pipeline_xgb, param_grid_xgb, cv=5, verbose=True, n_jobs=-1)

grid_search_xgb.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


0,1,2
,estimator,"Pipeline(step...=None, ...))])"
,param_grid,"{'model__learning_rate': [0.01, 0.05, ...], 'model__max_depth': [3, 5, ...], 'model__n_estimators': [500, 1000, ...]}"
,scoring,
,n_jobs,-1
,refit,True
,cv,5
,verbose,True
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('ord', ...), ('ohe', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,"[['Everyone', 'Everyone 10+', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [105]:
print(f"tuned xgb:\n"
      f"{grid_search_xgb.best_params_}\n"
      f"tuned rf:\n"
      f"{grid_search_rf.best_params_}\n")

tuned xgb:
{'model__learning_rate': 0.05, 'model__max_depth': 5, 'model__n_estimators': 500}
tuned rf:
{'model__max_depth': 20, 'model__min_samples_leaf': 2, 'model__min_samples_split': 10, 'model__n_estimators': 1500}



Secondary Tuning

In [106]:
param_grid_rf2 = {
    "model__n_estimators": [400, 500, 600],
    "model__max_depth": [ 17, 20, 23],
    "model__min_samples_split": [8, 10, 12],
    "model__min_samples_leaf": [1, 2, 3]}

grid_search_rf2 = GridSearchCV(pipeline_rf, param_grid_rf2, cv=5, verbose=True, n_jobs=-1)

grid_search_rf2.fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'model__max_depth': [17, 20, ...], 'model__min_samples_leaf': [1, 2, ...], 'model__min_samples_split': [8, 10, ...], 'model__n_estimators': [400, 500, ...]}"
,scoring,
,n_jobs,-1
,refit,True
,cv,5
,verbose,True
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('ord', ...), ('ohe', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,"[['Everyone', 'Everyone 10+', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,600
,criterion,'squared_error'
,max_depth,20
,min_samples_split,12
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [107]:
param_grid_xgb2 = {
    "model__n_estimators": [300, 500, 700],
    "model__max_depth": [4, 5, 6],
    "model__learning_rate": [0.025, 0.05, 0.075]
}

grid_search_xgb2 = GridSearchCV(pipeline_xgb, param_grid_xgb2, cv=5, verbose=True, n_jobs=-1)

grid_search_xgb2.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


0,1,2
,estimator,"Pipeline(step...=None, ...))])"
,param_grid,"{'model__learning_rate': [0.025, 0.05, ...], 'model__max_depth': [4, 5, ...], 'model__n_estimators': [300, 500, ...]}"
,scoring,
,n_jobs,-1
,refit,True
,cv,5
,verbose,True
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('ord', ...), ('ohe', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,"[['Everyone', 'Everyone 10+', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [108]:
print(f"secondary tuned xgb:\n"
      f"{grid_search_xgb2.best_params_}\n"
      f"secondary tuned rf:\n"
      f"{grid_search_rf2.best_params_}\n")

secondary tuned xgb:
{'model__learning_rate': 0.05, 'model__max_depth': 6, 'model__n_estimators': 300}
secondary tuned rf:
{'model__max_depth': 20, 'model__min_samples_leaf': 2, 'model__min_samples_split': 12, 'model__n_estimators': 600}



In [109]:
model_RF = RandomForestRegressor(n_estimators=600, max_depth=20, min_samples_split=12,
                                 min_samples_leaf=2, random_state=42)
model_XGB = XGBRegressor(n_estimators=300, max_depth=6, learning_rate=0.05, random_state=42)

In [110]:
pipeline_RF = Pipeline(steps=[
    ('cat_preproc', cat_transformer),
    ('model', model_RF)
])

pipeline_XGB = Pipeline(steps=[
    ('cat_preproc', cat_transformer),
    ('model', model_XGB)
])

In [111]:
kf = KFold(n_splits=5, shuffle=True)
scores_RF = -1 * cross_val_score(pipeline_RF, X, y, cv=kf,
                                 scoring='neg_root_mean_squared_error', n_jobs=-1)
scores_XGB = -1 * cross_val_score(pipeline_XGB, X, y, cv=kf,
                                  scoring='neg_root_mean_squared_error', n_jobs=-1)
print(f"SCORES:\n"
      f"RandomForest: {scores_RF}\n"
      f"XGBoost: {scores_XGB}\n"
      f"Mean RandomForest: {scores_RF.mean()}\n"
      f"Mean XGBoost: {scores_XGB.mean()}\n")

SCORES:
RandomForest: [3.45932741 3.46405854 3.45322081 3.35729297 3.33831077]
XGBoost: [3.37519284 3.45578998 3.47785796 3.32877167 3.32737786]
Mean RandomForest: 3.414442099505527
Mean XGBoost: 3.3929980629727288

