In [3]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [4]:
df1 = pd.read_csv("../data/playstore_cleaned (2).csv")
df2 = pd.read_csv("../data/playstore_cleaned_recentyear.csv")

In [5]:
df1.columns, df2.columns

(Index(['Unnamed: 0', 'App', 'Category', 'Rating', 'Reviews', 'Size',
        'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Current Ver',
        'Android Ver', 'Day', 'month', 'year'],
       dtype='object'),
 Index(['Unnamed: 0', 'App', 'Category', 'Rating', 'Reviews', 'Size',
        'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Current Ver',
        'Android Ver', 'Day', 'month', 'year'],
       dtype='object'))

In [6]:
#df2 = df2.rename(columns={"Size (KB)": "Size"})

In [7]:
df = pd.concat([df1, df2], ignore_index=True)

In [8]:
df

Unnamed: 0.1,Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Current Ver,Android Ver,Day,month,year
0,0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19000.0,10000,Free,0.0,Everyone,Art & Design,1.0.0,4.0.3 and up,7,1,2018
1,1,Coloring book moana,ART_AND_DESIGN,3.9,967,14000.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,2.0.0,4.0.3 and up,15,1,2018
2,2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8700.0,5000000,Free,0.0,Everyone,Art & Design,1.2.4,4.0.3 and up,1,8,2018
3,3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25000.0,50000000,Free,0.0,Teen,Art & Design,Varies with device,4.2 and up,8,6,2018
4,4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2800.0,100000,Free,0.0,Everyone,Art & Design;Creativity,1.1,4.4 and up,20,6,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11781,11886,Canva,ART_AND_DESIGN,4.7,12000000,40000.0,100000000,Free,0.0,Everyone,Art & Design,Varies with device,5.0 and up,22,8,2020
11782,11887,Sketchbook,ART_AND_DESIGN,4.3,800000,90000.0,50000000,Free,0.0,Everyone,Art & Design,Varies with device,5.0 and up,1,1,2020
11783,11888,MediBang Paint,ART_AND_DESIGN,4.4,500000,60000.0,10000000,Free,0.0,Everyone,Art & Design,25.3,5.0 and up,1,1,2020
11784,11889,ibis Paint X,ART_AND_DESIGN,4.6,3000000,70000.0,100000000,Free,0.0,Everyone,Art & Design,10.0.6,4.4 and up,1,1,2020


In [9]:
df.drop(axis=1, columns=["Unnamed: 0", "Type", "App", "Genres",
                         "Current Ver", "Android Ver", "Day", "month"],
        inplace=True)

In [10]:
df.head()

Unnamed: 0,Category,Rating,Reviews,Size,Installs,Price,Content Rating,year
0,ART_AND_DESIGN,4.1,159,19000.0,10000,0.0,Everyone,2018
1,ART_AND_DESIGN,3.9,967,14000.0,500000,0.0,Everyone,2018
2,ART_AND_DESIGN,4.7,87510,8700.0,5000000,0.0,Everyone,2018
3,ART_AND_DESIGN,4.5,215644,25000.0,50000000,0.0,Teen,2018
4,ART_AND_DESIGN,4.3,967,2800.0,100000,0.0,Everyone,2018


The App Success Score combines rating and reviews as Rating × log(1 + Reviews) to fairly capture both quality and popularity, with a log transform compressing large review counts and safely handling zeros.


## Log Transformation

The `log(1 + x)` function is used to compress large numerical values and handle zeros safely.
It is especially useful for features like review counts, where values can range from 0 to millions.
By applying `log(1 + x)`, small values remain meaningful, huge values are compressed, and zero values do not cause errors, making the data more suitable for modeling.



In [11]:
df["Score"] = df["Rating"] * np.log(1 + df["Reviews"])

In [12]:
df.drop(axis=1, columns=["Rating", "Reviews"], inplace=True)

In [13]:
df.tail()

Unnamed: 0,Category,Size,Installs,Price,Content Rating,year,Score
11781,ART_AND_DESIGN,40000.0,100000000,0.0,Everyone,2020,76.611961
11782,ART_AND_DESIGN,90000.0,50000000,0.0,Everyone,2020,58.447184
11783,ART_AND_DESIGN,60000.0,10000000,0.0,Everyone,2020,57.738408
11784,ART_AND_DESIGN,70000.0,100000000,0.0,Everyone,2020,68.604967
11785,ART_AND_DESIGN,40000.0,5000000,0.0,Everyone,2020,49.505622


In [14]:
df.shape

(11786, 7)

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11786 entries, 0 to 11785
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Category        11786 non-null  object 
 1   Size            11786 non-null  float64
 2   Installs        11786 non-null  int64  
 3   Price           11786 non-null  float64
 4   Content Rating  11786 non-null  object 
 5   year            11786 non-null  int64  
 6   Score           11786 non-null  float64
dtypes: float64(3), int64(2), object(2)
memory usage: 644.7+ KB


**An app's Score (its weighted rating) is one of the most important factors related to its success. But it's not a cause of success that you can know beforehand; it is a result of success. Also only already realsed apps will have this info so the user will not be able to give this parameter.**

In [16]:
df.drop(axis=1, columns=["Score"], inplace=True)

In [17]:
df.head()

Unnamed: 0,Category,Size,Installs,Price,Content Rating,year
0,ART_AND_DESIGN,19000.0,10000,0.0,Everyone,2018
1,ART_AND_DESIGN,14000.0,500000,0.0,Everyone,2018
2,ART_AND_DESIGN,8700.0,5000000,0.0,Everyone,2018
3,ART_AND_DESIGN,25000.0,50000000,0.0,Teen,2018
4,ART_AND_DESIGN,2800.0,100000,0.0,Everyone,2018


In [18]:
df.to_csv("../data/playstore_partially_preprocessed.csv", index=False)

In [19]:
value_counts_category = df["Category"].value_counts()

Putting infrequent element to a single section others to reduce the number of unique elements elements in Category col to improve the training process of model when using OHE.,

In [20]:
mask_for_infrequents = df["Category"].map(value_counts_category) < 11

In [21]:
df.loc[mask_for_infrequents, "Category"] = "OTHERS"

In [22]:
df["Category"].value_counts()

Category
FAMILY                 1975
GAME                   1144
TOOLS                   913
PRODUCTIVITY            521
BUSINESS                467
MEDICAL                 463
COMMUNICATION           428
PERSONALIZATION         406
LIFESTYLE               391
FINANCE                 391
SPORTS                  387
HEALTH_AND_FITNESS      370
PHOTOGRAPHY             362
SOCIAL                  329
NEWS_AND_MAGAZINES      308
SHOPPING                282
TRAVEL_AND_LOCAL        277
BOOKS_AND_REFERENCE     253
DATING                  246
VIDEO_PLAYERS           202
ENTERTAINMENT           187
EDUCATION               170
MAPS_AND_NAVIGATION     146
FOOD_AND_DRINK          146
GAME_ACTION             105
ART_AND_DESIGN           96
HOUSE_AND_HOME           88
LIBRARIES_AND_DEMO       86
AUTO_AND_VEHICLES        85
GAME_ROLE_PLAYING        82
WEATHER                  82
EVENTS                   64
COMICS                   63
PARENTING                60
BEAUTY                   53
MUSIC_AND_A

In [23]:
val_count_rating = df["Content Rating"].value_counts()
val_count_rating

Content Rating
Everyone           9199
Teen               1528
Mature 17+          613
Everyone 10+        441
Adults only 18+       3
Unrated               2
Name: count, dtype: int64

In [24]:
mask_for_rating = df["Content Rating"].map(val_count_rating) < 4
df.loc[mask_for_rating, "Content Rating"] = "Unrated"
df["Content Rating"].value_counts()

Content Rating
Everyone        9199
Teen            1528
Mature 17+       613
Everyone 10+     441
Unrated            5
Name: count, dtype: int64

In [25]:
ord_encoder = OrdinalEncoder(categories=[["Everyone", "Everyone 10+",
                             "Teen", "Mature 17+", "Unrated"]], handle_unknown="use_encoded_value", unknown_value=-1)
oh_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

In [26]:
cat_transformer = ColumnTransformer(transformers=[
     ("ord", ord_encoder, ["Content Rating"]),
     ("ohe", oh_encoder, ["Category"])
])

In [27]:
random_forest_model = RandomForestRegressor(random_state=42)
xgb_model = XGBRegressor(random_state=42, n_estimators=1000, learning_rate=0.05)

In [28]:
pipeline_rf = Pipeline(steps=[
    ('cat_preproc', cat_transformer),
    ('model', random_forest_model)
])

pipeline_xgb = Pipeline(steps=[
    ('cat_preproc', cat_transformer),
    ('model', xgb_model)
])

**log transformation is used with install because install falls into a very large and diverse range (skewed) so the model predicts relatively larger value for even very smaller ones as the larger numbers in the target predominates.**

In [29]:
X = df.drop(axis=1, columns=["Installs"])
y = df["Installs"].map(np.log1p)

In [32]:
scores_XGB = []
best_iters = []
for _ in range(5):
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
    pipeline_xgb.fit(X_train, y_train, model__eval_set=[(X_valid, y_valid)],
                     model__early_stopping_rounds=5,
                     model__verbose=False )
    best_iters.append(pipeline_xgb.named_steps["model"].best_iteration)
    y_preds = pipeline_xgb.predict(X_valid)
    rmse = np.sqrt(mean_squared_error(y_valid, y_preds))
    scores_XGB.append(rmse)



TypeError: XGBModel.fit() got an unexpected keyword argument 'early_stopping_rounds'

In [58]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores_rf = -1 * cross_val_score(pipeline_rf, X, y, cv=kf, scoring='neg_root_mean_squared_error')
scores_xgb = -1 * cross_val_score(pipeline_xgb, X, y, cv=kf, scoring='neg_root_mean_squared_error')
print(f"SCORES:\n"
      f"RandomForest: {scores_rf}\n"
      f"XGBoost: {scores_xgb}\n"
      f"Mean RandomForest: {scores_rf.mean()}\n"
      f"Mean XGBoost: {scores_xgb.mean()}\n")

SCORES:
RandomForest: [4.23573025 4.2128512  4.17147608 4.13006015 4.21885539]
XGBoost: [4.2372487  4.21052591 4.17606342 4.12918141 4.22188216]
Mean RandomForest: 4.193794614902709
Mean XGBoost: 4.194980319457544

