# **Анализ данных Google Play Store Apps**
## Подготовка данных

In [1]:
import kagglehub
import numpy as np
import pandas as pd
from sklearn.metrics import make_scorer, mean_absolute_error, r2_score, mean_squared_error
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
path = kagglehub.dataset_download("gauthamp10/google-playstore-apps")

print("Path to dataset files:", f"{path}\\Google-Playstore.csv")

Path to dataset files: C:\Users\Frister\.cache\kagglehub\datasets\gauthamp10\google-playstore-apps\versions\7\Google-Playstore.csv


In [3]:
full_data = pd.read_csv(path + "\\Google-Playstore.csv")

In [4]:
def parse_size(val):
    if pd.isna(val):
        return np.nan
    if isinstance(val, str):
        if val.endswith('M'):
            return float(val[:-1].replace(',', ''))
        if val.endswith('G'):
            return float(val[:-1].replace(',', '')) * 1000.0
        if val.endswith('k'):
            v = val[:-1].replace(',', '')
            try:
                return float(v) / 1000.0
            except:
                return np.nan
    return np.nan

df = full_data.drop_duplicates().dropna(subset=['Rating', 'Category', 'Installs']).copy()
df['Installs'] = df['Installs'].str.replace(r'[+,]', '', regex=True).astype(int)

df['Size'] = df['Size'].apply(parse_size)
df = df.dropna(subset=['Size'])

In [5]:
df.head()

Unnamed: 0,App Name,App Id,Category,Rating,Rating Count,Installs,Minimum Installs,Maximum Installs,Free,Price,...,Developer Website,Developer Email,Released,Last Updated,Content Rating,Privacy Policy,Ad Supported,In App Purchases,Editors Choice,Scraped Time
0,Gakondo,com.ishakwe.gakondo,Adventure,0.0,0.0,10,10.0,15,True,0.0,...,https://beniyizibyose.tk/#/,jean21101999@gmail.com,"Feb 26, 2020","Feb 26, 2020",Everyone,https://beniyizibyose.tk/projects/,False,False,False,2021-06-15 20:19:35
1,Ampere Battery Info,com.webserveis.batteryinfo,Tools,4.4,64.0,5000,5000.0,7662,True,0.0,...,https://webserveis.netlify.app/,webserveis@gmail.com,"May 21, 2020","May 06, 2021",Everyone,https://dev4phones.wordpress.com/licencia-de-uso/,True,False,False,2021-06-15 20:19:35
2,Vibook,com.doantiepvien.crm,Productivity,0.0,0.0,50,50.0,58,True,0.0,...,,vnacrewit@gmail.com,"Aug 9, 2019","Aug 19, 2019",Everyone,https://www.vietnamairlines.com/vn/en/terms-an...,False,False,False,2021-06-15 20:19:35
3,Smart City Trichy Public Service Vehicles 17UC...,cst.stJoseph.ug17ucs548,Communication,5.0,5.0,10,10.0,19,True,0.0,...,http://www.climatesmarttech.com/,climatesmarttech2@gmail.com,"Sep 10, 2018","Oct 13, 2018",Everyone,,True,False,False,2021-06-15 20:19:35
4,GROW.me,com.horodyski.grower,Tools,0.0,0.0,100,100.0,478,True,0.0,...,http://www.horodyski.com.pl,rmilekhorodyski@gmail.com,"Feb 21, 2020","Nov 12, 2018",Everyone,http://www.horodyski.com.pl,False,False,False,2021-06-15 20:19:35


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2237013 entries, 0 to 2312943
Data columns (total 24 columns):
 #   Column             Dtype  
---  ------             -----  
 0   App Name           object 
 1   App Id             object 
 2   Category           object 
 3   Rating             float64
 4   Rating Count       float64
 5   Installs           int64  
 6   Minimum Installs   float64
 7   Maximum Installs   int64  
 8   Free               bool   
 9   Price              float64
 10  Currency           object 
 11  Size               float64
 12  Minimum Android    object 
 13  Developer Id       object 
 14  Developer Website  object 
 15  Developer Email    object 
 16  Released           object 
 17  Last Updated       object 
 18  Content Rating     object 
 19  Privacy Policy     object 
 20  Ad Supported       bool   
 21  In App Purchases   bool   
 22  Editors Choice     bool   
 23  Scraped Time       object 
dtypes: bool(4), float64(5), int64(2), object(13)
memory usa

## Предсказание рейтинга (регрессия)

In [7]:
features = ['Free', 'Size', 'Installs', 'Price', 'Release_Year'] + \
           [col for col in df.columns if col.startswith('Category_') or col.startswith('Content Rating_')]
data = df[df['Rating'] > 0].copy()
data = pd.get_dummies(data, columns=['Category', 'Content Rating'], drop_first=True)
bool_cols = ['Free', 'Ad Supported', 'In App Purchases', 'Editors Choice']
for col in bool_cols:
    data[col] = data[col].astype(int)
data['Released'] = pd.to_datetime(data['Released'], errors='coerce')
data['Release_Year'] = data['Released'].dt.year
data.reset_index()

X = data[features]
y = data['Rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = XGBRegressor(tree_method='hist', random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

y_pred = np.clip(y_pred, 1, 5)

print(f"MAE: {mean_absolute_error(y_test, y_pred):.4f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")
print(f"R2: {r2_score(y_test, y_pred):.4f}")

MAE: 0.4946
RMSE: 0.6521
R2: 0.1080


In [11]:
model = XGBRegressor(tree_method='hist', random_state=42)

mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
rmse_scorer = make_scorer(lambda y, y_pred: np.sqrt(mean_squared_error(y, y_pred)), greater_is_better=False)
r2_scorer = make_scorer(r2_score)

cv = KFold(n_splits=5, shuffle=True, random_state=42)

mae_scores = cross_val_score(model, X, y, cv=cv, scoring=mae_scorer, n_jobs=-1)
rmse_scores = cross_val_score(model, X, y, cv=cv, scoring=rmse_scorer, n_jobs=-1)
r2_scores = cross_val_score(model, X, y, cv=cv, scoring=r2_scorer, n_jobs=-1)

print(f"MAE:  {-np.mean(mae_scores):.4f} ± {np.std(mae_scores):.4f}")
print(f"RMSE: {-np.mean(rmse_scores):.4f} ± {np.std(rmse_scores):.4f}")
print(f"R2:    {np.mean(r2_scores):.4f} ± {np.std(r2_scores):.4f}")

MAE:  0.4948 ± 0.0007
RMSE: 0.6528 ± 0.0013
R2:    0.1067 ± 0.0008
