In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.cluster import KMeans
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor, plot_tree, export_graphviz
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import graphviz
import os
from IPython.display import Image

# Carregar o dataset
data = pd.read_csv('Data/Google-Playstore.csv', nrows=8000)

# Remover colunas indesejadas
data = data.drop(['App Name', 'App Id', 'Minimum Installs', 'Maximum Installs', 'Currency',
                  'Developer Id', 'Developer Website', 'Developer Email', 'Released',
                  'Last Updated', 'Privacy Policy', 'Scraped Time'], axis=1)

# Inspecionar os primeiros registros e verificar valores ausentes
print("Dados iniciais:")
print(data.head())
print(data.info())
print("Valores ausentes antes da imputação:")
print(data.isnull().sum())

# Remover linhas onde a variável alvo 'Installs' é nula
data = data.dropna(subset=['Installs'])

# Tratar a coluna 'Installs'
data['Installs'] = data['Installs'].str.replace('[+,]', '', regex=True)
data['Installs'] = pd.to_numeric(data['Installs'], errors='coerce').astype('Int64')

print("Exemplos de valores na coluna 'Installs' após a conversão:")
print(data['Installs'].head(10))

print("Valores ausentes após remoção de linhas com 'Installs' nulos:")
print(data.isnull().sum())

def handle_missing_values(df):
    df['Category'].fillna(df['Category'].mode()[0], inplace=True)
    df['Rating'].fillna(df['Rating'].mean(), inplace=True)
    df['Rating Count'].fillna(df['Rating Count'].mean(), inplace=True)
    df['Free'].fillna(True, inplace=True)
    df['Price'].fillna(0.0, inplace=True)
    df['Size'] = df['Size'].apply(size_to_mb)
    df['Size'].fillna(df['Size'].mean(), inplace=True)
    df['Minimum Android'].fillna(df['Minimum Android'].mode()[0], inplace=True)
    df['Content Rating'].fillna(df['Content Rating'].mode()[0], inplace=True)
    df['Ad Supported'].fillna(True, inplace=True)
    df['In App Purchases'].fillna(False, inplace=True)
    df['Editors Choice'].fillna(False, inplace=True)

def size_to_mb(size):
    if pd.isna(size):
        return np.nan
    if isinstance(size, str):
        if 'M' in size or 'm' in size:
            return float(size.replace('M', '').replace('m', '').replace(',', '.'))
        elif 'K' in size or 'k' in size:
            return float(size.replace('K', '').replace('k', '').replace(',', '.')) / 1024
        elif 'G' in size or 'g' in size:
            return float(size.replace('G', '').replace('g', '').replace(',', '.')) * 1024
    return np.nan

def parse_android_version(version):
    if pd.isna(version):
        return np.nan
    if 'Varies with device' in version:
        return np.nan
    if 'and up' in version:
        version = version.replace('and up', '').strip()
    if '-' in version:
        version = version.split('-')[0].strip()
    version = version.replace('W', '').strip()
    try:
        return float(version)
    except ValueError:
        return np.nan

handle_missing_values(data)
data['Minimum Android'] = data['Minimum Android'].apply(parse_android_version)
mean_android_version = data['Minimum Android'].mean()
data['Minimum Android'].fillna(mean_android_version, inplace=True)

data['Rating Count'] = data['Rating Count'].astype(int)
data['Minimum Android'] = [int(x * 10) / 10 for x in data['Minimum Android']]

label_encoder = LabelEncoder()
data['Category'] = label_encoder.fit_transform(data['Category'])
category_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
data['Content Rating'] = label_encoder.fit_transform(data['Content Rating'])
content_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

print("Content Mapping ", content_mapping, "\nCategory Mapping ", category_mapping)

# Preparação dos dados
X = data[['Category', 'Rating','Rating Count', 'Free','Price','Size','Minimum Android','Content Rating', 'Ad Supported','In App Purchases','Editors Choice']]
y = data['Installs']

# Aplicar PCA para redução de dimensionalidade
pca = PCA(n_components=5, svd_solver='auto')
X_reduced = pca.fit_transform(X)

# Função para avaliar o modelo
def evaluate_model_cv(name, model, X, y):
    scores = cross_val_score(model, X, y, cv=5, scoring='r2')
    print(f"Model: {name}")
    print(f"R^2 scores: {scores}")
    print(f"Mean R^2: {scores.mean():.4f}")
    print(f"Standard Deviation of R^2: {scores.std():.4f}\n")

print("Tamanho atual do dataframe:", data.shape)



Dados iniciais:
        Category  Rating  Rating Count Installs  Free  Price  Size  \
0      Adventure     0.0           0.0      10+  True    0.0   10M   
1          Tools     4.4          64.0   5,000+  True    0.0  2.9M   
2   Productivity     0.0           0.0      50+  True    0.0  3.7M   
3  Communication     5.0           5.0      10+  True    0.0  1.8M   
4          Tools     0.0           0.0     100+  True    0.0  6.2M   

  Minimum Android Content Rating  Ad Supported  In App Purchases  \
0      7.1 and up       Everyone         False             False   
1      5.0 and up       Everyone          True             False   
2    4.0.3 and up       Everyone         False             False   
3    4.0.3 and up       Everyone          True             False   
4      4.1 and up       Everyone         False             False   

   Editors Choice  
0           False  
1           False  
2           False  
3           False  
4           False  
<class 'pandas.core.frame.DataFram

In [2]:
# Avaliar modelos com Cross-Validation
evaluate_model_cv('Linear Regression', LinearRegression(), X_reduced, y)

Model: Linear Regression
R^2 scores: [ 0.76497533  0.6647855  -0.47808262 -4.53693337  0.63532503]
Mean R^2: -0.5900
Standard Deviation of R^2: 2.0250


In [3]:
evaluate_model_cv('Ridge Regression', Ridge(alpha=1.0), X_reduced, y)

Model: Ridge Regression
R^2 scores: [ 0.76497534  0.66478551 -0.4780826  -4.53693336  0.63532506]
Mean R^2: -0.5900
Standard Deviation of R^2: 2.0250


In [4]:
evaluate_model_cv('Lasso Regression', Lasso(alpha=1.0), X_reduced, y)

Model: Lasso Regression
R^2 scores: [ 0.76497534  0.66478551 -0.47808256 -4.53693332  0.63532509]
Mean R^2: -0.5900
Standard Deviation of R^2: 2.0250


In [None]:
evaluate_model_cv('SVR (linear kernel)', SVR(kernel='linear'), X_reduced, y)

In [None]:
evaluate_model_cv('SVR (rbf kernel)', SVR(kernel='rbf'), X_reduced, y)

In [6]:
evaluate_model_cv('K-NN', KNeighborsRegressor(n_neighbors=2), X_reduced, y)

Model: K-NN
R^2 scores: [ 0.75579882  0.16775175  0.51252603 -3.07452298  0.34924966]
Mean R^2: -0.2578
Standard Deviation of R^2: 1.4216


In [7]:
evaluate_model_cv('Decision Tree', DecisionTreeRegressor(), X_reduced, y)

Model: Decision Tree
R^2 scores: [ 0.95492421  0.16604907  0.24814682  0.2940182  -0.6024879 ]
Mean R^2: 0.2121
Standard Deviation of R^2: 0.4951


In [9]:
evaluate_model_cv('Random Forest', RandomForestRegressor(n_estimators=200), X_reduced, y)

Model: Random Forest
R^2 scores: [0.87735871 0.49560812 0.52414925 0.15851244 0.49881952]
Mean R^2: 0.5109
Standard Deviation of R^2: 0.2276


In [None]:
evaluate_model_cv('Neural Network (single layer)', MLPRegressor(hidden_layer_sizes=(10000,), max_iter=10000), X_reduced, y)



In [0]:
evaluate_model_cv('Neural Network (multi layer)', MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=10000), X_reduced, y)
