In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.cluster import KMeans
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor, plot_tree, export_graphviz
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import graphviz
import os
from IPython.display import Image


# Carregar o dataset
data = pd.read_csv('Data/Google-Playstore.csv', nrows=8000)

# Remover colunas indesejadas
data = data.drop(['App Name', 'App Id', 'Minimum Installs', 'Maximum Installs', 'Currency',
                  'Developer Id', 'Developer Website', 'Developer Email', 'Released',
                  'Last Updated', 'Privacy Policy', 'Scraped Time'], axis=1)

# Inspecionar os primeiros registros e verificar valores ausentes
print("Dados iniciais:")
print(data.head())
print(data.info())
print("Valores ausentes antes da imputação:")
print(data.isnull().sum())

# Remover linhas onde a variável alvo 'Installs' é nula
data = data.dropna(subset=['Installs'])

# Tratar a coluna 'Installs'
data['Installs'] = data['Installs'].str.replace('[+,]', '', regex=True)
data['Installs'] = pd.to_numeric(data['Installs'], errors='coerce').astype('Int64')

print("Exemplos de valores na coluna 'Installs' após a conversão:")
print(data['Installs'].head(10))

print("Valores ausentes após remoção de linhas com 'Installs' nulos:")
print(data.isnull().sum())

def handle_missing_values(df):
    df['Category'].fillna(df['Category'].mode()[0], inplace=True)
    df['Rating'].fillna(df['Rating'].mean(), inplace=True)
    df['Rating Count'].fillna(df['Rating Count'].mean(), inplace=True)
    df['Free'].fillna(True, inplace=True)
    df['Price'].fillna(0.0, inplace=True)
    df['Size'] = df['Size'].apply(size_to_mb)
    df['Size'].fillna(df['Size'].mean(), inplace=True)
    df['Minimum Android'].fillna(df['Minimum Android'].mode()[0], inplace=True)
    df['Content Rating'].fillna(df['Content Rating'].mode()[0], inplace=True)
    df['Ad Supported'].fillna(True, inplace=True)
    df['In App Purchases'].fillna(False, inplace=True)
    df['Editors Choice'].fillna(False, inplace=True)

def size_to_mb(size):
    if pd.isna(size):
        return np.nan
    if isinstance(size, str):
        if 'M' in size or 'm' in size:
            return float(size.replace('M', '').replace('m', '').replace(',', '.'))
        elif 'K' in size or 'k' in size:
            return float(size.replace('K', '').replace('k', '').replace(',', '.')) / 1024
        elif 'G' in size or 'g' in size:
            return float(size.replace('G', '').replace('g', '').replace(',', '.')) * 1024
    return np.nan

def parse_android_version(version):
    if pd.isna(version):
        return np.nan
    if 'Varies with device' in version:
        return np.nan
    if 'and up' in version:
        version = version.replace('and up', '').strip()
    if '-' in version:
        version = version.split('-')[0].strip()
    version = version.replace('W', '').strip()
    try:
        return float(version)
    except ValueError:
        return np.nan

handle_missing_values(data)
data['Minimum Android'] = data['Minimum Android'].apply(parse_android_version)
mean_android_version = data['Minimum Android'].mean()
data['Minimum Android'].fillna(mean_android_version, inplace=True)

data['Rating Count'] = data['Rating Count'].astype(int)
data['Minimum Android'] = [int(x * 10) / 10 for x in data['Minimum Android']]

label_encoder_category = LabelEncoder()
label_encoder_content_rating = LabelEncoder()
data['Category'] = label_encoder_category.fit_transform(data['Category'])
category_mapping = dict(zip(label_encoder_category.classes_, label_encoder_category.transform(label_encoder_category.classes_)))
data['Content Rating'] = label_encoder_content_rating.fit_transform(data['Content Rating'])
content_mapping = dict(zip(label_encoder_content_rating.classes_, label_encoder_content_rating.transform(label_encoder_content_rating.classes_)))

print("Content Mapping ", content_mapping, "\nCategory Mapping ", category_mapping)

# Preparação dos dados
X = data[['Category', 'Rating','Rating Count', 'Free','Price','Size','Minimum Android','Content Rating', 'Ad Supported','In App Purchases','Editors Choice']]
y = data['Installs']

# Aplicar PCA para redução de dimensionalidade
#pca = PCA(n_components=5, svd_solver='auto')
#X_reduced = pca.fit_transform(X)

# Dividir o dataset em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Função para avaliar o modelo
def evaluate_model(name, model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    print(f"Model: {name}")
    print(f"R^2: {r2:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"MSE: {mse:.4f}")
    print(f"RMSE: {rmse:.4f}\n")
    return model

print("Tamanho atual do dataframe:", data.shape)

models = {}

Dados iniciais:
        Category  Rating  Rating Count Installs  Free  Price  Size  \
0      Adventure     0.0           0.0      10+  True    0.0   10M   
1          Tools     4.4          64.0   5,000+  True    0.0  2.9M   
2   Productivity     0.0           0.0      50+  True    0.0  3.7M   
3  Communication     5.0           5.0      10+  True    0.0  1.8M   
4          Tools     0.0           0.0     100+  True    0.0  6.2M   

  Minimum Android Content Rating  Ad Supported  In App Purchases  \
0      7.1 and up       Everyone         False             False   
1      5.0 and up       Everyone          True             False   
2    4.0.3 and up       Everyone         False             False   
3    4.0.3 and up       Everyone          True             False   
4      4.1 and up       Everyone         False             False   

   Editors Choice  
0           False  
1           False  
2           False  
3           False  
4           False  
<class 'pandas.core.frame.DataFram

In [4]:

# Avaliar modelos e armazenar no dicionário
models['Linear Regression'] = evaluate_model('Linear Regression', LinearRegression(), X_train, y_train, X_test, y_test)

Model: Linear Regression
R^2: 0.7605
MAE: 122656.3307
MSE: 1608324472393.4888
RMSE: 1268197.3318



In [5]:

models['Ridge Regression'] = evaluate_model('Ridge Regression', Ridge(alpha=1.0), X_train, y_train, X_test, y_test)

Model: Ridge Regression
R^2: 0.7604
MAE: 122947.1963
MSE: 1608926612438.5813
RMSE: 1268434.7096



In [6]:

models['Lasso Regression'] = evaluate_model('Lasso Regression', Lasso(alpha=1.0), X_train, y_train, X_test, y_test)

Model: Lasso Regression
R^2: 0.7605
MAE: 122655.1847
MSE: 1608326282876.2549
RMSE: 1268198.0456



In [8]:

models['SVR (linear kernel)'] = evaluate_model('SVR (linear kernel)', SVR(kernel='linear'), X_train, y_train, X_test, y_test)

Model: SVR (linear kernel)
R^2: 0.6397
MAE: 100374.2236
MSE: 2419726810769.4692
RMSE: 1555547.1098



In [9]:

models['SVR (rbf kernel)'] = evaluate_model('SVR (rbf kernel)', SVR(kernel='rbf'), X_train, y_train, X_test, y_test)

Model: SVR (rbf kernel)
R^2: -0.0032
MAE: 146634.7710
MSE: 6736969995509.6279
RMSE: 2595567.3745



In [10]:

models['K-NN'] = evaluate_model('K-NN', KNeighborsRegressor(n_neighbors=2), X_train, y_train, X_test, y_test)

Model: K-NN
R^2: 0.6011
MAE: 112057.1084
MSE: 2678711175340.8379
RMSE: 1636676.8696



In [11]:

models['Decision Tree'] = evaluate_model('Decision Tree', DecisionTreeRegressor(), X_train, y_train, X_test, y_test)

Model: Decision Tree
R^2: 0.1738
MAE: 141167.3984
MSE: 5548672128032.6436
RMSE: 2355561.9559



In [13]:

models['Random Forest'] = evaluate_model('Random Forest', RandomForestRegressor(n_estimators=200), X_train, y_train, X_test, y_test)

Model: Random Forest
R^2: 0.5911
MAE: 117318.8304
MSE: 2745695534807.4575
RMSE: 1657014.0418



In [14]:

models['Neural Network (single layer)'] = evaluate_model('Neural Network (single layer)', MLPRegressor(hidden_layer_sizes=(10000,), max_iter=10000), X_train, y_train, X_test, y_test)

Model: Neural Network (single layer)
R^2: 0.7561
MAE: 95758.5170
MSE: 1638270207780.2100
RMSE: 1279949.2989



In [15]:

models['Neural Network (multi layer)'] = evaluate_model('Neural Network (multi layer)', MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=10000), X_train, y_train, X_test, y_test)

Model: Neural Network (multi layer)
R^2: 0.7587
MAE: 95983.7586
MSE: 1620665511536.3413
RMSE: 1273053.6169



In [16]:
'''

def preprocess_new_app(new_app):
    # Verificar se new_app é um DataFrame, se não, converter para DataFrame
    if not isinstance(new_app, pd.DataFrame):
        new_app = pd.DataFrame([new_app], columns=X.columns)

    # Preencher valores ausentes na nova aplicação
    handle_missing_values(new_app)

    # Tratar a coluna 'Size'
    new_app['Size'] = new_app['Size'].apply(size_to_mb)

    # Tratar a coluna 'Minimum Android'
    new_app['Minimum Android'] = new_app['Minimum Android'].apply(parse_android_version)
    new_app['Minimum Android'] = [int(x * 10) / 10 for x in new_app['Minimum Android']]

    # Codificar variáveis categóricas
    new_app['Category'] = new_app['Category'].apply(lambda x: category_mapping.get(x, -1))
    new_app['Content Rating'] = new_app['Content Rating'].apply(lambda x: content_mapping.get(x, -1))

    # Verificar se há valores não reconhecidos e substituí-los por uma categoria válida (opcional)
    new_app['Category'] = new_app['Category'].replace(-1, category_mapping['Tools'])
    new_app['Content Rating'] = new_app['Content Rating'].replace(-1, content_mapping['Everyone'])

    # Aplicar PCA para redução de dimensionalidade
    new_app_reduced = pca.transform(new_app)

    return new_app_reduced


def predict_installs(new_app_data):
    # Pré-processar os dados da nova aplicação
    new_app_processed = preprocess_new_app(new_app_data)

    # Previsões para cada modelo
    predictions = {}
    for model_name, model in models.items():
        prediction = model.predict(new_app_processed)
        predictions[model_name] = prediction[0]

    # Imprimir previsões
    for model_name, prediction in predictions.items():
        print(f"{model_name}: {prediction:.0f} installs")


# Exemplo de dados de uma nova aplicação
new_app_example = {
    'Category': 33,  # Exemplo de categoria
    'Rating': 4.3,  # Exemplo de avaliação
    'Rating Count': 241000,  # Exemplo de contagem de avaliações
    'Free': True,  # Se a aplicação é gratuita ou não
    'Price': 0.0,  # Preço da aplicação
    'Size': 3,  # Tamanho da aplicação
    'Minimum Android': 10,  # Versão mínima do Android
    'Content Rating': 0,  # Classificação de conteúdo
    'Ad Supported': False,  # Se a aplicação tem suporte a anúncios
    'In App Purchases': True,  # Se a aplicação tem compras no aplicativo
    'Editors Choice': True  # Se a aplicação é escolha do editor
}

# Converter exemplo de nova aplicação para DataFrame
new_app_df = pd.DataFrame([new_app_example])

# Prever o número de instalações para a nova aplicação
predict_installs(new_app_df)'''
import pandas as pd
#Carregar o ficheiro CSV da nova app
new_app_df = pd.read_csv('new_app_example.csv') 


#Remover colunas não presentes nos dados de treino
new_app_df = new_app_df[X.columns]

#Função para prever a variável alvo para uma nova app
def predict_new_app(model, new_app_df):
    # Prever a variável alvo
    prediction = model.predict(new_app_df)
    return prediction

#Fazer a previsão utilizando o pipeline treinado anteriormente

for name, model in models.items():
    prediction = predict_new_app(model, new_app_df)
    print(f"Previsão de 'Installs' com {name} para a nova app 'Gemini': {prediction[0]:.0f}")
#Imprimir tamanho do dataframe
print("Tamanho atual do dataframe:", data.shape)

Previsão de 'Installs' com Linear Regression para a nova app 'Gemini': 10572820
Previsão de 'Installs' com Ridge Regression para a nova app 'Gemini': 11276003
Previsão de 'Installs' com Lasso Regression para a nova app 'Gemini': 10574952
Previsão de 'Installs' com Logistic Regression para a nova app 'Gemini': 10000000
Previsão de 'Installs' com SVR (linear kernel) para a nova app 'Gemini': 8219700
Previsão de 'Installs' com SVR (rbf kernel) para a nova app 'Gemini': 578
Previsão de 'Installs' com K-NN para a nova app 'Gemini': 7500000
Previsão de 'Installs' com Decision Tree para a nova app 'Gemini': 10000000
Previsão de 'Installs' com K_Means para a nova app 'Gemini': 2
Previsão de 'Installs' com Random Forest para a nova app 'Gemini': 7420000
Previsão de 'Installs' com Neural Network (single layer) para a nova app 'Gemini': 12675174
Previsão de 'Installs' com Neural Network (multi layer) para a nova app 'Gemini': 13108617
Tamanho atual do dataframe: (7999, 12)
