In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [43]:
# Load the cleaned trends dataset from its path relative to this notebook.
csv_path = '../../data/clean/trends-clean.csv'
df = pd.read_csv(csv_path)
# Bare last expression: the notebook renders a preview of the frame.
df

Unnamed: 0,Date,Platform,Daily Active Users (DAU),New Registrations,Session Duration (minutes),In-game Purchases ($),Social Media Mentions,Stream Viewership,Revenue ($),Top Genre,Influencer Endorsements
0,2022-04-13,PC,142204,14220,31.266635,60772.724060,7116,85079.542766,87378.407590,Action,0
1,2021-03-17,PC,132314,13231,23.314436,43499.834450,6582,64188.420576,41132.113751,Action,0
2,2019-03-09,VR,111557,11155,33.859334,52178.791314,5572,34385.665788,71787.342041,Simulation,0
3,2021-03-31,Console,132909,13290,80.580010,25085.562444,6619,43129.915707,18639.641156,RPG,0
4,2021-02-04,Mobile,130694,13069,37.270903,55277.074439,6555,69341.389827,63849.959750,Action,0
...,...,...,...,...,...,...,...,...,...,...,...
4984,2020-11-28,VR,129812,12981,21.665522,89255.466697,2756,22293.209826,149864.021675,RPG,0
4985,2020-11-29,VR,143326,14332,79.158413,94773.819233,8878,97013.445923,50820.429947,RPG,0
4986,2020-11-30,Mobile,37056,3705,38.886782,96692.331823,7357,80238.163032,58049.318759,Adventure,0
4987,2020-12-01,Console,65306,6530,25.189762,66163.588432,3399,56842.064818,97370.539505,FPS,0


In [44]:
# Count the missing values in each column (isna is the same check as isnull).
null = df.isna().sum()
print(null)

Date                          0
Platform                      0
Daily Active Users (DAU)      0
New Registrations             0
Session Duration (minutes)    0
In-game Purchases ($)         0
Social Media Mentions         0
Stream Viewership             0
Revenue ($)                   0
Top Genre                     0
Influencer Endorsements       0
dtype: int64


In [45]:
# Report the dtype pandas assigned to each column of the loaded frame.
column_dtypes = df.dtypes
print(column_dtypes)

Date                           object
Platform                       object
Daily Active Users (DAU)        int64
New Registrations               int64
Session Duration (minutes)    float64
In-game Purchases ($)         float64
Social Media Mentions           int64
Stream Viewership             float64
Revenue ($)                   float64
Top Genre                      object
Influencer Endorsements         int64
dtype: object


## Predicción 

In [51]:
from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
# from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn import tree


In [47]:
# Columns to convert into numeric codes.
# NOTE(review): "Date" is encoded here like any other category (each distinct
# value gets its own code) — confirm this is intended rather than parsing it
# as a real date.
features = ["Top Genre", "Platform", "Date"]

# Build the ordinal encoder; values not seen while fitting are encoded as -1.
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
encoder.fit(df[features])

# Overwrite the selected columns of the main dataframe with their codes.
# This mutates `df` in place, which the later modelling cell relies on.
encoded_values = encoder.transform(df[features])
df[features] = encoded_values

# Print the first rows to confirm the result.
print(df[features].head())


   Top Genre  Platform    Date
0        0.0       2.0  4165.0
1        0.0       2.0  4020.0
2        4.0       3.0  3354.0
3        3.0       0.0  4028.0
4        0.0       1.0  4011.0


In [48]:
# Re-check the column dtypes now that the encoded columns have been written back.
encoded_dtypes = df.dtypes
print(encoded_dtypes)

Date                          float64
Platform                      float64
Daily Active Users (DAU)        int64
New Registrations               int64
Session Duration (minutes)    float64
In-game Purchases ($)         float64
Social Media Mentions           int64
Stream Viewership             float64
Revenue ($)                   float64
Top Genre                     float64
Influencer Endorsements         int64
dtype: object


In [53]:
# Single seed shared by the split and every estimator so that a fresh
# Restart & Run All reproduces the same metrics.
SEED = 42

# Features are every column except the target; the target is the revenue column.
X = df.drop(columns='Revenue ($)')
y = df['Revenue ($)']

# Hold out 30% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED)

# Candidate regressors, keyed by the label printed in the report below.
models = {
    # 'Linear Regression': LinearRegression(),
    # 'Lasso Regression': Lasso(alpha=0.1),
    # 'Ridge Regression': Ridge(alpha=1.0),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=SEED),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=SEED),
    # Seeded like the ensembles: an unseeded tree permutes features at each
    # split, so ties between equally good splits make its scores drift
    # from run to run.
    'Decision Tree': tree.DecisionTreeRegressor(random_state=SEED)
}

# Fit each model on the training split and report its test-set metrics.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    print(f"\n========== {name} ==========")
    print(f'Mean Squared Error (MSE): {mse}')
    print(f'R^2: {r2}')
    print(f'Mean Absolute Error (MAE): {mae}')


Mean Squared Error (MSE): 1559397652.0621288
R^2: 0.12483940168160668
Mean Absolute Error (MAE): 32312.79873222028

Mean Squared Error (MSE): 1552521922.1943724
R^2: 0.1286981787273358
Mean Absolute Error (MAE): 32944.35790013002

Mean Squared Error (MSE): 3118400814.658313
R^2: -0.7500998024102465
Mean Absolute Error (MAE): 43003.824825095886
