In [5]:
from ast import literal_eval

import pandas as pd
import numpy as np
import joblib

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import funciones_auxiliares as fa


In [6]:
rows = []
with open("../dataset/steam_games.json") as f:
    for line in f.readlines():
        rows.append(literal_eval(line))

df = pd.DataFrame(rows)

In [7]:
df.columns

Index(['publisher', 'genres', 'app_name', 'title', 'url', 'release_date',
       'tags', 'discount_price', 'reviews_url', 'specs', 'price',
       'early_access', 'id', 'developer', 'sentiment', 'metascore'],
      dtype='object')

In [8]:
# me quedo con las columnas que me interasan para le modelo
df_predict = df[["genres","early_access","price"]]
df_predict.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32135 entries, 0 to 32134
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   genres        28852 non-null  object
 1   early_access  32135 non-null  bool  
 2   price         30758 non-null  object
dtypes: bool(1), object(2)
memory usage: 533.6+ KB


In [9]:
# viendo los nulos
print("Generos nulos:",df_predict["genres"].isna().sum())
print("Early access nulos",df_predict["early_access"].isna().sum())
print("Precios nulos",df_predict["price"].isna().sum())

Generos nulos: 3283
Early access nulos 0
Precios nulos 1377


In [10]:
generos = list(df_predict["genres"])
generos =  fa.aplanar_lista(generos)
generos_set = set(generos)
print(f"Hay {len(generos_set)} generos distintos")

Hay 23 generos distintos


In [11]:
# Dejamos en 0 los valores que contengan un string en la columna precio
df_predict["price"] = df_predict["price"].apply(lambda x: 0 if isinstance(x, str) else x)

df_predict.dropna(subset=["price"],inplace=True)


#df_predict['price'] = df_predict["price"].apply(lambda x: float(x) if isinstance(x, (int, float)) else float('nan'))
#df_predict["price"].astype("float")
#df_predict["price"].dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_predict["price"] = df_predict["price"].apply(lambda x: 0 if isinstance(x, str) else x)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_predict.dropna(subset=["price"],inplace=True)


In [12]:
# Rellenamos los nulos de la columna con el promedio.
precio_medio = df_predict["price"].mean()
df_predict["price"].fillna(precio_medio,inplace=True)
df_predict["price"].describe()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_predict["price"].fillna(precio_medio,inplace=True)


count    30758.000000
mean         8.866855
std         15.903457
min          0.000000
25%          2.990000
50%          4.990000
75%          9.990000
max        995.000000
Name: price, dtype: float64

In [13]:
# Debido a que el estandar en la industria el precio maximo de un videojuego es de 60 dolares, descartamos los valores por encima de este
filtro_outliers = df_predict["price"] <= 60
df_predict = df_predict[filtro_outliers]

In [14]:
# Transformación del género utilizando one-hot enconder
generos_dummies = df_predict["genres"].str.join(",").str.get_dummies(sep=",")
df_predict= pd.concat([df_predict, generos_dummies], axis=1)

In [15]:
# Eliminamos columnas innecesarias
df_predict.drop(columns=["genres"],inplace=True)
df_predict.drop(columns=["Early Access"],inplace=True)
df_predict.columns

Index(['early_access', 'price', 'Accounting', 'Action', 'Adventure',
       'Animation &amp; Modeling', 'Audio Production', 'Casual',
       'Design &amp; Illustration', 'Education', 'Free to Play', 'Indie',
       'Massively Multiplayer', 'Photo Editing', 'RPG', 'Racing', 'Simulation',
       'Software Training', 'Sports', 'Strategy', 'Utilities',
       'Video Production', 'Web Publishing'],
      dtype='object')

In [16]:
# Corregimos nombres de columnas
df_predict.rename(columns={"Animation &amp; Modeling":"Animation and Modeling","Design &amp; Illustration":"Design and Illustration"}, inplace=True)
df_predict.columns

Index(['early_access', 'price', 'Accounting', 'Action', 'Adventure',
       'Animation and Modeling', 'Audio Production', 'Casual',
       'Design and Illustration', 'Education', 'Free to Play', 'Indie',
       'Massively Multiplayer', 'Photo Editing', 'RPG', 'Racing', 'Simulation',
       'Software Training', 'Sports', 'Strategy', 'Utilities',
       'Video Production', 'Web Publishing'],
      dtype='object')

In [17]:
X = df_predict.drop(columns=['price'])
y = df_predict["price"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
# Instanciamos el modelo
model = LinearRegression()

In [19]:
# Entrenamos el modelo
model.fit(X_train,y_train)
y_pred = model.predict(X_test)


In [20]:
# Calculamos el error cuadratico medio para medir nuestro modelo
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE:", rmse) 
# 8.424571637950317

RMSE: 8.424571637950317


In [21]:
# Exportamos el modelo
joblib.dump(model,"modelo_precio_videojuego.pkl")

['modelo_precio_videojuego.pkl']