# Se importan las librerías necesarias para construir el modelo de predicción

In [398]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from ast import literal_eval
from sklearn.preprocessing import OneHotEncoder
import joblib
from category_encoders import BinaryEncoder

# Se procede a limpiar los datos para crear un modelo de regresión lineal

In [399]:
# Read the dataset one line at a time. Each line is parsed with
# ast.literal_eval (a Python literal, not json.loads), giving one record per line.
with open("Data/steam_games.json") as f:
    rows = [literal_eval(raw_line) for raw_line in f]

# Build the raw DataFrame from all parsed records in a single call.
df = pd.DataFrame(rows)

In [400]:
# List the column labels of the raw frame.
df.columns

Index(['publisher', 'genres', 'app_name', 'title', 'url', 'release_date',
       'tags', 'discount_price', 'reviews_url', 'specs', 'price',
       'early_access', 'id', 'developer', 'sentiment', 'metascore'],
      dtype='object')

In [401]:
# Summarise the dtype and non-null count of every column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32135 entries, 0 to 32134
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   publisher       24083 non-null  object 
 1   genres          28852 non-null  object 
 2   app_name        32133 non-null  object 
 3   title           30085 non-null  object 
 4   url             32135 non-null  object 
 5   release_date    30068 non-null  object 
 6   tags            31972 non-null  object 
 7   discount_price  225 non-null    float64
 8   reviews_url     32133 non-null  object 
 9   specs           31465 non-null  object 
 10  price           30758 non-null  object 
 11  early_access    32135 non-null  bool   
 12  id              32133 non-null  object 
 13  developer       28836 non-null  object 
 14  sentiment       24953 non-null  object 
 15  metascore       2677 non-null   object 
dtypes: bool(1), float64(1), object(14)
memory usage: 3.7+ MB


In [402]:
# Load a CSV version of the dataset into a second frame; the cells below use
# it for the correlation analysis.
# NOTE(review): this notebook does not show how 'steam_games.csv' is produced — confirm its origin.
df2 = pd.read_csv('steam_games.csv')

# Como primer paso, se ve la correlación entre variables

In [403]:
# Coerce 'price' to numeric: values that cannot be parsed become NaN, and every
# NaN (including prices that were missing to begin with) is then replaced by 0.
# NOTE(review): this makes unknown prices indistinguishable from free games — confirm that is intended.
df["price"] = pd.to_numeric(df["price"], errors='coerce').fillna(0)

Convertir columnas no numéricas a valores numéricos

In [404]:
# Encode every object-dtype column of df2 as integer codes so DataFrame.corr()
# can include it. The loop walks df2's own columns: iterating df.columns (as
# before) raises KeyError for any column that exists only in df and silently
# skips columns that exist only in df2.
# NOTE(review): factorize codes follow order of first appearance, so the
# correlations of nominal columns should be read with care.
for column in df2.columns:
    if df2[column].dtype == 'object':
        # pd.factorize returns (codes, uniques); missing values get code -1.
        df2[column] = pd.factorize(df2[column])[0]

Calcular la correlación entre las variables

In [405]:
# Pairwise correlation between the columns of df2 (pandas' default method is Pearson).
correlation_matrix = df2.corr()

Ver la correlación con respecto a la columna 'price'

In [406]:
# Keep each column's correlation with 'price', dropping the trivial
# self-correlation entry, and display the result.
correlation_with_price = correlation_matrix['price'].drop('price')
correlation_with_price

publisher        -0.026260
genres            0.058545
app_name          0.005265
title             0.005265
url               0.005134
release_date      0.016772
tags              0.034069
discount_price   -0.310974
reviews_url       0.005042
specs             0.027696
early_access      0.018160
id                0.005083
developer        -0.026611
sentiment         0.002937
metascore         0.100500
Year              0.037676
Name: price, dtype: float64

Se crea un nuevo dataframe que contenga las columnas que se consideran más relevantes para el modelo y la columna objetivo; en el README se explica en detalle el porqué

In [407]:
# Independent copy holding the two feature columns and the target column.
df_modelo = df[['genres', 'specs', 'price']].copy()
df_modelo

Unnamed: 0,genres,specs,price
0,"[Action, Casual, Indie, Simulation, Strategy]",[Single-player],4.99
1,"[Free to Play, Indie, RPG, Strategy]","[Single-player, Multi-player, Online Multi-Pla...",0.00
2,"[Casual, Free to Play, Indie, Simulation, Sports]","[Single-player, Multi-player, Online Multi-Pla...",0.00
3,"[Action, Adventure, Casual]",[Single-player],0.99
4,,"[Single-player, Full controller support, HTC V...",2.99
...,...,...,...
32130,"[Casual, Indie, Simulation, Strategy]","[Single-player, Steam Achievements]",1.99
32131,"[Casual, Indie, Strategy]","[Single-player, Steam Achievements, Steam Clou...",4.99
32132,"[Indie, Racing, Simulation]","[Single-player, Steam Achievements, Steam Trad...",1.99
32133,"[Casual, Indie]","[Single-player, Steam Achievements, Steam Cloud]",4.99


In [408]:
#df_modelo.dropna(subset=["price"],inplace=True)

In [409]:
#df_modelo["price"] = df_modelo["price"].apply(lambda x: 0 if isinstance(x, str) else x)

In [410]:
# Store the float cast back on the frame. The bare expression only displayed a
# converted copy and left df_modelo["price"] untouched.
df_modelo["price"] = df_modelo["price"].astype("float")
# Display the converted column, as the cell did before.
df_modelo["price"]

0        4.99
1        0.00
2        0.00
3        0.99
4        2.99
         ... 
32130    1.99
32131    4.99
32132    1.99
32133    4.99
32134    4.99
Name: price, Length: 32135, dtype: float64

Se eliminan las filas donde 'price', 'genres' o 'specs' tienen valores nulos, ya que esas filas no se consideran importantes para el modelo

In [411]:
# Drop rows whose 'price' is missing.
# NOTE(review): NaN prices were already replaced by 0 when df["price"] was
# coerced to numeric, so this likely removes nothing — confirm.
df_modelo = df_modelo.dropna(subset=['price'])

In [412]:
# Drop rows that have no 'genres' value.
df_modelo = df_modelo.dropna(subset=['genres'])

In [413]:
# Drop rows that have no 'specs' value.
df_modelo = df_modelo.dropna(subset=['specs'])

In [414]:
# Display the frame after the rows with missing values were removed.
df_modelo

Unnamed: 0,genres,specs,price
0,"[Action, Casual, Indie, Simulation, Strategy]",[Single-player],4.99
1,"[Free to Play, Indie, RPG, Strategy]","[Single-player, Multi-player, Online Multi-Pla...",0.00
2,"[Casual, Free to Play, Indie, Simulation, Sports]","[Single-player, Multi-player, Online Multi-Pla...",0.00
3,"[Action, Adventure, Casual]",[Single-player],0.99
5,"[Action, Adventure, Simulation]","[Single-player, Steam Achievements]",3.99
...,...,...,...
32129,"[Action, Adventure, Casual, Indie]","[Single-player, Steam Achievements, Steam Cloud]",1.99
32130,"[Casual, Indie, Simulation, Strategy]","[Single-player, Steam Achievements]",1.99
32131,"[Casual, Indie, Strategy]","[Single-player, Steam Achievements, Steam Clou...",4.99
32132,"[Indie, Racing, Simulation]","[Single-player, Steam Achievements, Steam Trad...",1.99


In [415]:
# Check the remaining row count, non-null counts and dtypes.
df_modelo.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28566 entries, 0 to 32133
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   genres  28566 non-null  object 
 1   specs   28566 non-null  object 
 2   price   28566 non-null  float64
dtypes: float64(1), object(2)
memory usage: 892.7+ KB


# Se crea el modelo

Se separan los valores de las listas de cada fila, creando una columna indicadora (0/1) por cada valor distinto

In [416]:
# Re-check the frame's structure before the list columns are encoded
# (repeats the df_modelo.info() call of the previous cell).
df_modelo.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28566 entries, 0 to 32133
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   genres  28566 non-null  object 
 1   specs   28566 non-null  object 
 2   price   28566 non-null  float64
dtypes: float64(1), object(2)
memory usage: 892.7+ KB


In [417]:
# Turn each row's list of genres into 0/1 indicator columns: the list is joined
# into one comma-separated string, which get_dummies then splits again.
generos_concatenados = (
    df_modelo["genres"]
    .str.join(",")
    .str.get_dummies(sep=",")
)
# Append the indicator columns and discard the original list column.
df_modelo = pd.concat([df_modelo, generos_concatenados], axis=1).drop(columns=["genres"])

In [418]:
# List the genre indicator columns that were generated.
# NOTE(review): some labels carry the HTML entity '&amp;' from the raw data — confirm whether they should be unescaped.
generos_concatenados.columns

Index(['Action', 'Adventure', 'Animation &amp; Modeling', 'Audio Production',
       'Casual', 'Design &amp; Illustration', 'Early Access', 'Education',
       'Free to Play', 'Indie', 'Massively Multiplayer', 'Photo Editing',
       'RPG', 'Racing', 'Simulation', 'Software Training', 'Sports',
       'Strategy', 'Utilities', 'Video Production', 'Web Publishing'],
      dtype='object')

In [419]:
#etiquetas_concatenadas = df_modelo["tags"].str.join(",").str.get_dummies(sep=",")
#df_modelo= pd.concat([df_modelo, etiquetas_concatenadas], axis=1)
#df_modelo.drop(columns=["tags"],inplace=True)

In [420]:
#etiquetas_concatenadas.columns

In [421]:
# Turn each row's list of specs into 0/1 indicator columns: the list is joined
# into one comma-separated string, which get_dummies then splits again.
especificaciones_concatenadas = (
    df_modelo["specs"]
    .str.join(",")
    .str.get_dummies(sep=",")
)
# Append the indicator columns and discard the original list column.
df_modelo = pd.concat([df_modelo, especificaciones_concatenadas], axis=1).drop(columns=["specs"])

In [422]:
# List the specs indicator columns that were generated.
especificaciones_concatenadas.columns

Index(['Captions available', 'Co-op', 'Commentary available',
       'Cross-Platform Multiplayer', 'Downloadable Content',
       'Full controller support', 'Game demo', 'In-App Purchases',
       'Includes Source SDK', 'Includes level editor', 'Local Co-op',
       'Local Multi-Player', 'MMO', 'Mods', 'Mods (require HL1)',
       'Mods (require HL2)', 'Multi-player', 'Online Co-op',
       'Online Multi-Player', 'Partial Controller Support',
       'Shared/Split Screen', 'Single-player', 'Stats', 'Steam Achievements',
       'Steam Cloud', 'Steam Leaderboards', 'Steam Trading Cards',
       'Steam Turn Notifications', 'Steam Workshop', 'SteamVR Collectibles',
       'Valve Anti-Cheat enabled'],
      dtype='object')

In [423]:
# Final column layout: the target 'price' followed by the genre and specs indicators.
df_modelo.columns

Index(['price', 'Action', 'Adventure', 'Animation &amp; Modeling',
       'Audio Production', 'Casual', 'Design &amp; Illustration',
       'Early Access', 'Education', 'Free to Play', 'Indie',
       'Massively Multiplayer', 'Photo Editing', 'RPG', 'Racing', 'Simulation',
       'Software Training', 'Sports', 'Strategy', 'Utilities',
       'Video Production', 'Web Publishing', 'Captions available', 'Co-op',
       'Commentary available', 'Cross-Platform Multiplayer',
       'Downloadable Content', 'Full controller support', 'Game demo',
       'In-App Purchases', 'Includes Source SDK', 'Includes level editor',
       'Local Co-op', 'Local Multi-Player', 'MMO', 'Mods',
       'Mods (require HL1)', 'Mods (require HL2)', 'Multi-player',
       'Online Co-op', 'Online Multi-Player', 'Partial Controller Support',
       'Shared/Split Screen', 'Single-player', 'Stats', 'Steam Achievements',
       'Steam Cloud', 'Steam Leaderboards', 'Steam Trading Cards',
       'Steam Turn Notifications'

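Se crean las variables predictoras (X) y la variable objetivo (y), y se dividen en conjuntos de entrenamiento y de prueba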

In [424]:
# Target vector: the price column.
y = df_modelo["price"]
# Feature matrix: every remaining column.
X = df_modelo.drop(columns="price")

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, test_size=0.2, random_state=42)

Se instancia el modelo

In [425]:
# Linear regression estimator created with scikit-learn's default settings.
modelo = LinearRegression()

Se entrena

In [426]:
# Fit the model on the training split.
modelo.fit(X_train,y_train)

In [427]:
# Predict prices for the held-out test split and display them.
y_pred = modelo.predict(X_test)
y_pred

array([ 4.99534949, 26.70333441,  8.84761314, ...,  5.42712871,
        5.28001304,  5.99854275])

Se calcula el R.M.S.E. (Raíz del Error Cuadrático Medio) del modelo

In [429]:
# RMSE: square root of the mean squared error between the true and the
# predicted test prices.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rmse

10.773332764246002

Se crea un archivo .pkl que contenga el modelo

In [428]:
# Persist the fitted model to disk with joblib.
# NOTE(review): only the estimator is saved; whoever loads it presumably has to
# supply the same feature columns in the same order — confirm against the consumer.
joblib.dump(modelo,"modelo_regresion.pkl")

['modelo_regresion.pkl']