In [1]:
import pandas as pd


In [8]:
# Load the cleaned Steam games table, parse "release_date" as datetime
# (unparseable values become NaT), and discard the leftover "Unnamed: 0" column.
df = (
    pd.read_csv('steam_games_limpio.csv', encoding='utf-8')
    .assign(release_date=lambda frame: pd.to_datetime(frame['release_date'], errors='coerce'))
    .drop(columns='Unnamed: 0')
)

In [9]:
# Step 1: parse "price" as a number.
# pd.to_numeric replaces the previous hand-rolled `replace('.', '').isdigit()`
# test, which accepted strings such as '1.2.3' and then crashed in astype(float).
# Values that are present but not numeric (words) become 0; values that were
# already missing stay NaN so they can still be dropped later.
price_raw = df['price']
price_num = pd.to_numeric(price_raw, errors='coerce')
is_non_numeric_text = price_num.isna() & price_raw.notna()

# Step 2: store the column as float.
df['price'] = price_num.mask(is_non_numeric_text, 0.0).astype(float)

# Columns that are not used as model inputs.
eliminar = [
    'app_name', 'title', 'url', 'release_date', 'tags', 'discount_price',
    'reviews_url', 'specs', 'id', 'developer', 'sentiment', 'metascore',
    'publisher',
]
df2 = df.drop(columns=eliminar)
df2

Unnamed: 0,genres,price,early_access
0,"['Action', 'Casual', 'Indie', 'Simulation', 'S...",4.99,False
1,"['Free to Play', 'Indie', 'RPG', 'Strategy']",0.00,False
2,"['Casual', 'Free to Play', 'Indie', 'Simulatio...",0.00,False
3,"['Action', 'Adventure', 'Casual']",0.99,False
4,,2.99,False
...,...,...,...
32051,"['Casual', 'Indie', 'Simulation', 'Strategy']",1.99,False
32052,"['Casual', 'Indie', 'Strategy']",4.99,False
32053,"['Indie', 'Racing', 'Simulation']",1.99,False
32054,"['Casual', 'Indie']",4.99,False


In [10]:
# Remove, in place, every row that has a missing value in any column,
# then display the remaining rows.
df2.dropna(axis=0, how='any', inplace=True)
df2

Unnamed: 0,genres,price,early_access
0,"['Action', 'Casual', 'Indie', 'Simulation', 'S...",4.99,False
1,"['Free to Play', 'Indie', 'RPG', 'Strategy']",0.00,False
2,"['Casual', 'Free to Play', 'Indie', 'Simulatio...",0.00,False
3,"['Action', 'Adventure', 'Casual']",0.99,False
5,"['Action', 'Adventure', 'Simulation']",3.99,False
...,...,...,...
32050,"['Action', 'Adventure', 'Casual', 'Indie']",1.99,False
32051,"['Casual', 'Indie', 'Simulation', 'Strategy']",1.99,False
32052,"['Casual', 'Indie', 'Strategy']",4.99,False
32053,"['Indie', 'Racing', 'Simulation']",1.99,False


In [5]:
# Preview only: show the frame with 'early_access' relabelled, without renaming
# the column in df2 itself. Later cells still read the 'early_access' column,
# so an in-place rename here would make them fail with a KeyError when the
# notebook is run top to bottom.
df2.rename(columns={'early_access': 'ACCESO_ANTISIPADO'}).head()

In [11]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [12]:
import ast

# Turn the stringified genre lists into real Python lists (literal evaluation)
# and explode them so that each row holds a single genre. The original index
# label is repeated for every genre of the same row.
df2_exploded = (
    df2
    .assign(genres=df2['genres'].apply(ast.literal_eval))
    .explode('genres')
)

# One indicator column per genre, still one row per (original row, genre) pair.
df_dummies = pd.get_dummies(df2_exploded['genres'])

# 'price' and 'early_access' repeat on every exploded row, so take the first
# value per original index once, instead of summing them with the indicators
# and overwriting the sums afterwards.
first_values = df2_exploded.groupby(level=0)[['price', 'early_access']].first()

# Collapse back to one row per original index by summing the genre indicators,
# then append 'price' and 'early_access' as the last two columns.
df_grouped = df_dummies.groupby(level=0).sum().join(first_values)

# Replace df2 with the final frame.
# NOTE: after this line df2 no longer has a 'genres' column, so re-running this
# cell alone fails; re-run the cells that build df2 first.
df2 = df_grouped

df2

Unnamed: 0,Accounting,Action,Adventure,Animation &amp; Modeling,Audio Production,Casual,Design &amp; Illustration,Early Access,Education,Free to Play,Indie,Massively Multiplayer,Photo Editing,RPG,Racing,Simulation,Software Training,Sports,Strategy,Utilities,Video Production,Web Publishing,price,early_access
0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,4.99,False
1,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0.00,False
2,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,1,0,1,0,0,0,0,0.00,False
3,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.99,False
5,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,3.99,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32050,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1.99,False
32051,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1.99,False
32052,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,4.99,False
32053,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,1.99,False


In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.model_selection import train_test_split

In [None]:
# List the column labels of df2.
df2.columns

In [14]:
# Split the predictors (X: every column except 'price') from the target (y).
X = df2.drop(columns=['price'])
y = df2['price']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit an ordinary least-squares linear regression on the training rows.
model = LinearRegression()
model.fit(X_train, y_train)

# Predict prices for the held-out rows.
y_pred = model.predict(X_test)

# Root mean squared error (RMSE). Taking the square root of the MSE explicitly
# avoids the `squared=False` argument, which newer scikit-learn releases
# deprecate and remove.
rmse = sqrt(mean_squared_error(y_test, y_pred))

# Print the RMSE
print("RMSE:", rmse)

# Print the model coefficients (one per column of X).
# NOTE(review): the recorded output shows two coefficients of about +5.8e12 and
# -5.8e12 (8th and last positions). That pattern suggests two perfectly
# collinear predictors - presumably the 'Early Access' genre indicator and the
# 'early_access' flag - TODO confirm and consider dropping one of them.
print("Coeficientes:", model.coef_)

# Print the model intercept
print("Intercepto:", model.intercept_)

RMSE: 18.855330407895714
Coeficientes: [ 1.30092344e+01 -6.78142979e-01  1.73547035e-01  2.58453858e+01
  1.53812190e+01 -3.80214898e+00  7.38879840e+00  5.83470856e+12
  2.21421022e+01 -1.42278636e-01 -3.20047128e+00  7.87742221e+00
 -3.34474467e+00  7.91050956e-01 -1.85456578e+00  1.56828727e+00
 -1.86858057e+01  1.65710231e+00  4.32448230e-01  1.97609141e+00
  2.61546178e+01 -3.90388008e-01 -5.83470856e+12]
Intercepto: 10.578959736857769


In [15]:
import pickle

# Serialize the fitted model to 'modelo_entrenado.pkl' with pickle.
with open('modelo_entrenado.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)