In [2]:
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('steam_games_limpio.csv', encoding='utf-8')
# Convertir la columna "release_date" al tipo datetime
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')
# Eliminamos una columna
df.drop('Unnamed: 0', axis=1,inplace=True)

In [4]:
# Paso 1: Reemplazar las palabras por 0 en la columna "price"
df['price'] = df['price'].apply(lambda x: '0' if isinstance(x, str) and not x.replace('.', '').isdigit() else x)

# Paso 2: Convertir la columna "price" al tipo decimal
df['price'] = df['price'].astype(float)

eliminar = ['app_name', 'title', 'url', 'release_date', 'tags', 'discount_price', 'reviews_url', 'specs', 'id', 'developer', 'sentiment', 'metascore', 'publisher','publisher']
df2 = df.drop(eliminar, axis=1)
df2.dropna(inplace=True)
df2

Unnamed: 0,genres,price,early_access
0,"['Action', 'Casual', 'Indie', 'Simulation', 'S...",4.99,False
1,"['Free to Play', 'Indie', 'RPG', 'Strategy']",0.00,False
2,"['Casual', 'Free to Play', 'Indie', 'Simulatio...",0.00,False
3,"['Action', 'Adventure', 'Casual']",0.99,False
5,"['Action', 'Adventure', 'Simulation']",3.99,False
...,...,...,...
32050,"['Action', 'Adventure', 'Casual', 'Indie']",1.99,False
32051,"['Casual', 'Indie', 'Simulation', 'Strategy']",1.99,False
32052,"['Casual', 'Indie', 'Strategy']",4.99,False
32053,"['Indie', 'Racing', 'Simulation']",1.99,False


In [5]:
import ast

# Convertir las listas en columnas de Python (evaluación de literales)
df2['genres'] = df2['genres'].apply(ast.literal_eval)

# Utilizar explode para deshacer las listas
df2_exploded = df2.explode('genres')

# Crear columnas dummy con get_dummies
df_dummies = pd.get_dummies(df2_exploded['genres'])

# Agregar la columna 'price' y 'ACCESO_ANTISIPADO' al DataFrame df_dummies
df_dummies['price'] = df2_exploded['price']
df_dummies['early_access'] = df2_exploded['early_access']

# Agrupar por el índice (género) y sumar las filas agrupadas
df_grouped = df_dummies.groupby(df_dummies.index).sum()

# Agregar las columnas 'price' y 'early_access' después del groupby
df_grouped['price'] = df2_exploded.groupby(df2_exploded.index).first()['price']
df_grouped['early_access'] = df2_exploded.groupby(df2_exploded.index).first()['early_access']

# Reemplazar df2 con el DataFrame final df_grouped
df2 = df_grouped

In [6]:
df2 = df2[df2['price'] != 0.00]
df2 = df2[df2['Free to Play'] != 1]
df2 = df2.drop('Free to Play', axis=1)
df2

Unnamed: 0,Accounting,Action,Adventure,Animation &amp; Modeling,Audio Production,Casual,Design &amp; Illustration,Early Access,Education,Indie,...,Racing,Simulation,Software Training,Sports,Strategy,Utilities,Video Production,Web Publishing,price,early_access
0,0,1,0,0,0,1,0,0,0,1,...,0,1,0,0,1,0,0,0,4.99,False
3,0,1,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0.99,False
5,0,1,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,3.99,False
12,0,1,1,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,10.99,False
13,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,3.99,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32050,0,1,1,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,1.99,False
32051,0,0,0,0,0,1,0,0,0,1,...,0,1,0,0,1,0,0,0,1.99,False
32052,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,1,0,0,0,4.99,False
32053,0,0,0,0,0,0,0,0,0,1,...,1,1,0,0,0,0,0,0,1.99,False


In [24]:
df2.columns

Index(['Accounting', 'Action', 'Adventure', 'Animation &amp; Modeling',
       'Audio Production', 'Casual', 'Design &amp; Illustration',
       'Early Access', 'Education', 'Indie', 'Massively Multiplayer',
       'Photo Editing', 'RPG', 'Racing', 'Simulation', 'Software Training',
       'Sports', 'Strategy', 'Utilities', 'Video Production', 'Web Publishing',
       'price', 'early_access'],
      dtype='object')

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

In [57]:
# Supongamos que tienes tus datos en un dataframe llamado 'df2'
# Convertir la columna 'early_access' a tipo entero
df2['early_access'] = df2['early_access'].astype(int)

# Definir las columnas que serán características en el modelo (asegurándote de que todas sean numéricas)
X = df2.drop(columns=['Accounting', 'Action', 'Adventure',
                      'Audio Production',  'Design &amp; Illustration',
                      'Early Access', 'Education', 'Massively Multiplayer',
                      'Photo Editing', 'RPG', 'Racing',  'Software Training',
                      'Sports', 'Strategy', 'Utilities', 'Video Production', 'Web Publishing', 'Indie',
                      'early_access','Animation &amp; Modeling','Casual'], axis=1)

y = df2['price']

# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Especificar el grado del polinomio deseado
grado_polinomio = 2

# Crea el transformador polinomial
poly = PolynomialFeatures(degree=grado_polinomio)

# Transforma las características de entrenamiento y prueba
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Crea el modelo de regresión lineal
modelo_regresion = LinearRegression()

# Entrena el modelo utilizando las características polinomiales
modelo_regresion.fit(X_train_poly, y_train)

# Realiza predicciones en el conjunto de prueba
y_pred = modelo_regresion.predict(X_test_poly)

# Calcular el Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Calcular el Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)

# Filtrar solo los registros donde el género sea igual a 1
genero_1_df = df2[df2['Simulation']== 1]

# Calcular el promedio del precio para esos registros
precio_promedio_genero_1 = genero_1_df['price'].mean()

# Mostrar el RMSE y el promedio del precio para el género 1
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Promedio del precio para el género 1: {round(precio_promedio_genero_1, 2)}")

Root Mean Squared Error (RMSE): 1.0888467154985035e-12
Promedio del precio para el género 1: 9.97


In [56]:
y_pred

array([ 7.99,  0.99,  9.99, ..., 19.99,  3.99,  2.99])