## Importación de librerías

In [3]:
import pandas as pd
import numpy as np
import xgboost as xgb
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score

ModuleNotFoundError: No module named 'pandas'

## Carga y exploración inicial del dataset

In [3]:
# Load the merged movie dataset from the notebook's working directory.
df = pd.read_csv("merged_dataset.csv")

# Initial exploration: first rows, per-column dtypes / non-null counts, and
# summary statistics of the numeric columns.
display(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that emits a stray "None" after the report).
df.info()
# Rich display instead of print() so the summary renders as a table, like head().
display(df.describe())

Unnamed: 0,name,year,movie_rated,run_length,genres,release_date,rating,num_raters,num_reviews
0,Inception,2010,PG-13,2h 28min,Action; Adventure; Sci-Fi;,16 July 2010 (USA),8.8,1981675,3820
1,The Lord of the Rings: The Fellowship of the Ring,2001,PG-13,2h 58min,Action; Adventure; Drama;,19 December 2001 (USA),8.8,1609165,5365
2,The Lord of the Rings: The Return of the King,2003,PG-13,3h 21min,Adventure; Drama; Fantasy;,17 December 2003 (USA),8.9,1593859,3681
3,The Dark Knight Rises,2012,PG-13,2h 44min,Action; Adventure;,20 July 2012 (USA),8.4,1470329,2979
4,The Lord of the Rings: The Two Towers,2002,PG-13,2h 59min,Adventure; Drama; Fantasy;,18 December 2002 (USA),8.7,1440188,2559


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   name          1500 non-null   object 
 1   year          1500 non-null   int64  
 2   movie_rated   1500 non-null   object 
 3   run_length    1500 non-null   object 
 4   genres        1500 non-null   object 
 5   release_date  1500 non-null   object 
 6   rating        1500 non-null   float64
 7   num_raters    1500 non-null   int64  
 8   num_reviews   1500 non-null   int64  
dtypes: float64(1), int64(3), object(5)
memory usage: 105.6+ KB
None
              year       rating    num_raters   num_reviews
count  1500.000000  1500.000000  1.500000e+03   1500.000000
mean   2002.632000     7.477600  4.116746e+05    996.054667
std      14.758516     0.725035  3.181686e+05   1009.968589
min    1915.000000     3.500000  1.929000e+04    102.000000
25%    1998.000000     7.000000  1.912445e+05    424

## Limpieza/modificación de datos

In [4]:
# Drop the columns that are not used further on.
# errors='ignore' keeps this cell re-runnable: `df` is reassigned here, so on a
# second execution the columns are already gone and a plain drop would raise
# KeyError. On the first execution the result is identical to a strict drop.
df = df.drop(
    columns=['run_length', 'release_date', 'num_raters', 'num_reviews'],
    errors='ignore',
)

# Rename the remaining columns to friendlier labels. rename() silently skips
# keys that are not present, so this step is also safe to re-run.
df = df.rename(columns={'name': 'Movie Title',
                        'year': 'Release Year',
                        'movie_rated': 'Audience',
                        'genres': 'Film Type',
                        'rating': 'IMDb Rating'})

# Remove duplicated rows, keeping the first occurrence of each title.
# NOTE(review): duplicates are detected by title alone, so distinct films that
# share a title (e.g. remakes from different years) collapse into one row —
# confirm this is intended.
df = df.drop_duplicates(subset=['Movie Title'])

df.head()

Unnamed: 0,Movie Title,Release Year,Audience,Film Type,IMDb Rating
0,Inception,2010,PG-13,Action; Adventure; Sci-Fi;,8.8
1,The Lord of the Rings: The Fellowship of the Ring,2001,PG-13,Action; Adventure; Drama;,8.8
2,The Lord of the Rings: The Return of the King,2003,PG-13,Adventure; Drama; Fantasy;,8.9
3,The Dark Knight Rises,2012,PG-13,Action; Adventure;,8.4
4,The Lord of the Rings: The Two Towers,2002,PG-13,Adventure; Drama; Fantasy;,8.7


In [5]:
# Predictor columns and regression target.
features = ['Release Year', 'Audience', 'Film Type']
X = df.loc[:, features]
y = df['IMDb Rating']

# Missing-value handling.
# Predictors: the first row of mode() holds one most-frequent value per column;
# each column's gaps are filled with its own value from that row.
most_frequent_values = X.mode().iloc[0]
X = X.fillna(most_frequent_values)
# Target: gaps are filled with the median rating.
# NOTE(review): this imputes the label itself and is computed before the
# train/test split — confirm that is acceptable for this dataset.
rating_median = y.median()
y = y.fillna(rating_median)


In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Categorical encoding: one-hot encode the string columns. Categories that are
# not seen during fit are ignored at transform time instead of raising.
categorical_features = ['Audience', 'Film Type']
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Columns not listed in a transformer (here 'Release Year') are passed through
# unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='passthrough',
)

In [7]:
# XGBoost regressor trained with squared-error loss; seeded for reproducibility.
gxboost = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Hyper-parameter candidates: 2 x 2 x 2 = 8 combinations.
param_grid = dict(
    n_estimators=[100, 200],
    learning_rate=[0.01, 0.1],
    max_depth=[3, 5],
)

# Exhaustive search with 3-fold cross-validation, scored by negated MSE
# (higher is better). verbose=2 logs one line per individual fit.
grid_search = GridSearchCV(
    estimator=gxboost,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=2,
)

In [8]:
# Full pipeline: preprocessing followed by the grid-searched model.
# Because the grid search is the final step, the preprocessor is fitted once on
# all of X_train and the cross-validation then runs on the encoded matrix.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', grid_search),
    ]
)

# Training (runs the whole grid search).
pipeline.fit(X_train, y_train)

# Predictions on the held-out set.
y_pred = pipeline.predict(X_test)

# Evaluation: root mean squared error and coefficient of determination.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print(f'RMSE: {rmse}')
print(f'R²: {r2}')

# Persist the fitted pipeline (preprocessor + grid search) to disk.
joblib.dump(pipeline, "imdb_xgboost_model.pkl")
print("Modelo guardado exitosamente.")

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.1s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.0s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.0s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=200; total time=   0.1s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=200; total time=   0.1s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=200; total time=   0.1s
[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=100; total time=   0.1s
[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=100; total time=   0.1s
[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=100; total time=   0.1s
[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=200; total time=   0.1s
[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=200; total time=   0.1s
[CV] END ..learning_rate=0.01, max_depth=5, n_est