In [9]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
import json,re
from sklearn.model_selection import train_test_split
from concurrent.futures import ThreadPoolExecutor
from preparando_datos import sum_into_column,split_and_sum,get_min_max,compute_average
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline

from sklearn.base import BaseEstimator, TransformerMixin
from joblib import Parallel, delayed




# Load the full dataset and carve out a reproducible 10% hold-out sample.
df_train = pd.read_csv('dataset/origen.csv')
df_test = df_train.sample(frac=0.1, random_state=42)
# Keep only the rows that were not sampled into the hold-out set.
df_train = df_train.drop(index=df_test.index)

# Separate the target from the features; 'Unnamed: 0' is dropped with it.
X, y = (
    df_train.drop(columns=["averageRating", 'Unnamed: 0']),
    df_train["averageRating"],
)
X_test, y_test = (
    df_test.drop(columns=["averageRating", 'Unnamed: 0']),
    df_test['averageRating'],
)

# The intermediate frames are not needed once the four splits exist.
del df_test, df_train



In [10]:
# Zeros in these columns are treated as missing values.
# The result is assigned back instead of calling
# `X[col].replace(..., inplace=True)`: that form modifies a temporary Series
# under pandas Copy-on-Write and leaves X unchanged.
zero_as_missing_columns = ['runtimeMinutes', 'budget', 'revenue']
X[zero_as_missing_columns] = X[zero_as_missing_columns].replace(0, np.nan)
# Clean the hold-out features the same way, so that steps which look at NaN
# (e.g. missing-value indicators) see the same encoding as in training.
X_test[zero_as_missing_columns] = X_test[zero_as_missing_columns].replace(0, np.nan)

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would only add a stray "None" line.
X.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 879787 entries, 0 to 977540
Data columns (total 27 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   numVotes              879787 non-null  int64  
 1   titleType             879787 non-null  object 
 2   isAdult               879787 non-null  float64
 3   startYear             879787 non-null  int64  
 4   endYear               879787 non-null  int64  
 5   runtimeMinutes        647118 non-null  float64
 6   genres_x              879785 non-null  object 
 7   directors             879787 non-null  object 
 8   writers               879787 non-null  object 
 9   seasonNumber          394234 non-null  float64
 10  episodeNumber         394234 non-null  float64
 11  ordering              333665 non-null  float64
 12  language              333665 non-null  object 
 13  attributes            333665 non-null  object 
 14  isOriginalTitle       333665 non-null  float64
 15  

In [11]:

def _split_by_missing(columns):
    """Return ``(with_nan, without_nan)`` for ``columns`` of ``X``, keeping their order."""
    with_nan, without_nan = [], []
    for column in columns:
        bucket = with_nan if X[column].isnull().any() else without_nan
        bucket.append(column)
    return with_nan, without_nan


# Object-dtype columns are treated as categorical, every other dtype as numeric.
_object_columns = [c for c in X.columns if X[c].dtype == 'O']
_other_columns = [c for c in X.columns if X[c].dtype != 'O']

categorical_columns_with_nan, categorical_columns_without_nan = _split_by_missing(_object_columns)

# The "discrete" and "numerical" lists are built with exactly the same rule
# (non-object dtype); they are kept as separate list objects.
discrete_columns_with_nan, discrete_columns_without_nan = _split_by_missing(_other_columns)
numerical_columns_with_nan, numerical_columns_without_nan = _split_by_missing(_other_columns)

In [12]:
from feature_engine.imputation import AddMissingIndicator, CategoricalImputer, MeanMedianImputer
from sklearn.preprocessing import FunctionTransformer
def interpolate_numericals(df, group, category, method='linear'):
    """Treat zeros in ``df[category]`` as missing and fill gaps by interpolation.

    The column is overwritten on ``df`` itself (the caller's frame is modified),
    and a copy of the filled column is stored in
    ``interpolate_numericals.fitted_values[group][category]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column ``category``.
    group : hashable
        Used only as the first-level key of ``fitted_values``. The interpolation
        runs over the whole column in row order; it is NOT done per group.
    category : str
        Name of the column to fill.
    method : str, default 'linear'
        Forwarded to ``Series.interpolate``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Assign the result back instead of chaining `inplace=True` on
    # `df[category]`: under pandas Copy-on-Write the chained form operates on a
    # temporary Series and never reaches `df`.
    df[category] = df[category].replace(0, np.nan).interpolate(method=method)

    # Record the filled column under fitted_values[group][category].
    interpolate_numericals.fitted_values.setdefault(group, {})[category] = df[category].copy()

    return df

# Initialize the fitted values dictionary
interpolate_numericals.fitted_values = {}

# Preprocessing: first flag which numeric values are missing, then fill the
# runtime / budget / revenue columns by interpolation.
# Each FunctionTransformer calls
# `interpolate_numericals(X, group='titleType', category=<column>)`.
# Passing the function plus `kw_args` (rather than wrapping the call in a
# lambda) keeps the transformers free of anonymous functions, which cannot be
# pickled when the pipeline is saved or sent to worker processes.
pipeline_preprocessing = Pipeline([
    ('numericalMissingIndicator', AddMissingIndicator(variables=numerical_columns_with_nan)),
    ('runtimeTransform', FunctionTransformer(
        interpolate_numericals,
        kw_args={'group': 'titleType', 'category': 'runtimeMinutes'})),
    ('budgetTransform', FunctionTransformer(
        interpolate_numericals,
        kw_args={'group': 'titleType', 'category': 'budget'})),
    ('revenueTransform', FunctionTransformer(
        interpolate_numericals,
        kw_args={'group': 'titleType', 'category': 'revenue'})),
])


In [13]:
class GetExpDict(BaseEstimator, TransformerMixin):
    """Learn per-person, per-category aggregates and append them as columns.

    For every ``(group, category, target)`` combination, ``fit`` builds a nested
    mapping ``{person: {category_value: compute_average([total, count])}}``,
    where *person* is one comma-separated item of the ``group`` column and
    *category_value* is one comma-separated item of the ``category`` column.
    ``transform`` then adds one column per category value, named
    ``f'{group}_{target}_{category_value}'``.

    Parameters
    ----------
    group : list of str or str
        Columns holding comma-separated identifiers (e.g. ``'directors'``).
    categories : list of str or str
        Columns holding comma-separated category labels (e.g. ``'genres_x'``).
    targets : list of str or str
        Columns whose values are accumulated. The special name ``'exp'`` counts
        every row as ``1`` instead of reading a column. The default ``['']``
        makes ``fit`` look up ``X['']``, so pass real targets explicitly.

    Attributes
    ----------
    diccionarios : list of tuple
        Set by ``fit``: one ``(group, mapping, category_values, target)`` tuple
        per combination, in group -> category -> target order.
    """

    def __init__(self, group:list, categories:list,targets:list=['']):
        # A bare string is accepted as shorthand for a one-element list.
        if isinstance(group, str):
            group = [group]
        if isinstance(categories, str):
            categories = [categories]
        if isinstance(targets, str):
            targets = [targets]

        self.group = group
        self.categories = categories
        self.targets = targets

    @staticmethod
    def _category_values(X, category):
        """Return the sorted distinct comma-separated items found in ``X[category]``."""
        # Non-string entries (e.g. NaN) contribute no items.
        # Sorting makes the order of the generated columns reproducible: plain
        # set iteration order for strings changes between interpreter runs.
        return sorted({
            item
            for raw in X[category].unique().tolist()
            if isinstance(raw, str)
            for item in raw.split(',')
        })

    def fit(self, X, y=None):
        """Build the lookup tables from ``X``; ``y`` is ignored. Returns ``self``."""
        # The category vocabulary depends only on the category column, so it is
        # computed once per category rather than once per (group, target) pair.
        values_by_category = {
            category: self._category_values(X, category)
            for category in self.categories
        }

        jobs = []
        for group in self.group:
            for category in self.categories:
                categorias = values_by_category[category]
                for target in self.targets:
                    # For each raw group key, collect (category, value) pairs of
                    # its rows; with target 'exp' every row simply counts as 1.
                    grupo = {}
                    for key, rows in X.groupby(group):
                        weights = [1] * len(rows) if target == 'exp' else rows[target]
                        grupo[key] = list(zip(rows[category], weights))
                    # Rows whose group key is the string '0' are discarded.
                    grupo.pop('0', None)
                    jobs.append((group, grupo, categorias, target))

        self.diccionarios = Parallel(n_jobs=-1)(
            delayed(self._fit_group)(group, grupo, categorias, target)
            for group, grupo, categorias, target in jobs
        )
        return self

    def _fit_group(self, group, grupo, categorias,target):
        """Aggregate one (group, category, target) combination.

        ``grupo`` maps a raw, possibly comma-separated group key to a list of
        ``(category, value)`` pairs. Returns
        ``(group, {person: {category_value: average}}, categorias, target)``.
        """
        # Spread each row list over every individual person named in the key.
        rows_by_person = {}
        for raw_key, shows in grupo.items():
            for person in raw_key.split(','):
                rows_by_person.setdefault(person, []).extend(shows)

        authors_xp = {}
        for person, values in rows_by_person.items():
            # Per category value: [running total, number of rows].
            cat_counts = {cat: [0, 0] for cat in categorias}
            for cat, count in values:
                # NOTE(review): `cat` is the raw column value, while the keys of
                # `cat_counts` are single comma-split items. A row labelled with
                # several categories (e.g. "A,B") therefore matches nothing and
                # is skipped. Confirm whether such rows should be split too.
                if cat in cat_counts:
                    cat_counts[cat][0] += count
                    cat_counts[cat][1] += 1

            authors_xp[person] = {cat: compute_average(cat_counts[cat]) for cat in categorias}

        return (group, authors_xp, categorias,target)

    def transform(self, X):
        """Return ``X`` with one extra column per learned category value.

        Each new column is produced by applying ``sum_into_column(value,
        mapping, category_value)`` to the ``group`` column. Presumably that
        helper sums the learned values over the people listed in ``value``;
        confirm against ``preparando_datos``.
        """
        new_columns = {}
        for group, diccionario, categorias, target in self.diccionarios:
            for category in categorias:
                new_column = f'{group}_{target}_{category}'
                new_columns[new_column] = X[group].apply(sum_into_column, args=(diccionario, category,))
        # Concatenate once at the end instead of inserting columns one by one.
        X = pd.concat([X, pd.DataFrame(new_columns)], axis=1)
        return X


# Experience features: for directors and writers, aggregate `numVotes` and the
# plain row count ('exp') per value of `genres_x`.
pipeline_exp = Pipeline(
    steps=[
        (
            'set_exp_dict_genres',
            GetExpDict(
                group=['directors', "writers"],
                categories=['genres_x'],
                targets=['numVotes', 'exp'],
            ),
        ),
        # add other steps here
    ]
)


In [14]:
# Full feature pipeline: preprocessing first, then the experience features.
pipeline = Pipeline(
    steps=[
        ('preprocessing', pipeline_preprocessing),
        ('exp', pipeline_exp),
    ]
)

In [15]:
# Fit every pipeline step on the training features. As the last expression of
# the cell, the value returned by fit() is also displayed.
pipeline.fit(X,y)

In [None]:
# Replace the training features with their transformed version.
# NOTE(review): rebinding X makes this cell non-idempotent; running it a second
# time would feed already-transformed data back through the pipeline.
X= pipeline.transform(X)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from concurrent.futures import ThreadPoolExecutor
import matplotlib.pyplot as plt
import numpy as np
from xgboost import XGBRegressor
import cupy as cp
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split



In [None]:
from sklearn.metrics import mean_squared_error

regr = XGBRegressor(
    n_estimators=1100,
    eta=0.025,
    max_depth=19,
    gamma=0.20,
    colsample_bytree=0.7,
    colsample_bylevel=0.7,
    colsample_bynode=0.8,
    tree_method='hist',
    max_cached_hist_node=262144,
)  # type: ignore
#regr = XGBRegressor(n_estimators=1200, eta=0.35, max_depth=7, multi_strategy="multi_output_tree", min_child_weight=1, subsample=1, colsample_bytree=1, gamma=0, alpha=0)
# NOTE(review): X still carries whatever object-dtype columns the pipeline left
# in place; confirm they are encoded or dropped before fitting.
print("Training a XGBRegressor")
regr.fit(X, y)
print("Finished training the XGBRegressor")

# The hold-out features must go through the same fitted pipeline as the
# training features BEFORE any evaluation; scoring the raw X_test would hand
# the model a frame without the engineered columns it was trained on.
# NOTE(review): rebinding X_test makes this cell non-idempotent on re-run.
X_test = pipeline.transform(X_test)

score = regr.score(X_test, y_test)
print(f"R^2 score on testing data: {score:.4f}")

# RMSE
y_pred = regr.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE: %f" % (rmse))