In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import VotingClassifier, VotingRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from xgboost.sklearn import XGBClassifier, XGBRegressor
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold, KFold

class Gentle:
    """Load a CSV dataset and fit a weighted voting ensemble of boosted models.

    Three gradient-boosting candidates (CatBoost, LightGBM, XGBoost) are
    cross-validated, the two best are kept, and they are combined into a
    voting ensemble weighted by their cross-validation performance.
    """

    def __init__(self):
        # Not read by any method of this class; kept so existing callers that
        # touch the attribute keep working.
        self._gentle = []

    def read_data(self, train_file):
        """
        Import a csv file.

        train_file: path (or buffer) of the csv dataset to be analyzed

        Returns the parsed pandas DataFrame.
        Raises Exception if the file cannot be read; the underlying error is
        attached as ``__cause__``.
        """
        try:
            return pd.read_csv(train_file)
        except Exception as err:
            # Chain the original error so the real reason (missing file,
            # parse error, ...) is not lost.
            raise Exception("Error occured when importing files") from err

    def model(self, problem_type, data, target):
        """
        Create an ensemble model for the dataset.

        problem_type: problem type - classification or regression.
            for classification type 'class'
            for regression, type 'reg'
        data: the imported dataset (pandas DataFrame)
        target: the name of the target column in the dataset

        Returns a tuple ``(alg, scores)``:
            alg: DataFrame with one row per candidate model; column 'models'
                holds the (name, estimator) pair and the metric column
                ('roc_auc' or 'RMSE') holds its mean cross-validation score.
            scores: string reporting the ensemble's train and test score.

        Raises ValueError for an unknown problem_type, and Exception (with
        the underlying error chained) if anything fails while modeling.
        """
        # Validate up front so a typo is reported clearly instead of being
        # masked by the generic modeling error below.
        if problem_type not in ('class', 'reg'):
            raise ValueError(
                f"Unknown problem_type {problem_type!r}; expected 'class' or 'reg'"
            )

        try:
            # Splitting Target and Dependent variables
            X = data.drop(target, axis=1)
            y = data[target]

            # Splitting the datasets
            trainX, testX, trainy, testy = train_test_split(
                X, y, test_size=0.3, random_state=101)

            if problem_type == 'class':
                # CLASSIFICATION
                models = [('cat', CatBoostClassifier(verbose=0)),
                          ('lgbm', LGBMClassifier()),
                          ('xgb', XGBClassifier())]
                metric = 'roc_auc'
                kfold = StratifiedKFold(n_splits=5, shuffle=True)
                scoring = ['roc_auc', 'balanced_accuracy']
                score_key = 'test_roc_auc'
                sign = 1.0
                higher_is_better = True
            else:
                # REGRESSION
                models = [('cat', CatBoostRegressor(verbose=0)),
                          ('lgbm', LGBMRegressor()),
                          ('xgb', XGBRegressor())]
                metric = 'RMSE'
                kfold = KFold(n_splits=5, shuffle=True)
                scoring = ['neg_root_mean_squared_error', 'r2']
                # The scorer reports negated RMSE; flip the sign to get a
                # positive RMSE for the result table.
                score_key = 'test_neg_root_mean_squared_error'
                sign = -1.0
                higher_is_better = False

            # Mean cross-validation score of every candidate, read from the
            # key that actually exists for this problem type.
            cv_scores = []
            for _, estimator in models:
                cv_results = cross_validate(
                    estimator, trainX, trainy, cv=kfold, scoring=scoring)
                cv_scores.append(sign * cv_results[score_key].mean())

            alg = pd.DataFrame({'models': models, metric: cv_scores})

            # Keep the two BEST models: highest ROC-AUC, or lowest RMSE.
            best = alg.sort_values(metric, ascending=not higher_is_better).head(2)
            top_models = best['models'].tolist()
            top_scores = best[metric].tolist()

            if higher_is_better:
                top_weights = top_scores
            else:
                # Lower RMSE must mean a larger weight, so use the inverse
                # (guarding against a perfect score of 0).
                tiny = np.finfo(float).eps
                top_weights = [1.0 / max(score, tiny) for score in top_scores]

            if problem_type == 'class':
                ensemble = VotingClassifier(
                    top_models, weights=top_weights, voting='soft')
            else:
                # VotingRegressor averages predictions; it has no `voting`
                # parameter.
                ensemble = VotingRegressor(top_models, weights=top_weights)

            ensemble.fit(trainX, trainy)
            train_score = ensemble.score(trainX, trainy)
            test_score = ensemble.score(testX, testy)

            scores = f'Ensemble: The Train score is {train_score}, The Test score is {test_score}'
            return alg, scores

        except Exception as err:
            raise Exception(f"Error occured when modeling for {problem_type}") from err


ModuleNotFoundError: No module named 'catboost'