In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDClassifier, SGDRegressor, Lasso, ElasticNet
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, VotingClassifier, VotingRegressor, ExtraTreesClassifier, ExtraTreesRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.svm import SVC, SVR
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from xgboost.sklearn import XGBClassifier, XGBRegressor
# from catboost import CatBoostClassifier, CatBoostRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.model_selection import train_test_split, cross_val_score, KFold, cross_validate, StratifiedKFold
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler, RobustScaler
import category_encoders as ce
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.decomposition import PCA
import warnings

# Install a global "ignore" filter so no warnings are shown for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation notices from the imported
# libraries are hidden as well; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

class FullCourse:
    """CSV-to-ensemble modelling pipeline for a single target column.

    The public ``build_classification_model`` / ``build_regression_model``
    entry points chain the individual steps defined on this class: load,
    clean, impute, expand date columns, drop high-cardinality categoricals,
    split, encode, scale, reduce, cross-validate candidate models and fit a
    two-model voting ensemble.

    Parameters
    ----------
    file : str or path-like
        CSV file used when a build method is called without ``file``.
    target : str
        Target column used when a build method is called without ``target``.
    """

    # Single seed shared by the split, SMOTE and the CV folds so repeated runs agree.
    RANDOM_STATE = 101

    def __init__(self, file, target):
        self.file = file
        self.target = target

    @staticmethod
    def _is_categorical(series):
        """Return True when ``series`` has an object, string or category dtype."""
        dtype = series.dtype
        return (
            isinstance(dtype, pd.CategoricalDtype)
            or pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
        )

    def load_dataframe(self, file=None, encoding='cp1252'):
        """Read a CSV into a DataFrame.

        ``file`` defaults to ``self.file`` so the method can be called with or
        without an explicit path.
        """
        path = self.file if file is None else file
        return pd.read_csv(path, encoding=encoding)

    def remove_columns_missing_values(self, TFrame, threshold=0.7):
        """Drop columns whose share of missing values is at least ``threshold``."""
        missing_counts = TFrame.isna().sum()
        crazy = missing_counts[missing_counts >= threshold * len(TFrame)].index
        return TFrame.drop(crazy, axis=1)

    def sort_datatypes(self, TFrame):
        """Partition column names into ``(date, cat, num)`` lists.

        Date columns are recognised by the substring ``'Date'`` in their name,
        categorical columns by dtype, and everything else is treated as
        numeric. The three lists are disjoint.
        """
        date = [d for d in TFrame.columns if 'Date' in str(d)]
        cat = [n for n in TFrame.columns
               if n not in date and self._is_categorical(TFrame[n])]
        # Exclude date columns too; otherwise they would be sent to the KNN imputer.
        num = [p for p in TFrame.columns if p not in cat and p not in date]
        return date, cat, num

    def fill_missing_values(self, TFrame, num_cols, cat_cols):
        """Impute missing values and return a new frame.

        Numeric columns with gaps are filled by ``KNNImputer`` (5 neighbours),
        categorical columns with gaps by their most frequent value. All
        ``cat_cols`` are returned with ``category`` dtype, whether or not any
        of them needed imputing.
        """
        TFrame = TFrame.copy()  # do not mutate the caller's frame
        cat_cols = list(cat_cols)

        missing_num = [c for c in num_cols if TFrame[c].isna().any()]
        if missing_num:
            imp = KNNImputer(n_neighbors=5)
            TFrame[missing_num] = imp.fit_transform(TFrame[missing_num])

        missing_cat = [c for c in cat_cols if TFrame[c].isna().any()]
        if missing_cat:
            obj = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
            TFrame[missing_cat] = obj.fit_transform(TFrame[missing_cat])

        if cat_cols:
            TFrame[cat_cols] = TFrame[cat_cols].astype('category')

        return TFrame

    def preprocess_date_features(self, TFrame, date_cols):
        """Replace each date column with categorical month / day / year columns.

        Missing dates are first filled with the most frequent value. The
        derived columns are named ``<col>_months``, ``<col>_days`` and
        ``<col>_years``; the original date columns are dropped.
        """
        date_cols = list(date_cols)
        if not date_cols:
            return TFrame

        TFrame = TFrame.copy()  # do not mutate the caller's frame
        obj = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
        TFrame[date_cols] = obj.fit_transform(TFrame[date_cols])

        for col in date_cols:
            parsed = pd.to_datetime(TFrame[col])
            TFrame[f'{col}_months'] = parsed.dt.month.astype('category')
            TFrame[f'{col}_days'] = parsed.dt.day.astype('category')
            TFrame[f'{col}_years'] = parsed.dt.year.astype('category')

        return TFrame.drop(date_cols, axis=1)

    def remove_columns_high_variance(self, TFrame, cat_cols, threshold=0.7):
        """Drop categorical columns that are nearly unique per row.

        A column is dropped when its number of distinct values is at least
        ``threshold`` times the row count. Names in ``cat_cols`` that are not
        present in ``TFrame`` are ignored.
        """
        limit = threshold * TFrame.shape[0]
        uniq = [c for c in cat_cols
                if c in TFrame.columns and TFrame[c].nunique() >= limit]
        return TFrame.drop(uniq, axis=1)

    def split_data(self, X, y, test_size=0.2, random_state=101):
        """Split into train and test parts; returns ``trainX, testX, trainy, testy``."""
        trainX, testX, trainy, testy = train_test_split(
            X, y, test_size=test_size, random_state=random_state)
        return trainX, testX, trainy, testy

    def encode_categorical_features(self, trainX, testX, t):
        """Binary-encode the columns listed in ``t``.

        The encoder is fitted on ``trainX`` only and then applied to
        ``testX``. An empty ``t`` returns both frames untouched; ``t=None``
        lets the encoder pick the columns itself.
        """
        cols = None if t is None else list(t)
        if cols is not None and not cols:
            return trainX, testX
        # The column list must be passed by keyword: the first positional
        # parameter of BinaryEncoder is ``verbose``.
        encode = ce.BinaryEncoder(cols=cols)
        trainX = encode.fit_transform(trainX)
        testX = encode.transform(testX)
        return trainX, testX

    def resample_data(self, trainX, trainy):
        """Oversample with SMOTE when the label distribution is imbalanced.

        Imbalance is judged on ``trainy``: resampling happens when the mean
        class count exceeds 1.5 times the smallest class count. The inputs are
        returned unchanged when the labels are balanced or the smallest class
        has fewer than two samples (SMOTE needs at least one neighbour).
        """
        counts = pd.Series(trainy).value_counts()
        if len(counts) > 1 and counts.mean() > 1.5 * counts.min():
            # SMOTE's default of 5 neighbours fails on very small classes.
            neighbours = min(5, int(counts.min()) - 1)
            if neighbours >= 1:
                oversample = SMOTE(k_neighbors=neighbours,
                                   random_state=self.RANDOM_STATE)
                trainX, trainy = oversample.fit_resample(trainX, trainy)
        return trainX, trainy

    def perform_feature_selection(self, trainX, testX):
        """Reduce with PCA (98% explained variance) when there are more than 20 features.

        Accepts DataFrames or 2-D arrays. PCA is fitted on ``trainX`` only.
        """
        if trainX.shape[1] > 20:
            dec = PCA(0.98)
            trainX = dec.fit_transform(trainX)
            testX = dec.transform(testX)
        return trainX, testX

    def standardize_data(self, trainX, testX):
        """Scale features with ``RobustScaler`` fitted on ``trainX`` only."""
        sc = RobustScaler()
        trainX = sc.fit_transform(trainX)
        testX = sc.transform(testX)
        return trainX, testX

    def get_classification_models(self):
        """Return the default candidate classifiers as ``(name, estimator)`` pairs.

        Every estimator exposes ``predict_proba``, which the soft-voting
        ensemble and the ``roc_auc`` scorer rely on.
        """
        seed = self.RANDOM_STATE
        return [
            ('logreg', LogisticRegression(max_iter=1000)),
            ('tree', DecisionTreeClassifier(random_state=seed)),
            ('forest', RandomForestClassifier(random_state=seed)),
            ('extra_trees', ExtraTreesClassifier(random_state=seed)),
            ('gboost', GradientBoostingClassifier(random_state=seed)),
            ('knn', KNeighborsClassifier()),
            ('xgb', XGBClassifier(random_state=seed)),
            ('lgbm', LGBMClassifier(random_state=seed)),
        ]

    def get_regression_models(self):
        """Return the default candidate regressors as ``(name, estimator)`` pairs."""
        seed = self.RANDOM_STATE
        return [
            ('linear', LinearRegression()),
            ('lasso', Lasso()),
            ('elastic_net', ElasticNet()),
            ('tree', DecisionTreeRegressor(random_state=seed)),
            ('forest', RandomForestRegressor(random_state=seed)),
            ('extra_trees', ExtraTreesRegressor(random_state=seed)),
            ('gboost', GradientBoostingRegressor(random_state=seed)),
            ('knn', KNeighborsRegressor()),
            ('xgb', XGBRegressor(random_state=seed)),
            ('lgbm', LGBMRegressor(random_state=seed)),
        ]

    def evaluate_classification_models(self, models, trainX, trainy):
        """Cross-validate each ``(name, model)`` pair with 7 stratified folds.

        Returns a DataFrame with columns ``models`` (the original pairs),
        ``roc_auc`` and ``f1`` holding the mean test-fold scores.
        """
        models = list(models)
        scoring = ['roc_auc', 'f1']
        results = []
        for _, model in models:
            kfold = StratifiedKFold(n_splits=7, shuffle=True,
                                    random_state=self.RANDOM_STATE)
            cv_results = cross_validate(model, trainX, trainy, cv=kfold, scoring=scoring)
            results.append([cv_results['test_roc_auc'].mean(),
                            cv_results['test_f1'].mean()])
        alg = pd.DataFrame({'models': models,
                            'roc_auc': [m[0] for m in results],
                            'f1': [m[1] for m in results]})
        return alg

    def perform_classification_ensemble(self, alg, trainX, trainy):
        """Fit a soft-voting ensemble of the two models with the highest ROC AUC.

        Each model is weighted by its own ROC AUC.
        """
        # Take estimators and weights from the same sorted slice so they stay aligned.
        top = alg.sort_values('roc_auc', ascending=False).head(2)
        vtc = VotingClassifier(list(top['models']),
                               weights=list(top['roc_auc']), voting='soft')
        vtc.fit(trainX, trainy)
        return vtc

    def evaluate_regression_models(self, models, trainX, trainy):
        """Cross-validate each ``(name, model)`` pair with 7 shuffled folds.

        Returns a DataFrame with columns ``models`` (the original pairs),
        ``RMSE`` (positive, lower is better) and ``R2``.
        """
        models = list(models)
        scoring = ['neg_root_mean_squared_error', 'r2']
        results = []
        for _, model in models:
            kfold = KFold(n_splits=7, shuffle=True, random_state=self.RANDOM_STATE)
            cv_results = cross_validate(model, trainX, trainy, cv=kfold, scoring=scoring)
            results.append([-cv_results['test_neg_root_mean_squared_error'].mean(),
                            cv_results['test_r2'].mean()])
        alg = pd.DataFrame({'models': models,
                            'RMSE': [m[0] for m in results],
                            'R2': [m[1] for m in results]})
        return alg

    def perform_regression_ensemble(self, alg, trainX, trainy):
        """Fit a voting ensemble of the two models with the lowest RMSE.

        Weights are the inverse of each model's RMSE so the more accurate
        model contributes more to the averaged prediction.
        """
        top = alg.sort_values('RMSE', ascending=True).head(2)
        tiny = np.finfo(float).eps  # avoids dividing by a zero RMSE
        weights = [1.0 / max(float(rmse), tiny) for rmse in top['RMSE']]
        # VotingRegressor averages predictions and takes no ``voting`` argument.
        vtr = VotingRegressor(list(top['models']), weights=weights)
        vtr.fit(trainX, trainy)
        return vtr

    def predict_classification(self, model, testX):
        """Return ``(predicted labels, predicted class probabilities)``."""
        ypred = model.predict(testX)
        ypredp = model.predict_proba(testX)
        return ypred, ypredp

    def predict_regression(self, model, testX):
        """Return the model's predictions for ``testX``."""
        ypred = model.predict(testX)
        return ypred

    def calculate_scores(self, model, trainX, trainy, testX, testy):
        """Return ``(train_score, test_score)`` from the model's ``score`` method."""
        train_score = model.score(trainX, trainy)
        test_score = model.score(testX, testy)
        return train_score, test_score

    def _prepare_data(self, file, target):
        """Load, clean, split and encode the data shared by both build methods.

        Returns ``trainX, testX, trainy, testy`` with categorical features
        already binary-encoded.

        Raises
        ------
        KeyError
            If ``target`` is not a column of the loaded file.
        """
        TFrame = self.load_dataframe(file)
        if target not in TFrame.columns:
            raise KeyError(f'Target column {target!r} not found in the loaded data')

        # Rows without a label cannot be used for fitting or scoring.
        TFrame = TFrame.dropna(subset=[target])
        y = TFrame[target]
        X = TFrame.drop(target, axis=1)

        X = self.remove_columns_missing_values(X)
        date, cat, num = self.sort_datatypes(X)
        # NOTE(review): imputation is fitted on all rows before the split.
        X = self.fill_missing_values(X, num, cat)
        X = self.preprocess_date_features(X, date)
        X = self.remove_columns_high_variance(X, cat)

        trainX, testX, trainy, testy = self.split_data(X, y)
        # Encode every remaining categorical column, including derived date parts.
        t = [c for c in trainX.columns if self._is_categorical(trainX[c])]
        trainX, testX = self.encode_categorical_features(trainX, testX, t)
        return trainX, testX, trainy, testy

    def _report(self, alg, model, target, trainX, trainy, testX, testy, ypred):
        """Assemble the ``(alg, prediction preview, score summary)`` result tuple."""
        train_score, test_score = self.calculate_scores(model, trainX, trainy, testX, testy)
        algp = pd.DataFrame({f'{target}': testy, 'Predicted': ypred})
        scores = f'Ensemble: The Train score is {train_score}, The Test score is {test_score}'
        return alg, algp.head(), scores

    def build_classification_model(self, file=None, target=None):
        """Run the full classification pipeline.

        ``file`` and ``target`` default to the values given to the
        constructor. Returns the per-model CV table, the first rows of an
        actual-vs-predicted frame for the test split, and a score summary
        string.
        """
        target = self.target if target is None else target
        trainX, testX, trainy, testy = self._prepare_data(file, target)
        trainX, trainy = self.resample_data(trainX, trainy)
        # Scale before PCA so components are not dominated by raw feature magnitudes.
        trainX, testX = self.standardize_data(trainX, testX)
        trainX, testX = self.perform_feature_selection(trainX, testX)
        models = self.get_classification_models()
        alg = self.evaluate_classification_models(models, trainX, trainy)
        vtc = self.perform_classification_ensemble(alg, trainX, trainy)
        ypred, _ = self.predict_classification(vtc, testX)
        return self._report(alg, vtc, target, trainX, trainy, testX, testy, ypred)

    def build_regression_model(self, file=None, target=None):
        """Run the full regression pipeline.

        ``file`` and ``target`` default to the values given to the
        constructor. No resampling is performed because SMOTE only applies to
        class labels. Returns the per-model CV table, the first rows of an
        actual-vs-predicted frame for the test split, and a score summary
        string.
        """
        target = self.target if target is None else target
        trainX, testX, trainy, testy = self._prepare_data(file, target)
        # Scale before PCA so components are not dominated by raw feature magnitudes.
        trainX, testX = self.standardize_data(trainX, testX)
        trainX, testX = self.perform_feature_selection(trainX, testX)
        models = self.get_regression_models()
        alg = self.evaluate_regression_models(models, trainX, trainy)
        vtr = self.perform_regression_ensemble(alg, trainX, trainy)
        ypred = self.predict_regression(vtr, testX)
        return self._report(alg, vtr, target, trainX, trainy, testX, testy, ypred)



In [3]:
# Instantiate the pipeline for the credit-card data set; the class is named FullCourse.
test = FullCourse('creditcard.csv', 'Class')

  from pandas import MultiIndex, Int64Index


(284807, 31)
