In [1]:
# TODO: unfinished sketch of a class-based API; nothing below uses it.
# class Ademonla:
#     def __init__(self, scoring_function = 'f1', n_iter = 50):
#         self.scoring_function = scoring_function
#         self.n_iter = n_iter

#     def fit(self):

In [2]:
def Full_course(file, target):
    """Load a CSV, preprocess it, benchmark a set of models and fit a 2-model voting ensemble.

    Steps, in order: drop columns with >= 70% missing values, impute the
    remaining gaps (KNN for numeric columns, most-frequent for categorical and
    date columns), expand date columns into month/day/year parts, drop
    categorical columns whose cardinality is >= 70% of the row count, split
    80/20, encode categoricals, optionally oversample (classification only),
    optionally apply PCA (more than 20 features), scale with RobustScaler,
    cross-validate every candidate model and finally fit a weighted voting
    ensemble of the two best candidates.

    The task is treated as classification when the target has fewer than 10
    distinct values, otherwise as regression.

    Parameters
    ----------
    file : str or path-like
        CSV file to read (decoded as cp1252).
    target : str
        Name of the column to predict.

    Returns
    -------
    tuple
        ``(alg, preview)`` where ``alg`` is a DataFrame with one row per
        candidate model and its mean cross-validation scores
        (``roc_auc``/``f1`` for classification, ``RMSE``/``R2`` for
        regression), and ``preview`` is the first five rows of the test-set
        targets next to the ensemble predictions.

    Raises
    ------
    KeyError
        If ``target`` is not a column of the file, or was dropped for having
        too many missing values.

    Notes
    -----
    Prints the raw frame shape and the ensemble train/test scores, and
    silences all warnings for the running process.
    """
    # importing all required libraries
    import pandas as pd
    import numpy as np

    from sklearn.linear_model import LinearRegression, LogisticRegression, SGDClassifier, SGDRegressor, Lasso, ElasticNet
    from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
    from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor,\
         VotingClassifier, VotingRegressor, ExtraTreesClassifier, ExtraTreesRegressor, GradientBoostingClassifier, GradientBoostingRegressor
    from sklearn.svm import SVC, SVR
    from sklearn.neural_network import MLPClassifier, MLPRegressor
    from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
    from xgboost.sklearn import XGBClassifier, XGBRegressor
    from catboost import CatBoostClassifier, CatBoostRegressor
    from lightgbm import LGBMClassifier, LGBMRegressor

    from sklearn.model_selection import train_test_split, KFold, cross_validate, StratifiedKFold
    from imblearn.over_sampling import SMOTE
    from sklearn.preprocessing import RobustScaler
    import category_encoders as ce
    from sklearn.impute import KNNImputer, SimpleImputer
    from sklearn.decomposition import PCA

    import warnings
    # NOTE: this silences warnings globally, not just inside this function.
    warnings.filterwarnings('ignore')

    seed = 101  # shared seed for the split, the CV folds and SMOTE

    def _reduce_and_scale(train_part, test_part):
        """Apply PCA (only when there are more than 20 features) and then RobustScaler."""
        if train_part.shape[1] > 20:
            dec = PCA(0.98)  # keep enough components to explain 98% of the variance
            train_part = dec.fit_transform(train_part)
            test_part = dec.transform(test_part)
        sc = RobustScaler()
        return sc.fit_transform(train_part), sc.transform(test_part)

    # Loading the dataframe
    TFrame = pd.read_csv(file, encoding='cp1252')

    print(TFrame.shape)

    # Removes columns with more than 70% missing values
    crazy = [i for i in TFrame.columns if TFrame[i].isna().sum() >= 0.7 * len(TFrame[i])]
    TFrame = TFrame.drop(crazy, axis=1)

    # Fail early with a clear message instead of a bare KeyError further down.
    if target not in TFrame.columns:
        raise KeyError(
            f"target column {target!r} is not in the file, or was dropped for having >= 70% missing values"
        )

    # Sorting based on datatypes
    date = [d for d in TFrame.columns if 'Date' in d]
    cat = [n for n in TFrame.columns if TFrame[n].dtype == object and n not in date]
    # Date columns are handled separately below; they must not reach the numeric imputer.
    num = [p for p in TFrame.columns if p not in cat and p not in date]

    # filling up missing values
    imp = KNNImputer(n_neighbors=5)
    missing_num = [i for i in num if TFrame[i].isna().sum() != 0]
    if missing_num:
        TFrame[missing_num] = imp.fit_transform(TFrame[missing_num])

    obj = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    missing_cat = [i for i in cat if TFrame[i].isna().sum() != 0]
    if missing_cat:
        TFrame[missing_cat] = obj.fit_transform(TFrame[missing_cat])
        TFrame[cat] = TFrame[cat].astype('category')

    # Dealing with date features: impute, then replace each date by its month/day/year parts
    if date:
        TFrame[date] = obj.fit_transform(TFrame[date])

        for i in date:
            parsed = pd.to_datetime(TFrame[i])
            TFrame[f'{i}_months'] = parsed.dt.month.astype('category')
            TFrame[f'{i}_days'] = parsed.dt.day.astype('category')
            TFrame[f'{i}_years'] = parsed.dt.year.astype('category')

        TFrame = TFrame.drop(date, axis=1)

    # TODO: skewness handling (log-transform of heavily skewed columns) is not implemented yet.

    # Removing categorical columns with high cardinality (near-unique values such as ids).
    # The target is never a candidate for removal.
    uniq = [i for i in cat if i != target and TFrame[i].nunique() >= 0.7 * TFrame.shape[0]]
    TFrame = TFrame.drop(uniq, axis=1)

    X = TFrame.drop(target, axis=1)
    y = TFrame[target]

    # Train and test split - test size at 20%
    trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.2, random_state=seed)
    # Categorical feature columns to encode; the target is not part of X, so it is excluded.
    t = [i for i in cat if i not in uniq and i != target]

    # Selecting Classification or Regression --- Ensemble
    if y.nunique() < 10:
        # category encoding (cols must be passed by keyword: the first positional parameter is `verbose`)
        encode = ce.BinaryEncoder(cols=t)
        trainX = encode.fit_transform(trainX)
        testX = encode.transform(testX)

        # Resampling: oversample only when the label distribution is clearly imbalanced
        class_counts = trainy.value_counts()
        if class_counts.mean() > 1.5 * class_counts.min():
            oversample = SMOTE(random_state=seed)
            trainX, trainy = oversample.fit_resample(trainX, trainy)

        # Feature Selection + Standardization
        trainX, testX = _reduce_and_scale(trainX, testX)

        # Modeling
        # Finding the BaseLine performance of the various models
        models = [
            ('cat', CatBoostClassifier(verbose=0)),
            ('mlp', MLPClassifier(verbose=0)),
            ('lgr', LogisticRegression(verbose=0)),
            ('svc', SVC(probability=True)),
            ('knc', KNeighborsClassifier()),
            ('lgbm', LGBMClassifier()),
            ('dctc', DecisionTreeClassifier()),
            ('sgd', SGDClassifier()),
            ('ext', ExtraTreesClassifier()),
            ('rfc', RandomForestClassifier()),
            ('gbc', GradientBoostingClassifier()),
            ('xgb', XGBClassifier()),
        ]

        # 'roc_auc' and 'f1' are binary-only scorers; use multiclass variants when needed
        # while keeping the same result keys / column names.
        if y.nunique() > 2:
            scoring = {'roc_auc': 'roc_auc_ovr', 'f1': 'f1_macro'}
        else:
            scoring = ['roc_auc', 'f1']

        # evaluate -cross validation- each model in turn, on identical folds
        kfold = StratifiedKFold(n_splits=7, shuffle=True, random_state=seed)
        results = []
        for name, model in models:
            cv_results = cross_validate(model, trainX, trainy, cv=kfold, scoring=scoring)
            results.append([cv_results['test_roc_auc'].mean(), cv_results['test_f1'].mean()])
        alg = pd.DataFrame({'models': models, 'roc_auc': [m[0] for m in results], 'f1': [m[1] for m in results]})

        # Ensemble: soft voting needs predict_proba, so only such models are eligible
        # (e.g. SGDClassifier with its default loss is not). Models and weights come
        # from the same sorted rows so they cannot get out of step.
        has_proba = [hasattr(est, 'predict_proba') for _, est in alg['models']]
        best = alg[has_proba].dropna(subset=['roc_auc']).sort_values('roc_auc', ascending=False).head(2)
        vtc = VotingClassifier(list(best['models']), weights=list(best['roc_auc']), voting='soft')
        vtc.fit(trainX, trainy)  # fit the model in the Train set

        # Predict the Test file
        ypred = vtc.predict(testX)

        # Check the Scores
        sol = vtc.score(trainX, trainy)
        sol1 = vtc.score(testX, testy)

    else:
        # category encoding
        encode = ce.TargetEncoder(cols=t, smoothing=8, min_samples_leaf=5)
        trainX = encode.fit_transform(trainX, trainy)
        testX = encode.transform(testX)

        # Feature Selection + Standardization
        trainX, testX = _reduce_and_scale(trainX, testX)

        # Modeling
        # Finding the BaseLine performance of the various models
        models = [
            ('cat', CatBoostRegressor(verbose=0)),
            ('mlp', MLPRegressor(verbose=0)),
            ('lr', LinearRegression()),
            ('lss', Lasso()),
            ('eln', ElasticNet()),
            ('svc', SVR()),
            ('knc', KNeighborsRegressor()),
            ('dctr', DecisionTreeRegressor()),
            ('sgd', SGDRegressor()),
            ('lgbm', LGBMRegressor()),
            ('ext', ExtraTreesRegressor()),
            ('rfr', RandomForestRegressor()),
            ('gbr', GradientBoostingRegressor()),
            ('xgb', XGBRegressor()),
        ]

        # evaluate -cross validation- each model in turn, on identical folds
        scoring = ['neg_root_mean_squared_error', 'r2']
        kfold = KFold(n_splits=7, shuffle=True, random_state=seed)
        results = []
        for name, model in models:
            cv_results = cross_validate(model, trainX, trainy, cv=kfold, scoring=scoring)
            results.append([-cv_results['test_neg_root_mean_squared_error'].mean(), cv_results['test_r2'].mean()])
        alg = pd.DataFrame({'models': models, 'RMSE': [m[0] for m in results], 'R2': [m[1] for m in results]})

        # Ensemble: VotingRegressor has no `voting` parameter. Weights are the inverse
        # RMSE so the model with the lower error gets the larger say.
        best = alg.dropna(subset=['RMSE']).sort_values('RMSE', ascending=True).head(2)
        inverse_rmse = 1.0 / best['RMSE'].clip(lower=np.finfo(float).eps)  # guard against RMSE == 0
        vtr = VotingRegressor(list(best['models']), weights=list(inverse_rmse))
        vtr.fit(trainX, trainy)  # fit the model in the Train set

        # Predict the Test file
        ypred = vtr.predict(testX)

        # Check the Scores
        sol = vtr.score(trainX, trainy)
        sol1 = vtr.score(testX, testy)

    # Outcomes
    algp = pd.DataFrame({f'{target}': testy, 'Predicted': ypred})
    Scores = f'Ensemble: The Train score is {sol}, The Test score is {sol1}'
    print(Scores)

    return alg, algp.head()

In [3]:
# Run the pipeline on creditcard.csv with 'Class' as the target; the returned tuple is the cell output.
Full_course(file='creditcard.csv', target='Class')

  from pandas import MultiIndex, Int64Index


(284807, 31)
