In [None]:
#Libraries for data manipulation
import pandas as pd
import numpy as np

#Library for splitting data
from sklearn.model_selection import train_test_split 

#Library for hyperparameter tuning
from sklearn.model_selection import GridSearchCV

In [None]:
# Best hyperparameters recorded so far, keyed by model name.
# Entries still to be filled in (keys exactly as originally drafted):
#   'randomForest', 'xgboost', 'decissionTreClassifier', 'knn', 'svm', 'lgb'
hyperparameters = {
    'passiveAggressiveClassifier': {
        'max_iter': 150,
        'n_iter_no_change': 1,
        'validation_fraction': 0.1,
    },
}

In [None]:
# Load the table that the grid searches below are run against, then preview it.
trainCSV = pd.read_csv(filepath_or_buffer='./forHiper')
trainCSV.head(n=5)

In [None]:
# X must contain every feature column other than the target:
# positional column 4 is taken as the target, columns 5 onward as the features.
yTrain = trainCSV.iloc[:, 4]
xTrain = trainCSV.iloc[:, 5:]

In [None]:
def findHyperparameters(xTrain, yTrain, model, param_grid, cv, vb, nj):
    """Grid-search ``model`` over ``param_grid`` and return the best combination.

    Parameters
    ----------
    xTrain : features handed to the search's ``fit``.
    yTrain : target handed to the search's ``fit``.
    model : estimator to tune (forwarded as ``estimator``).
    param_grid : candidate values (forwarded as ``param_grid``).
    cv : cross-validation setting (forwarded as ``cv``).
    vb : verbosity level (forwarded as ``verbose``).
    nj : number of parallel jobs (forwarded as ``n_jobs``).

    Returns
    -------
    The ``best_params_`` attribute of the fitted search; candidates are
    scored with ``'roc_auc'``.
    """
    search_options = {
        'estimator': model,
        'param_grid': param_grid,
        'scoring': 'roc_auc',
        'cv': cv,
        'verbose': vb,
        'n_jobs': nj,
    }
    search = GridSearchCV(**search_options)
    search.fit(xTrain, yTrain)
    return search.best_params_

## Random forest

https://towardsdatascience.com/optimizing-hyperparameters-in-random-forest-classification-ec7741f9d3f6

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Candidate values explored for the random forest.
param_grid = {
    'n_estimators': [100, 300, 500, 800, 1200],
    'max_depth': [5, 8, 15, 25, 30],
    'min_samples_split': [2, 5, 10, 15, 100],
    'min_samples_leaf': [1, 2, 5, 10],
    'criterion': ['gini', 'entropy'],
}

RC = RandomForestClassifier()
# 3-fold search, verbosity 1, using every available core.
bestParametersRF = findHyperparameters(
    xTrain,
    yTrain,
    model=RC,
    param_grid=param_grid,
    cv=3,
    vb=1,
    nj=-1,
)
bestParametersRF

## XGBoost

https://www.kaggle.com/tilii7/hyperparameter-grid-search-with-xgboost

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold

folds = 3

# Candidate values explored for XGBoost.
param_grid = {
    'min_child_weight': [1, 5, 10],
    'gamma': [0.5, 1, 1.5, 2, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [3, 4, 5],
}

XGB = XGBClassifier()
# Shuffled stratified folds with a fixed seed so the splits are repeatable.
skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001)
# Pass the splitter itself instead of the one-shot generator returned by
# skf.split(...): the search then derives the same seeded folds from the data
# it is fitted on, and the cv argument stays reusable.
bestParametersXGB = findHyperparameters(xTrain, yTrain, XGB, param_grid, skf, 3, 4)
bestParametersXGB

## Decision Tree Classifier

https://www.kaggle.com/mayu0116/hyper-parameters-tuning-of-dtree-rf-svm-knn

In [None]:
from sklearn import tree

# Candidate values explored for the decision tree.
# 'auto' was dropped from max_features: for DecisionTreeClassifier it is an
# alias of 'sqrt' (so it only duplicated fits) and newer scikit-learn releases
# reject it outright.
param_grid = {
    'max_features': ['sqrt', 'log2'],
    'min_samples_split': list(range(2, 16)),
    'min_samples_leaf': list(range(1, 12)),
    # Single fixed seed so every candidate is fitted reproducibly.
    'random_state': [123],
}

# min_samples_split given here is only a starting value; the grid overrides it.
DTC = tree.DecisionTreeClassifier(min_samples_split=10)
bestParameteresDTC = findHyperparameters(xTrain, yTrain, DTC, param_grid, 3, 1, -1)
bestParameteresDTC

## KNN

https://www.kaggle.com/mayu0116/hyper-parameters-tuning-of-dtree-rf-svm-knn

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Candidate values explored for k-nearest neighbours.
param_grid = {
    'n_neighbors': list(range(5, 11)),
    'leaf_size': [1, 2, 3, 5],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    # Estimator-level parallelism is set through the grid; the search itself
    # is run with a single job below.
    'n_jobs': [-1],
}

KNN = KNeighborsClassifier()
bestParameteresKNN = findHyperparameters(
    xTrain,
    yTrain,
    model=KNN,
    param_grid=param_grid,
    cv=3,
    vb=1,
    nj=1,
)
bestParameteresKNN

## Support Vector Machine

https://www.kaggle.com/mayu0116/hyper-parameters-tuning-of-dtree-rf-svm-knn

In [None]:
from sklearn import svm

# Candidate values explored for the support vector classifier.
param_grid = {
    'C': list(range(6, 13)),
    'kernel': ['linear', 'rbf'],
}

SVC = svm.SVC()
bestParameteresSVC = findHyperparameters(
    xTrain,
    yTrain,
    model=SVC,
    param_grid=param_grid,
    cv=3,
    vb=1,
    nj=-1,
)
bestParameteresSVC

## Light GBM

https://mlfromscratch.com/gridsearch-keras-sklearn/#/

In [None]:
import lightgbm as lgb

# Candidate values explored for LightGBM.
param_grid = {
    'n_estimators': [400, 700, 1000],
    'colsample_bytree': [0.7, 0.8],
    'max_depth': [15, 20, 25],
    'num_leaves': [50, 100, 200],
    'reg_alpha': [1.1, 1.2, 1.3],
    'reg_lambda': [1.1, 1.2, 1.3],
    'min_split_gain': [0.3, 0.4],
    'subsample': [0.7, 0.8, 0.9],
    'subsample_freq': [20],
}

LGBM = lgb.LGBMClassifier()

# 5-fold search, verbosity 2, using every available core.
bestParameteresLGBM = findHyperparameters(
    xTrain,
    yTrain,
    model=LGBM,
    param_grid=param_grid,
    cv=5,
    vb=2,
    nj=-1,
)
bestParameteresLGBM

## MLPClassifier

https://medium.com/@panjeh/scikit-learn-hyperparameter-optimization-for-mlpclassifier-4d670413042b

In [None]:
from sklearn.neural_network import MLPClassifier

# Candidate values explored for the multi-layer perceptron.
param_grid = {
    'hidden_layer_sizes': [(10, 30, 10), (20,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant', 'adaptive'],
}

# Training is capped at 100 iterations for every candidate.
MLP = MLPClassifier(max_iter=100)

bestParameteresMLP = findHyperparameters(
    xTrain,
    yTrain,
    model=MLP,
    param_grid=param_grid,
    cv=5,
    vb=2,
    nj=-1,
)
bestParameteresMLP

## Passive Aggressive Classifier

In [None]:
import itertools
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier

# Candidate values explored for the passive-aggressive classifier.
# NOTE(review): scikit-learn documents validation_fraction as only used when
# early_stopping=True, which the estimator below does not enable — confirm
# whether tuning it here is intended.
param_grid = {
    'max_iter': [70, 90, 100, 150, 200],
    'validation_fraction': [0.1, 0.4, 0.6, 0.8],
    'n_iter_no_change': [1, 2, 3, 4, 5, 6, 7, 8],
}

# This model is tuned on raw text, so it loads its own CSV and makes its own split.
tp1CSV = pd.read_csv('./ToChangeKeywordsAndLocations/withoutEncoding.csv')
labels = tp1CSV.target
x_train, x_test, y_train, y_test = train_test_split(
    tp1CSV['text'], labels, test_size=0.2, random_state=7
)
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
# DataFlair - Fit and transform train set, transform test set.
# The training matrix gets its own name: binding it to `xTrain` would replace
# the tabular features that the cells after this one pair with `yTrain`.
xTrainTfidf = tfidf_vectorizer.fit_transform(x_train)
xTest = tfidf_vectorizer.transform(x_test)

PAC = PassiveAggressiveClassifier()
bestParameteresPAC = findHyperparameters(xTrainTfidf, y_train, PAC, param_grid, 5, 2, -1)
bestParameteresPAC

## Gradient Boosting Classifier

https://www.kaggle.com/hatone/gradientboostingclassifier-with-gridsearchcv

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Candidate values explored for gradient boosting.
# NOTE(review): the 'deviance' loss and the 'mae' criterion appear to have been
# renamed/removed in newer scikit-learn releases — confirm against the
# installed version before running.
param_grid = {
    "loss": ["deviance"],
    "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
    # Twelve evenly spaced fractions between 0.1 and 0.5.
    "min_samples_split": np.linspace(0.1, 0.5, 12),
    "min_samples_leaf": np.linspace(0.1, 0.5, 12),
    "max_depth": [3, 5, 8],
    "max_features": ["log2", "sqrt"],
    "criterion": ["friedman_mse", "mae"],
    "subsample": [0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
    "n_estimators": [10],
}

GBC = GradientBoostingClassifier()
# 10-fold search, verbosity 1, using every available core.
bestParameteresGBC = findHyperparameters(
    xTrain,
    yTrain,
    model=GBC,
    param_grid=param_grid,
    cv=10,
    vb=1,
    nj=-1,
)
bestParameteresGBC

## Extra Trees Classifier

https://www.kaggle.com/eikedehling/extra-trees-tuning

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Candidate values explored for the extra-trees model.
# NOTE(review): integer max_features values cannot exceed the number of
# feature columns — confirm xTrain is wide enough for the 50-400 range.
param_grid = {
    'n_estimators': range(50, 126, 25),
    'max_features': range(50, 401, 50),
    'min_samples_leaf': range(20, 50, 5),
    'min_samples_split': range(15, 36, 5),
}

# Use the classifier variant: this section tunes a classifier and the search
# scores candidates with 'roc_auc', so a regressor is the wrong estimator.
ETC = ExtraTreesClassifier()
bestParameteresETC = findHyperparameters(xTrain, yTrain, ETC, param_grid, 5, 2, -1)
bestParameteresETC

## AdaBoost Classifier

https://educationalresearchtechniques.com/2019/01/02/adaboost-classification-in-python/

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import KFold

# Candidate values explored for AdaBoost.
param_grid = {
    'n_estimators': [500, 1000, 2000],
    'learning_rate': [0.001, 0.01, 0.1],
}

# Ten shuffled folds with a fixed seed.
crossvalidation = KFold(n_splits=10, shuffle=True, random_state=1)
ADA = AdaBoostClassifier()
bestParameteresADA = findHyperparameters(
    xTrain,
    yTrain,
    model=ADA,
    param_grid=param_grid,
    cv=crossvalidation,
    vb=2,
    nj=-1,
)
bestParameteresADA