In [None]:
#Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#Comment this lines if you have this stuff already installed
#!(yes |pip install geopandas)
#!(yes |pip install descartes)
#!(yes |conda install -c conda-forge geoplot)
#!(yes | pip install plotly)
import plotly.express as px

%config IPCompleter.greedy=True
%matplotlib inline

plt.style.use('default') # haciendo los graficos un poco mas bonitos en matplotlib
#plt.rcParams['figure.figsize'] = (20, 10)

sns.set(style="whitegrid") # seteando tipo de grid en seaborn

pd.options.display.float_format = '{:20,.2f}'.format # suprimimos la notacion cientifica en los outputs
import warnings
warnings.filterwarnings('ignore')
from numpy import linalg as LA
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy.integrate import odeint
from itertools import combinations 

#Gensim
#!pip install gensim
import gensim
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import KeyedVectors
from gensim import models
from gensim.models import Word2Vec
!pip install pyenchant
import enchant

#Spacy
#!pip install spacy
#!python -m spacy download en_core_web_sm
import spacy
import en_core_web_sm

#NLTK
import nltk
from nltk.corpus import stopwords
from nltk.tag import pos_tag
#nltk.download('wordnet')
#nltk.download('punkt')
#nltk.download('stopwords')

#ML imports
#!pip install catboost no se uso hasta ahora, borrarlo despues
#from catboost import CatBoostRegressor
#!pip install xgboost
#!pip install lightgbm
import lightgbm as lgbm
import xgboost as xgb
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import linear_model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import VotingClassifier
from sklearn import svm

import import_ipynb
import FeaturesCreator
import FeatureSelection

In [None]:
def _fitAndScore(model, X_train, y_train, X_test, y_test):
    '''Fit `model` on the training split and return [test accuracy, fitted model].'''
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    # .round() is kept from the original code; it leaves integer class labels unchanged.
    return [accuracy_score(y_test, preds.round()), model]


def predictionPercentage(X_train, y_train, X_test, y_test, graphic = True):
    '''Train a fixed set of classifiers and report the test accuracy of each one.

    Parameters:
        X_train, y_train: features and labels every model is fitted on.
        X_test, y_test: features and labels used to compute the accuracy.
        graphic: when True, shows a bar chart of the RandomForest feature
            importances; this reads X_train.columns, so X_train must expose
            a `columns` attribute in that case.

    Returns:
        A dictionary mapping the algorithm name to a two-element list
        [accuracy, fitted model]. The keys are 'RandomForest', 'NaiveBayes',
        'Tree', 'XGBoost', 'MLP', 'LGBM', 'KNN', 'GradientBoostingClassifier',
        'ExtraTreesClassifier', 'AdaBoost' and 'LogisticRegression'.
    '''
    predictions = {}

    #Random forest
    rf_model = RandomForestClassifier(criterion= "entropy",
        max_depth= 15,
        min_samples_leaf= 10,
        min_samples_split= 10,
        n_estimators= 1200)
    predictions['RandomForest'] = _fitAndScore(rf_model, X_train, y_train, X_test, y_test)

    if graphic:
        plt.bar(X_train.columns, rf_model.feature_importances_)
        plt.xlabel('Features')
        plt.ylabel('Importancia')
        plt.title('Importancia Features con RandomForest')
        plt.xticks(rotation = 90, horizontalalignment = 'right')
        plt.show()

    #Naive bayes
    predictions['NaiveBayes'] = _fitAndScore(GaussianNB(), X_train, y_train, X_test, y_test)

    #Tree
    # 'sqrt' is what max_features='auto' meant for classifiers; 'auto' itself
    # is rejected by recent scikit-learn releases.
    tree_clf = tree.DecisionTreeClassifier(max_features='sqrt',
                                           min_samples_leaf=11,
                                           min_samples_split = 2,
                                           random_state=123)
    predictions['Tree'] = _fitAndScore(tree_clf, X_train, y_train, X_test, y_test)

    #XGBoost
    xg_clf = xgb.XGBClassifier(colsample_bytree= 1.0,
        gamma= 0.5,
        max_depth = 5,
        min_child_weight = 1,
        subsample = 1.0)
    predictions['XGBoost'] = _fitAndScore(xg_clf, X_train, y_train, X_test, y_test)

    #MLP
    mlp_clf = MLPClassifier(activation='tanh',alpha=0.0001,
                            hidden_layer_sizes=(20, ),
                            learning_rate='adaptive',
                            solver='sgd')
    predictions['MLP'] = _fitAndScore(mlp_clf, X_train, y_train, X_test, y_test)

    #LGBM
    lgbm_clf = lgbm.LGBMClassifier(colsample_bytree = 0.7,max_depth=20, min_split_gain=0.3,
                                   n_estimators=400,num_leaves=50, reg_alpha=1.3,
                                   reg_lambda=1.1,subsample=0.7, subsample_freq=20)
    predictions['LGBM'] = _fitAndScore(lgbm_clf, X_train, y_train, X_test, y_test)

    #KNN
    neigh = KNeighborsClassifier(n_neighbors=10, algorithm='brute', leaf_size=1, n_jobs=-1,
                                 weights='distance')
    predictions['KNN'] = _fitAndScore(neigh, X_train, y_train, X_test, y_test)

    #GBC
    gbc = GradientBoostingClassifier(random_state=0)
    predictions['GradientBoostingClassifier'] = _fitAndScore(gbc, X_train, y_train, X_test, y_test)

    #ExtraTrees classifier
    extra_trees = ExtraTreesClassifier(n_estimators=100, random_state=0)
    predictions['ExtraTreesClassifier'] = _fitAndScore(extra_trees, X_train, y_train, X_test, y_test)

    #AdaBoost
    ada = AdaBoostClassifier(n_estimators=100, random_state=0)
    predictions['AdaBoost'] = _fitAndScore(ada, X_train, y_train, X_test, y_test)

    #Logistic Regression
    # The original fitted with the undefined name `Y_train`, which raised NameError.
    log_reg = linear_model.LogisticRegression()
    predictions['LogisticRegression'] = _fitAndScore(log_reg, X_train, y_train, X_test, y_test)

    # Alternatives that were tried and deliberately left out:
    #  - svm.SVC(C = 6, kernel = 'linear'): about 12 minutes of wall time.
    #  - BaggingClassifier over SVC() with n_estimators=100, random_state=1:
    #    very slow, about 12min 54s.
    #  - SGDClassifier(loss="log", penalty="l2", max_iter=5): works, but gives
    #    a different result on every run.
    return predictions

def printPredictions(dicc):
    '''Print every entry of `dicc` as "name: accuracy", highest accuracy first.

    `dicc` maps a name to a sequence whose first element is the accuracy.
    Each printed line is followed by a blank line.
    '''
    ranked = list(dicc.items())
    # Negating the accuracy sorts in descending order and keeps ties in insertion order.
    ranked.sort(key = lambda entry: -entry[1][0])
    for name, result in ranked:
        print('{}: {}\n'.format(name, result[0]))

In [None]:
#Very good, but very slow
#CPU times: user 55min 2s, sys: 1min 34s, total: 56min 37s
#Wall time: 33min 21s
#0.8083989501312336

def GaussianClassifier(X_train, y_train, X_test, y_test):
    '''Fit a Gaussian process classifier (kernel 1.0 * RBF(1.0)) and return its test accuracy.'''
    gpc = GaussianProcessClassifier(kernel=1.0 * RBF(1.0), random_state=0)
    gpc.fit(X_train, y_train)
    predicted = gpc.predict(X_test)
    return accuracy_score(y_test, predicted.round())

In [None]:
#Estimators example: [('RF', clf1), ('ETC', clf2)]
def votingClassifier(estimators, X_train, y_train):
    '''Build a hard-voting VotingClassifier from `estimators`, fit it on
    X_train / y_train and return the fitted ensemble.'''
    ensemble = VotingClassifier(estimators, voting = 'hard')
    ensemble.fit(X_train, y_train)
    return ensemble

In [None]:
def subsets(data, subsetSize):
    '''Return a list with every combination of `subsetSize` elements of `data`, as tuples.'''
    return [combo for combo in combinations(data, subsetSize)]

def bestCombination(estimatorsList, X_train, y_train, X_test, y_test):
    '''Search for the group of estimators whose hard-voting ensemble scores best.

    Every combination of 2 up to len(estimatorsList) estimators is fitted with
    votingClassifier on X_train / y_train and scored with accuracy_score on
    X_test / y_test.

    Parameters:
        estimatorsList: list of (name, estimator) tuples,
            e.g. [('RF', clf1), ('ETC', clf2)].

    Returns:
        (bestEstimators, bestAccuracy): the winning tuple of (name, estimator)
        pairs and its accuracy. When fewer than two estimators are given no
        ensemble is evaluated and ([], 0) is returned.

    NOTE: one ensemble is fitted per combination, so the cost grows
    exponentially with the number of estimators.
    '''
    bestAccuracy = 0
    bestEstimators = []
    # The upper bound is inclusive so the full ensemble is tried as well; the
    # previous range(2, len(estimatorsList)) skipped it and evaluated nothing
    # at all when exactly two estimators were given.
    for subsetSize in range(2, len(estimatorsList) + 1):
        for estimators in subsets(estimatorsList, subsetSize):
            eclf = votingClassifier(list(estimators), X_train, y_train)
            y_pred = eclf.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred.round())
            # Strict comparison: on ties the first (smallest) combination wins.
            if accuracy > bestAccuracy:
                bestAccuracy = accuracy
                bestEstimators = estimators

    return bestEstimators, bestAccuracy

In [None]:
def selectColumns(dataFrame, posTarget):
    '''Split `dataFrame` into features and target by column position.

    The target is the column at position `posTarget`; the features are all the
    columns located after it, so every feature column has to come after the
    target column.

    Returns the tuple (X, y).
    '''
    target = dataFrame.iloc[:, posTarget]
    features = dataFrame.iloc[:, posTarget + 1:]
    return features, target

In [None]:
def getTrainedModel(df):
    '''Build the features for `df`, split them 90/10 and train every model.

    Steps: FeaturesCreator.main(df) -> selectColumns(..., 4) -> keep only the
    columns listed in the module-level `importantFeatures` (it must be defined
    before this function is called) -> train_test_split with test_size = 0.1
    and random_state = 41.

    Returns the dictionary produced by predictionPercentage, called with
    graphic = False.
    '''
    featuredDf = FeaturesCreator.main(df)
    X, y = selectColumns(featuredDf, 4)
    X_train, X_test, y_train, y_test = train_test_split(
        X[importantFeatures], y, test_size = 0.1, random_state = 41
    )
    return predictionPercentage(X_train, y_train, X_test, y_test, False)

In [None]:
def fractionDf(df):
    '''Split `df` in two data frames according to its Natural_disaster flag.

    `df` is first passed through FeaturesCreator._getDisastersDf, whose result
    is read through its `Natural_disaster` column. Each part is then processed
    with FeaturesCreator.main and the `Natural_disaster` column is dropped.

    Returns:
        (naturalDisasterDf, disasterDf): the rows with Natural_disaster == 1
        and the rows with Natural_disaster == 0, in that order.
    '''
    df = FeaturesCreator._getDisastersDf(df)
    # Filter the frame received as argument; the original indexed the global
    # `train` here, silently ignoring `df`.
    naturalDisasterDf = FeaturesCreator.main(df[df.Natural_disaster == 1]).drop('Natural_disaster', axis = 1)
    disasterDf = FeaturesCreator.main(df[df.Natural_disaster == 0]).drop('Natural_disaster', axis = 1)
    return naturalDisasterDf, disasterDf

In [None]:
#Passive Aggressive Classifier

#Medium-low accuracy
#Handled separately because it only uses the text column
#Tried with TF-IDF
#0.7662508207485227  (presumably the accuracy printed by a previous run - TODO confirm)

# Load the CSV and keep its `target` column as the labels.
tp1CSV = pd.read_csv('./ToChangeKeywordsAndLocations/withoutEncoding.csv')
labels = tp1CSV.target
# 80/20 split over the raw `text` column only.
x_train,x_test,y_train,y_test = train_test_split(tp1CSV['text'], labels, test_size=0.2, random_state=7)
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
#DataFlair - Fit and transform train set, transform test set
tfidf_train = tfidf_vectorizer.fit_transform(x_train) 
tfidf_test = tfidf_vectorizer.transform(x_test)
#DataFlair - Initialize a PassiveAggressiveClassifier
pac = PassiveAggressiveClassifier(max_iter = 200, n_iter_no_change = 2, validation_fraction = 0.1)
pac.fit(tfidf_train,y_train)
#DataFlair - Predict on the test set and calculate accuracy
y_pred = pac.predict(tfidf_test)
score = accuracy_score(y_test,y_pred)
print(score)

Los siguientes algoritmos son un ejemplo de uso, pero se dejan como markdown para que no se ejecuten automáticamente, ya que son innecesarios para un agente externo.

### Features Seleccionados

importantFeatures = FeatureSelection.getImportantFeatures()

## Sin preprocesamiento

train = pd.read_csv('./train.csv')

predictionResult = getTrainedModel(train)
printPredictions(predictionResult)

## Con preprocesamiento

train = pd.read_csv('./ToChangeKeywordsAndLocations/withoutEncoding.csv')

predictionDicc = getTrainedModel(train)
printPredictions(predictionDicc)

#No se recomienda la corrida de esta celda debido a lo que llega a tardar, pero es un ejemplo de como se usa esta alternativa
X, y = selectColumns(train, 4)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 41)

rf = rf_model = RandomForestClassifier(criterion= "entropy",
                                      max_depth= 15,min_samples_leaf= 10,
                                      min_samples_split= 10,n_estimators= 1200
                                      )

xg_clf = xgb.XGBClassifier(colsample_bytree= 1.0,gamma= 0.5,max_depth = 5,min_child_weight = 1,subsample = 1.0)

lgbm_clf = lgbm.LGBMClassifier(colsample_bytree = 0.7,max_depth=20, min_split_gain=0.3,
                               n_estimators=400,num_leaves=50, reg_alpha=1.3, 
                               reg_lambda=1.1,subsample=0.7, subsample_freq=20
                              )

extraTree = ExtraTreesClassifier(n_estimators=100, random_state=0)

gbc = GradientBoostingClassifier(random_state=0)

estimators = [('RF', rf), ('LGBM', lgbm_clf), ('XGBoost', xg_clf), ('ExtraTree', extraTree), ('GBC', gbc)]

%time result = bestCombination(estimators, X_train, y_train, X_test, y_test)

CPU times: user 1h 5min 37s, sys: 7.3 s, total: 1h 5min 44s
Wall time: 55min 26s

Resultado obtenido:
((('LGBM',
   LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.7,
                  importance_type='split', learning_rate=0.1, max_depth=20,
                  min_child_samples=20, min_child_weight=0.001, min_split_gain=0.3,
                  n_estimators=400, n_jobs=-1, num_leaves=50, objective=None,
                  random_state=None, reg_alpha=1.3, reg_lambda=1.1, silent=True,
                  subsample=0.7, subsample_for_bin=200000, subsample_freq=20)),
  
  ('XGBoost',
   XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
                 colsample_bynode=None, colsample_bytree=1.0, gamma=0.5,
                 gpu_id=None, importance_type='gain', interaction_constraints=None,
                 learning_rate=None, max_delta_step=None, max_depth=5,
                 min_child_weight=1, missing=nan, monotone_constraints=None,
                 n_estimators=100, n_jobs=None, num_parallel_tree=None,
                 objective='binary:logistic', random_state=None, reg_alpha=None,
                 reg_lambda=None, scale_pos_weight=None, subsample=1.0,
                 tree_method=None, validate_parameters=None, verbosity=None))),
 
 0.8097112860892388)

## Fraccionando

## Con preprocesamiento

train = pd.read_csv('./ToChangeKeywordsAndLocations/withoutEncoding.csv')

naturalDisasterDf, disasterDf = fractionDf(train)

train = pd.read_csv('./ToChangeKeywordsAndLocations/withoutEncoding.csv')

naturalDisasterDf, disasterDf = fractionDf(train)

predDiccNatDisast = getTrainedModel(naturalDisasterDf)

printPredictions(predDiccNatDisast)

predDiccDisast = getTrainedModel(disasterDf)

printPredictions(predDiccDisast)

## Sin preprocesamiento

train = pd.read_csv('./train.csv')

naturalDisasterDf, disasterDf = fractionDf(train)

predDiccNatDisast = getTrainedModel(naturalDisasterDf)

printPredictions(predDiccNatDisast)

predDiccDisast = getTrainedModel(disasterDf)

printPredictions(predDiccDisast)

#A continuación están ciertos algoritmos que se usaron con el df fraccionado. NO se recomienda su uso, ya que tardan un tiempo considerable.

kernel = 1.0 * RBF(1.0)
gpc = GaussianProcessClassifier(kernel=kernel, random_state=0)
gaussClf = ('GaussianClassifier', gpc)

bestClf = {'XGBoost', 'LGBM', 'ExtraTreesClassifier', 'GradientBoostingClassifier', 'RandomForest'}
natClf = [(key, value[1]) for key, value in predDiccNatDisast.items() if key in bestClf]
natClf.append(gaussClf)
%time result = bestCombination(natClf, X_train, y_train, X_test, y_test)

CPU times: user 55min 51s, sys: 27.8 s, total: 56min 19s
Wall time: 41min 20s

Resultado obtenido:    
((('RandomForest',
   RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                          criterion='entropy', max_depth=15, max_features='auto',
                          max_leaf_nodes=None, max_samples=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=10, min_samples_split=10,
                          min_weight_fraction_leaf=0.0, n_estimators=1200,
                          n_jobs=None, oob_score=False, random_state=None,
                          verbose=0, warm_start=False)),
                          
  ('XGBoost',
   XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                 colsample_bynode=1, colsample_bytree=1.0, gamma=0.5, gpu_id=-1,
                 importance_type='gain', interaction_constraints='',
                 learning_rate=0.300000012, max_delta_step=0, max_depth=5,
                 min_child_weight=1, missing=nan, monotone_constraints='()',
                 n_estimators=100, n_jobs=0, num_parallel_tree=1,
                 objective='binary:logistic', random_state=0, reg_alpha=0,
                 reg_lambda=1, scale_pos_weight=1, subsample=1.0,
                 tree_method='exact', validate_parameters=1, verbosity=None)),
                 
  ('GaussianClassifier',
   GaussianProcessClassifier(copy_X_train=True, kernel=1**2 * RBF(length_scale=1),
                             max_iter_predict=100, multi_class='one_vs_rest',
                             n_jobs=None, n_restarts_optimizer=0,
                             optimizer='fmin_l_bfgs_b', random_state=0,
                             warm_start=False))),
                             
 0.856353591160221)
    
    

%time GaussianClassifier(X_train, y_train, X_test, y_test)
CPU times: user 22min 19s, sys: 42 s, total: 23min 1s
Wall time: 13min 40s  No usamos GaussianClassifier en este caso por el tiempo que demora


bestClf = {'XGBoost', 'LGBM', 'ExtraTreesClassifier', 'GradientBoostingClassifier', 'RandomForest'}
knn = KNeighborsClassifier(n_neighbors=10,
                           algorithm='brute',
                           leaf_size=1,
                           n_jobs=-1,
                           weights='distance'
                          )

tuplaAux = ('KNN', knn)
disastClf = [(key, value[1]) for key, value in predDiccDisast.items() if key in bestClf]
disastClf.append(tuplaAux)
%time result = bestCombination(disastClf, X_train, y_train, X_test, y_test)

CPU times: user 1h 40min 8s, sys: 9.42 s, total: 1h 40min 17s
Wall time: 1h 24min 31s

Resultado obtenido:

((('RandomForest',
   RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                          criterion='entropy', max_depth=15, max_features='auto',
                          max_leaf_nodes=None, max_samples=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=10, min_samples_split=10,
                          min_weight_fraction_leaf=0.0, n_estimators=1200,
                          n_jobs=None, oob_score=False, random_state=None,
                          verbose=0, warm_start=False)),
                          
  ('XGBoost',
   XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                 colsample_bynode=1, colsample_bytree=1.0, gamma=0.5, gpu_id=-1,
                 importance_type='gain', interaction_constraints='',
                 learning_rate=0.300000012, max_delta_step=0, max_depth=5,
                 min_child_weight=1, missing=nan, monotone_constraints='()',
                 n_estimators=100, n_jobs=0, num_parallel_tree=1,
                 objective='binary:logistic', random_state=0, reg_alpha=0,
                 reg_lambda=1, scale_pos_weight=1, subsample=1.0,
                 tree_method='exact', validate_parameters=1, verbosity=None)),
                 
  ('GradientBoostingClassifier',
   GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                              learning_rate=0.1, loss='deviance', max_depth=3,
                              max_features=None, max_leaf_nodes=None,
                              min_impurity_decrease=0.0, min_impurity_split=None,
                              min_samples_leaf=1, min_samples_split=2,
                              min_weight_fraction_leaf=0.0, n_estimators=100,
                              n_iter_no_change=None, presort='deprecated',
                              random_state=0, subsample=1.0, tol=0.0001,
                              validation_fraction=0.1, verbose=0,
                              warm_start=False))),
                              
 0.8264604810996563)

Al fraccionar los Df, usar los algoritmos que generan las mejores predicciones y combinar los resultados, el 
submit en Kaggle da 0.80171.