In [None]:
#Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#Comment out these lines if you already have these packages installed
#!(yes |pip install geopandas)
#!(yes |pip install descartes)
#!(yes |conda install -c conda-forge geoplot)
#!(yes | pip install plotly)
import plotly.express as px

%config IPCompleter.greedy=True
%matplotlib inline

plt.style.use('default') # haciendo los graficos un poco mas bonitos en matplotlib
#plt.rcParams['figure.figsize'] = (20, 10)

sns.set(style="whitegrid") # seteando tipo de grid en seaborn

pd.options.display.float_format = '{:20,.2f}'.format # suprimimos la notacion cientifica en los outputs
import warnings
warnings.filterwarnings('ignore')
from numpy import linalg as LA
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy.integrate import odeint

#Gensim
#!pip install gensim
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import KeyedVectors
from gensim import models
import gensim
from gensim import models
from gensim.models import Word2Vec
!pip install pyenchant
import enchant

#Spacy
#!pip install spacy
#!python -m spacy download en_core_web_sm
import spacy
import en_core_web_sm

#NLTK
import nltk
from nltk.corpus import stopwords
from nltk.tag import pos_tag
#nltk.download('wordnet')
#nltk.download('punkt')
#nltk.download('stopwords')

#ML imports
#!pip install catboost no se uso hasta ahora, borrarlo despues
#from catboost import CatBoostRegressor
#!pip install xgboost
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
#!pip install lightgbm
import lightgbm as lgbm
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.decomposition import PCA
from sklearn.ensemble import VotingClassifier

import import_ipynb
import RemodulesW2V

In [None]:
def _fitAndScore(model, X_train, y_train, X_test, y_test):
    '''Fit `model` on the training split and return [test accuracy, fitted model].'''
    model.fit(X_train, y_train)
    # Predictions are class labels already, so they are compared as-is
    # (no rounding, which would break non-integer or string labels).
    return [accuracy_score(y_test, model.predict(X_test)), model]


def predictionPercentage(X_train, y_train, X_test, y_test, graphic = True):
    '''Train a fixed set of classifiers and score each one on the test split.

    Parameters:
        X_train, y_train: features and labels used to fit every model.
        X_test, y_test: features and labels used to compute the accuracy.
        graphic: when True, draw a bar chart of the RandomForest feature
            importances right after that model is fitted.

    Returns:
        A dictionary mapping the model name to a two-element list
        [accuracy on the test split, fitted estimator]. The keys, in insertion
        order, are: 'RandomForest', 'NaiveBayes', 'Tree', 'XGBoost', 'MLP',
        'LGBM', 'KNN', 'GradientBoostingClassifier', 'ExtraTreesClassifier'
        and 'AdaBoost'.

    Note:
        RandomForest and MLP are built without a random_state, so their
        scores can differ between runs.
    '''
    predictions = {}

    # Random forest is handled on its own because the optional chart below
    # needs the fitted model before the remaining classifiers are trained.
    rf_model = RandomForestClassifier(criterion="entropy",
                                      max_depth=15,
                                      min_samples_leaf=10,
                                      min_samples_split=10,
                                      n_estimators=1200)
    predictions['RandomForest'] = _fitAndScore(rf_model, X_train, y_train, X_test, y_test)

    if graphic:
        # Use the column names when X_train is a DataFrame; otherwise fall
        # back to positional labels so plain arrays can be plotted too.
        labels = getattr(X_train, 'columns', None)
        if labels is None:
            labels = [str(i) for i in range(len(rf_model.feature_importances_))]
        plt.bar(labels, rf_model.feature_importances_)
        plt.xlabel('Features')
        plt.ylabel('Importancia')
        plt.title('Importancia Features con RandomForest')
        plt.xticks(rotation = 90, horizontalalignment = 'right')
        plt.show()

    # Remaining models, listed in the order they are fitted and stored.
    remaining = [
        ('NaiveBayes', GaussianNB()),
        # max_features='sqrt' is what the former 'auto' option meant for
        # classifiers; 'auto' is no longer accepted by recent scikit-learn.
        ('Tree', tree.DecisionTreeClassifier(max_features='sqrt',
                                             min_samples_leaf=11,
                                             min_samples_split=2,
                                             random_state=123)),
        ('XGBoost', xgb.XGBClassifier(colsample_bytree=1.0,
                                      gamma=0.5,
                                      max_depth=5,
                                      min_child_weight=1,
                                      subsample=1.0)),
        ('MLP', MLPClassifier(activation='tanh',
                              alpha=0.0001,
                              hidden_layer_sizes=(20, ),
                              learning_rate='adaptive',
                              solver='sgd')),
        ('LGBM', lgbm.LGBMClassifier(colsample_bytree=0.7, max_depth=20,
                                     min_split_gain=0.3, n_estimators=400,
                                     num_leaves=50, reg_alpha=1.3,
                                     reg_lambda=1.1, subsample=0.7,
                                     subsample_freq=20)),
        ('KNN', KNeighborsClassifier(n_neighbors=10, algorithm='brute',
                                     leaf_size=1, n_jobs=-1,
                                     weights='distance')),
        ('GradientBoostingClassifier', GradientBoostingClassifier(random_state=0)),
        ('ExtraTreesClassifier', ExtraTreesClassifier(n_estimators=100, random_state=0)),
        ('AdaBoost', AdaBoostClassifier(n_estimators=100, random_state=0)),
    ]
    for name, model in remaining:
        predictions[name] = _fitAndScore(model, X_train, y_train, X_test, y_test)

    # Experiments left out by the original author (notes kept for reference):
    #   - SVM: reported as not working with negatively-valued features.
    #   - BaggingClassifier(base_estimator=SVC(), n_estimators=100, random_state=1):
    #     very slow (12min 54s).
    #   - SGDClassifier(loss="log", penalty="l2", max_iter=5): works, but gives a
    #     different result on every run.

    return predictions

def printPredictions(dicc):
    '''Print every entry of `dicc` as "name: score", highest score first.

    `dicc` maps a name to a sequence whose first element is a numeric score
    (the shape returned by predictionPercentage). Each entry is followed by a
    blank line; entries with equal scores keep their dictionary order.
    '''
    def negatedScore(entry):
        # Sorting ascending on the negated score yields a descending ranking.
        _, payload = entry
        return -payload[0]

    for name, payload in sorted(dicc.items(), key=negatedScore):
        print('{}: {}\n'.format(name, payload[0]))

In [None]:
# Very good, but very slow. Figures noted by the original author:
#   CPU times: user 55min 2s, sys: 1min 34s, total: 56min 37s
#   Wall time: 33min 21s
#   0.8083989501312336  (presumably the accuracy obtained in that run)

def GaussianClassifier(X_train, y_train, X_test, y_test):
    '''Fit a Gaussian process classifier and return its accuracy on the test split.

    The model uses a 1.0 * RBF(1.0) kernel and random_state=0. Predictions are
    rounded before being compared with y_test.
    '''
    classifier = GaussianProcessClassifier(kernel=1.0 * RBF(1.0), random_state=0)
    classifier.fit(X_train, y_train)
    predicted = classifier.predict(X_test)
    return accuracy_score(y_test, predicted.round())

In [None]:
# Example of `estimators`: [('RF', clf1), ('ETC', clf2)]
def votingClassifier(estimators, X_train, y_train):
    '''Return a hard-voting VotingClassifier trained on X_train and y_train.

    `estimators` is a list of (name, estimator) pairs, as in the example above
    this function.
    '''
    ensemble = VotingClassifier(estimators=estimators, voting='hard')
    ensemble.fit(X_train, y_train)
    return ensemble

In [None]:
def selectColumns(dataFrame, posTarget):
    '''Split `dataFrame` into features and target by column position.

    Parameters:
        dataFrame: frame whose feature columns all sit after the target column.
        posTarget: position of the target column; negative values count from
            the last column, as with normal Python indexing.

    Returns:
        (X, y), where y is the column at `posTarget` and X holds every column
        that comes after it. Columns before the target are not included.
    '''
    # Normalise negative positions first: with the raw value, posTarget == -1
    # would give a slice starting at 0 and X would contain the target itself.
    targetIdx = posTarget + dataFrame.shape[1] if posTarget < 0 else posTarget
    y = dataFrame.iloc[:, targetIdx]
    X = dataFrame.iloc[:, targetIdx + 1:]
    return X, y

## Sin preprocesamiento

In [None]:
# Read the raw training set and pass it through RemodulesW2V._getDisastersDf.
# NOTE(review): the helper presumably adds the `Natural_disaster` column that
# the next cell filters on - confirm in RemodulesW2V.
train = RemodulesW2V._getDisastersDf(pd.read_csv('./train.csv'))

In [None]:
# Split the data into two frames by the Natural_disaster flag and remove the
# flag column from each one, since it is constant within a frame after the split.
# .drop() builds a new frame here rather than mutating the filtered selection
# in place, so neither result depends on in-place edits of a slice of `train`.
natDisaster = train[train.Natural_disaster == 1].drop(columns='Natural_disaster')
disaster = train[train.Natural_disaster == 0].drop(columns='Natural_disaster')

In [None]:
# Run the natural-disaster subset through RemodulesW2V.main and remove the
# Natural_disaster column from its result.
# NOTE(review): that column was already dropped in the previous cell, so main()
# presumably adds it back - confirm in RemodulesW2V.
# NOTE(review): this cell rebinds its own input; re-running it without first
# re-running the split cell would feed main() already-processed data.
natDisaster = RemodulesW2V.main(natDisaster).drop(columns='Natural_disaster')
natDisaster.head()

In [None]:
# The target is the column at position 4; the features are every column after it.
X, y = selectColumns(natDisaster, 4)

# Hold out 10% of the rows for evaluation; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=41
)

# Train every model without drawing the feature-importance chart, then list
# the accuracies from best to worst.
predictDicc = predictionPercentage(X_train, y_train, X_test, y_test, graphic=False)
printPredictions(predictDicc)