In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
color = sns.color_palette()
sns.set_style('white')
pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x)) #Limiting floats output to 3 decimal points

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold, ShuffleSplit, cross_val_score, cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier, RandomTreesEmbedding, VotingClassifier
from sklearn.linear_model import LogisticRegressionCV, RidgeClassifierCV, Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from xgboost import XGBClassifier as xgbc

In [14]:
data = pd.read_csv("data_modelado.csv")

In [15]:
pd.set_option("display.max_columns", None)
data.head()

Unnamed: 0,PLAN,TIPO_PLAN,MODALIDAD_APROVECHAMIENTO,UBIGEO,DEPARTAMENTO,PROVINCIA,DISTRITO,MACROREGION,SUPERFICIE,TITULAR,TIPO_PERSONA,AREA_TH,%AREA_TH,AREA_POA,%AREA_POA,NUM_ARBOLES_APROBADOS,CANT_ESPECIES_APROBADOS,VOLUMEN_APROBADO,FECHA_SUPERVISION,VOLUMEN_MOVILIZADO,ARBOLES_SUPERVISADOS,ARBOLES_INEXISTENTES,VOLUMEN_ILEGAL,VOLUMEN_LEGAL,ARBOLES_ILEGAL,%VOLUMEN_MOVILIZADO,%VOLUMEN_ILEGAL,%VOLUMEN_LEGAL,ILEGAL,OBSERVATORIO,OBSERVATORIO_COD
0,DEMA,DEMA,Comunidad Nativa,10401,AMAZONAS,CONDORCANQUI,NIEVA,ORIENTE,448200.0,COMUNIDAD NATIVA YAMAKENTSA,PERSONA JURIDICA,287.65,0.001,287.65,0.001,58,4,648.29,2017-10-17,405.91,54,2,167.331,238.579,0,0.626,0.412,0.588,1,LISTA VERDE,1
1,DEMA,DEMA,Comunidad Nativa,10401,AMAZONAS,CONDORCANQUI,NIEVA,ORIENTE,448200.0,COMUNIDAD NATIVA WAISIM,PERSONA JURIDICA,221.33,0.0,221.33,0.0,72,9,635.98,2017-10-09,460.28,80,2,69.86,390.42,8,0.724,0.152,0.848,1,LISTA VERDE,1
2,DEMA,DEMA,Comunidad Nativa,10403,AMAZONAS,CONDORCANQUI,RIO SANTIAGO,ORIENTE,803500.0,COMUNIDAD NATIVA FORTALEZA,PERSONA JURIDICA,633.33,0.001,633.33,0.001,35,1,467.81,2019-11-04,467.73,44,0,201.733,265.997,9,1.0,0.431,0.569,1,LISTA ROJA,0
3,DEMA,DEMA,Comunidad Nativa,10205,AMAZONAS,BAGUA,IMAZA,ORIENTE,443100.0,COMUNIDAD NATIVA SAWIENTSA,PERSONA JURIDICA,157.91,0.0,157.91,0.0,71,2,649.908,2020-02-22,562.72,63,0,171.0,244.396,0,0.866,0.304,0.434,1,LISTA VERDE,1
4,DEMA,DEMA,Comunidad Nativa,10205,AMAZONAS,BAGUA,IMAZA,ORIENTE,443100.0,COMUNIDAD NATIVA UMPUNCHIG,PERSONA JURIDICA,321.46,0.001,321.46,0.001,62,3,649.973,2019-10-24,649.79,76,0,600.097,49.693,14,1.0,0.924,0.076,1,LISTA ROJA,0


In [16]:
data.drop(["TIPO_PLAN","OBSERVATORIO","FECHA_SUPERVISION"], axis=1, inplace=True)

In [17]:
data["%VOLUMEN_MOVILIZADO"].fillna(0, inplace=True)
data["%VOLUMEN_ILEGAL"].fillna(0, inplace=True)
data["%VOLUMEN_LEGAL"].fillna(0, inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6400 entries, 0 to 6399
Data columns (total 28 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   PLAN                       6400 non-null   object 
 1   MODALIDAD_APROVECHAMIENTO  6400 non-null   object 
 2   UBIGEO                     6400 non-null   int64  
 3   DEPARTAMENTO               6400 non-null   object 
 4   PROVINCIA                  6400 non-null   object 
 5   DISTRITO                   6400 non-null   object 
 6   MACROREGION                6400 non-null   object 
 7   SUPERFICIE                 6400 non-null   float64
 8   TITULAR                    6400 non-null   object 
 9   TIPO_PERSONA               6400 non-null   object 
 10  AREA_TH                    6400 non-null   float64
 11  %AREA_TH                   6400 non-null   float64
 12  AREA_POA                   6400 non-null   float64
 13  %AREA_POA                  6400 non-null   float

In [18]:
data.shape
data["UBIGEO"] = data["UBIGEO"].astype("object")
feat_num=data.select_dtypes(include=[np.number])
feat_obj=data.select_dtypes(include=["object"])
feat_num.columns

Index(['SUPERFICIE', 'AREA_TH', '%AREA_TH', 'AREA_POA', '%AREA_POA',
       'NUM_ARBOLES_APROBADOS', 'CANT_ESPECIES_APROBADOS', 'VOLUMEN_APROBADO',
       'VOLUMEN_MOVILIZADO', 'ARBOLES_SUPERVISADOS', 'ARBOLES_INEXISTENTES',
       'VOLUMEN_ILEGAL', 'VOLUMEN_LEGAL', 'ARBOLES_ILEGAL',
       '%VOLUMEN_MOVILIZADO', '%VOLUMEN_ILEGAL', '%VOLUMEN_LEGAL', 'ILEGAL',
       'OBSERVATORIO_COD'],
      dtype='object')

In [19]:
data = pd.get_dummies(data, columns=[col for col in data.columns if col not in feat_num.columns]).reset_index(drop=True)
data.columns

Index(['SUPERFICIE', 'AREA_TH', '%AREA_TH', 'AREA_POA', '%AREA_POA',
       'NUM_ARBOLES_APROBADOS', 'CANT_ESPECIES_APROBADOS', 'VOLUMEN_APROBADO',
       'VOLUMEN_MOVILIZADO', 'ARBOLES_SUPERVISADOS',
       ...
       'TITULAR_ZELADA CALDERON JOSE MARTIN',
       'TITULAR_ZEVALLOS MOZOMBITE IRMA',
       'TITULAR_ZEVALLOS ORTIZ FERNANDO ZENON', 'TITULAR_ZUIGA PFURO CORSINO',
       'TITULAR_ZUMAETA FLORES IVAN', 'TITULAR_ZUMAETA RAMIREZ JOSE',
       'TITULAR_µLVAREZ BACA JOS FELIX', 'TITULAR_µVILA DE ARVALO BELN',
       'TIPO_PERSONA_PERSONA JURIDICA', 'TIPO_PERSONA_PERSONA NATURAL'],
      dtype='object', length=4376)

In [20]:
X=data.drop("OBSERVATORIO_COD", axis=1)
y=data[["OBSERVATORIO_COD"]]
print(X.shape)
y.head()

(6400, 4375)


Unnamed: 0,OBSERVATORIO_COD
0,1
1,1
2,0
3,1
4,0


In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=10)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(5120, 4375)
(1280, 4375)
(5120, 1)
(1280, 1)


In [22]:
MLA = [
    AdaBoostClassifier(),
    BaggingClassifier(),
    ExtraTreesClassifier(),
    GradientBoostingClassifier(),
     
    LogisticRegressionCV(solver='lbfgs', max_iter=1000),
    RidgeClassifierCV(),
    Perceptron(),

    #KNeighborsClassifier(n_neighbors=3),
    SVC(),
    DecisionTreeClassifier(),
    ExtraTreeClassifier(),
    xgbc() 
]

In [23]:
MLA1 = [
    #make_pipeline(RobustScaler(),AdaBoostClassifier()),
    #make_pipeline(RobustScaler(),BaggingClassifier()),
     
    make_pipeline(RobustScaler(),LogisticRegressionCV(solver='lbfgs', max_iter=1000)),
    make_pipeline(RobustScaler(),RidgeClassifierCV()),

    #make_pipeline(RobustScaler(),SVC()),
    #make_pipeline(RobustScaler(),DecisionTreeClassifier()),
    #make_pipeline(RobustScaler(),xgbc()) 
]

In [24]:
result=[]
name=[]
result1=[]
name1=[]

cv_split = ShuffleSplit(n_splits = 20, test_size = .2, train_size = .7, random_state = 0 ) # run model 10x with 60/30 split intentionally leaving out 10%

MLA_columns = ['MLA Name', 'MLA Parameters','MLA Score']
MLA_compare = pd.DataFrame(columns = MLA_columns)

row_index = 0
for alg in MLA:
    MLA_name = alg.__class__.__name__
    MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
    MLA_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())
    cv_results = cross_val_score(alg, X_train, y_train, cv  = cv_split, scoring="accuracy")
    MLA_compare.loc[row_index, 'MLA Score'] = cv_results.mean()
    MLA_compare.loc[row_index, 'MLA Std'] = cv_results.std()
    result.append(cv_results)
    name.append(MLA_name)
    row_index+=1

row_index = 0
for alg in MLA1:
    cv_results1 = cross_val_score(alg, X_train, y_train, cv  = cv_split, scoring="accuracy")
    MLA_compare.loc[row_index, 'MLA Score1'] = cv_results1.mean()
    MLA_compare.loc[row_index, 'MLA Std1'] = cv_results1.std()
    result1.append(cv_results1)
    row_index+=1 

MLA_compare.sort_values(by = ['MLA Score'], ascending = False, inplace = True, ignore_index=True)

MLA_compare

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

In [None]:
ax=sns.boxplot(result, orient="h")
ax.set_yticklabels(name)
plt.show()

ax=sns.boxplot(result1, orient="h")
ax.set_yticklabels(name)
plt.show()