## Import Libraries

In [976]:
import pandas as pd
import os

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
#importear de preprocessing las normalizaciones
from sklearn.preprocessing import Normalizer

#dense transformer
from sklearn.svm import LinearSVC

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

#random forest
from sklearn.ensemble import RandomForestRegressor

#LogisticRegression
from sklearn.linear_model import LogisticRegression

#svm 
from sklearn.svm import SVC

from sklearn.ensemble import RandomForestClassifier

def csv(predicciones,nombre,df_test):
    y_pred = pd.DataFrame({'id':df_test['id'],'Predicted':predicciones})
    y_pred['Predicted'] = y_pred['Predicted'].astype(int)
    y_pred.to_csv(nombre,index=False)

#oversampling

from imblearn.over_sampling import SMOTE

#gridsearch y kfold
from sklearn.model_selection import GridSearchCV, KFold


## Load Data

In [980]:
directorio = '../apau-smog-prediction/'
lista = os.listdir(directorio)
for i in lista:
    exec(i.split('.')[0] + ' = pd.read_csv(directorio + i)')

In [981]:
smog = train['Smog']
train = train.drop(['id','Comb (L/100 km)','Comb (mpg)','Model','Model Year','Smog'], axis=1)
id =test_nolabel['id']
test_nolabel = test_nolabel.drop(['id','Comb (L/100 km)','Comb (mpg)','Model','Model Year'], axis=1)

In [982]:
#train_test_split
X_train, X_test, y_train, y_test = train_test_split(train, smog, test_size=0.2, random_state=42)

In [983]:
cat_cols = X_train.select_dtypes(include="object").columns.tolist()
num_cols = X_train.select_dtypes(exclude="object").columns.tolist()

cat_pipe = Pipeline([
    ("ohe", OneHotEncoder(handle_unknown="ignore"))
])

num_pipe = Pipeline([
    ("scaler", StandardScaler())
])
norm_pipe = Pipeline([
    ("norm", Normalizer())
])

preprocessor = ColumnTransformer([
    ("cat", cat_pipe, cat_cols),
    ("num", num_pipe, num_cols)
])

preprocessor_norm = ColumnTransformer([
    ("cat", cat_pipe, cat_cols),
    ("num", norm_pipe, num_cols),
    ("norm", norm_pipe, num_cols)
])


Arboles de decisión

Random Forest

In [984]:
from sklearn.linear_model import Lasso, Ridge, ElasticNet
#functiontransformer
from sklearn.preprocessing import FunctionTransformer
#cross validation
from sklearn.model_selection import cross_val_score, cross_val_predict


rf = Pipeline([
    ("preprocessor", preprocessor),
    ("rf", RandomForestClassifier( random_state=42))
])

predictions = cross_val_predict(rf, train, smog, cv=5)
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)
print(classification_report(smog, predictions))
print(f1_score(y_test, y_pred, average='macro'))

              precision    recall  f1-score   support

           0       0.87      0.79      0.83        33
           1       0.80      0.75      0.78       125
           2       0.79      0.79      0.79       197
           3       0.88      0.80      0.84       112
           4       0.71      0.84      0.77       119

    accuracy                           0.79       586
   macro avg       0.81      0.79      0.80       586
weighted avg       0.80      0.79      0.79       586

0.7977982518308527


In [929]:
#nombre de los parametros que generalicen mejor el modelo: 

gd_rf = GridSearchCV(rf, {"rf__n_estimators": [100, 200, 300, 400, 500],"rf__max_depth": [5, 10, 15, 20, 25]}, cv=5, n_jobs=-1)

In [933]:
gd_rf.fit(X_train, y_train)

In [936]:
print(classification_report(y_test, gd_rf.predict(X_test)))
print(f1_score(y_test, gd_rf.predict(X_test), average='macro'))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       0.75      0.69      0.72        26
           2       0.74      0.67      0.71        43
           3       0.78      0.78      0.78        23
           4       0.72      0.91      0.81        23

    accuracy                           0.75       118
   macro avg       0.80      0.81      0.80       118
weighted avg       0.76      0.75      0.75       118

0.8035236153030427


In [985]:
#gradient boosting
from sklearn.ensemble import GradientBoostingClassifier

gb = Pipeline([
    ("preprocessor", preprocessor),
    ("gb", GradientBoostingClassifier(random_state=42))
])

predictions = cross_val_predict(gb, train, smog, cv=5)
print(classification_report(smog, predictions))

              precision    recall  f1-score   support

           0       0.94      0.91      0.92        33
           1       0.84      0.74      0.78       125
           2       0.74      0.79      0.76       197
           3       0.80      0.75      0.77       112
           4       0.76      0.82      0.79       119

    accuracy                           0.78       586
   macro avg       0.81      0.80      0.81       586
weighted avg       0.79      0.78      0.78       586



In [986]:
#xgboost
from xgboost import XGBClassifier

xgb = Pipeline([
    ("preprocessor", preprocessor),
    ("xgb", XGBClassifier(random_state=42))
])

predictions = cross_val_predict(xgb, train, smog, cv=5)
print(classification_report(smog, predictions))

              precision    recall  f1-score   support

           0       0.91      0.91      0.91        33
           1       0.76      0.69      0.72       125
           2       0.74      0.77      0.75       197
           3       0.84      0.79      0.81       112
           4       0.70      0.76      0.73       119

    accuracy                           0.76       586
   macro avg       0.79      0.78      0.78       586
weighted avg       0.76      0.76      0.76       586



In [424]:
gd_rf.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits




In [425]:
f1_score(y_test, gd_rf.predict(X_test), average='macro')

0.7415991140642303

Transformacíón geométrica

In [987]:
svm = Pipeline([
    ("preprocessor", preprocessor),
    ("svm", SVC(class_weight='balanced', random_state=42)),
])
predictions = cross_val_predict(svm, train, smog, cv=5)
print(classification_report(smog, predictions))


              precision    recall  f1-score   support

           0       0.91      0.94      0.93        33
           1       0.77      0.72      0.74       125
           2       0.79      0.60      0.68       197
           3       0.81      0.74      0.78       112
           4       0.58      0.90      0.71       119

    accuracy                           0.73       586
   macro avg       0.77      0.78      0.77       586
weighted avg       0.76      0.73      0.73       586



In [943]:
gd_svm.fit(X_train, y_train)

10 fits failed out of a total of 70.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\andre\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\andre\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "c:\Users\andre\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\svm\_base.py", line 180, in fit
    self._validate_params()
  File "c:\Users\andre\AppData\Local\Programs\Python\Python

In [944]:
f1_score(y_test, gd_svm.predict(X_test), average='macro')

0.7676791254842591

In [988]:
#KNN
from sklearn.neighbors import KNeighborsClassifier
KNN = Pipeline([
    ("preprocessor", preprocessor),
    ("knn", KNeighborsClassifier()),
])

predictions = cross_val_predict(KNN, train, smog, cv=5)
print(classification_report(smog, predictions))



              precision    recall  f1-score   support

           0       0.75      0.73      0.74        33
           1       0.69      0.66      0.67       125
           2       0.69      0.69      0.69       197
           3       0.76      0.77      0.76       112
           4       0.65      0.69      0.67       119

    accuracy                           0.70       586
   macro avg       0.71      0.71      0.71       586
weighted avg       0.70      0.70      0.70       586



In [989]:
#mas transformaciones geometricas
from sklearn.preprocessing import PolynomialFeatures
poly = Pipeline([
    ("preprocessor", preprocessor),
    ("poly", PolynomialFeatures(degree=2)),
    ("lr", LogisticRegression())
])

poly.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [956]:
f1_score(y_test, poly.predict(X_test), average='macro')

0.7727898949842109

Gaussian Naive Bayes

In [959]:
train_dummies = pd.get_dummies(train)
test_dummies = pd.get_dummies(test_nolabel)

In [960]:
#que aparecezcan las mismas columnas en train y test
train_dummies, test_dummies = train_dummies.align(test_dummies, join='left', axis=1)

In [962]:
gnb = GaussianNB()
gnb.fit(train_dummies, smog)

In [964]:
def standard_scale(df):
    '''Devuelve un dataframe con las columnas escaladas'''
    scaler = StandardScaler()
    string = df.select_dtypes(include=['object']).columns #solo vamos a escalar las columnas que no son string
    df_no_string = df.drop(string,axis=1)
    df_no_string = scaler.fit_transform(df_no_string)
    df_no_string = pd.DataFrame(df_no_string,columns=df.drop(string,axis=1).columns)
    df_escaled = df.copy()
    df_escaled[df_no_string.columns] = df_no_string
    return df_escaled

train_escaled = standard_scale(train)
test_escaled = standard_scale(test_nolabel)

In [965]:
train_dummies = pd.get_dummies(train)
test_dummies = pd.get_dummies(test_nolabel)

In [966]:
train_dummies, test_dummies = train_dummies.align(test_dummies, join='left', axis=1)

In [969]:
gnb = GaussianNB()
gnb.fit(train_dummies, smog, sample_weight=[1.5 if i==0 else 1 for i in smog])
f1_score(smog, gnb.predict(train_dummies), average='macro')

0.6114565413658589

Stacking, Bagging, Boosting y Voting


Stacking

In [1022]:
#stacking
from sklearn.ensemble import StackingClassifier

#DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
estimators = [
    ('rf', rf),
    ('svm', svm),
    ('knn', KNN)
]
estimators_1 = [
    ('rf', rf),
    ('svm', svm),
    ('knn', KNN)
]
#para el estimador final se puede usar un random forest
stack = StackingClassifier(estimators=estimators, final_estimator=DecisionTreeClassifier())
stack1 = StackingClassifier(estimators=estimators_1, final_estimator=DecisionTreeClassifier())

stack.fit(X_train, y_train)

In [1013]:
f1_score(y_test, stack.predict(X_test), average='macro')

0.7444666845504633

voting

In [1026]:
#voting
from sklearn.ensemble import VotingClassifier
estimators = [
    ('rf', rf),
    ('svm', svm),
    ('knn', KNN),
    ('xgb', xgb),
    ('gb', gb),
    ('poly', poly),
    ('stack', stack),
    ('stack1', stack1)

]
#para el estimador final se puede usar un random forest
voting = VotingClassifier(estimators=estimators)

In [1027]:
voting.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [1028]:
f1_score(y_test, voting.predict(X_test), average='macro')

0.8046688310254201

bagging

In [1033]:
#la libreria d pasting es la misma que la de bagging
from sklearn.ensemble import PastingClassifier

pasting = PastingClassifier(estimators=estimators, n_jobs=-1)


ImportError: cannot import name 'PastingClassifier' from 'sklearn.ensemble' (c:\Users\andre\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\ensemble\__init__.py)

In [1029]:
predictions = voting.predict(test_nolabel)

In [1030]:
test = test_nolabel.copy()
test['id'] = id
csv(predictions,'voting.csv',test)

In [907]:
#quiero combinar todos los knn, svm y random forest para ello voy a hacer un voting classifier o un stacking classifier
from sklearn.ensemble import StackingClassifier
#boosting
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
#XGBClassifier
from xgboost import XGBClassifier

ada = Pipeline([
    ("preprocessor", preprocessor),
    ("ada", GradientBoostingClassifier()),
])  

#PARAMETROS DE GRADIEN BOOSTING QUE regularizan: 
#n_estimators: numero de arboles: cuantos mas arboles mas complejo es el modelo y menos generaliza un buen numero es 100 ya 
#learning_rate: cuanto mas pequeño mas robusto es el modelo, mas robusto sig

gd_ada = GridSearchCV(ada, param_grid={"ada__n_estimators":[100,200],"ada__max_depth":[2,3]},cv=cv, scoring="f1_macro")
gd_svm = GridSearchCV(svm, param_grid={"svm__C":[0.01]},cv=cv, scoring="f1_macro")#para un C ma


estimators = [
    ('svm', svm),
    ('ada', rf)
]

#KNN
from sklearn.neighbors import KNeighborsClassifier

#kfold blending
from sklearn.model_selection import KFold

#stacking con k
from sklearn.ensemble import StackingClassifier

#k
k = 5
#kfold
kf = KFold(n_splits=k, shuffle=True, random_state=42)



clf_1 = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression(),cv=kf, n_jobs=-1)



In [908]:
clf_1.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [909]:
predictions = clf_1.predict(X_test)

In [910]:
f1_score(y_test, predictions, average='macro')

0.7671836564519492

In [769]:
confusion_matrix(smog, predictions)

array([[ 33,   0,   0,   0,   0],
       [  0, 124,   0,   0,   1],
       [  0,   0, 197,   0,   0],
       [  0,   0,   1, 111,   0],
       [  0,   0,   0,   0, 119]], dtype=int64)

In [764]:
predictions = clf_1.predict(test_nolabel)

In [766]:
test = test_nolabel.copy()
test['id'] = id
csv(predictions,'rf_svm_logreg_2.csv',test)

In [631]:
clf_1.fit(X_train,y_train)#para el stacking: train, smog, para el voting: X_train, y_train 


In [632]:
f1_score(y_test, clf_1.predict(X_test), average='macro')

0.7878656126482213

In [610]:
#tengo clf1 ahora quiero mejorar el modelo con bagging o boosting
#boosting
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier

clf_2 = AdaBoostClassifier(base_estimator=clf_1, n_estimators=100, random_state=42)




ValueError: Pipeline.fit does not accept the sample_weight parameter. You can pass parameters to specific steps of your pipeline using the stepname__parameter format, e.g. `Pipeline.fit(X, y, logisticregression__sample_weight=sample_weight)`.

ValueError: Pipeline.fit does not accept the sample_weight parameter. You can pass parameters to specific steps of your pipeline using the stepname__parameter format, e.g. `Pipeline.fit(X, y, logisticregression__sample_weight=sample_weight)`.

In [628]:
predicciones = clf_1.predict(test_nolabel)

In [629]:
test = test_nolabel.copy()
test['id'] = id
csv(predicciones,'ada_rf_log_kf.csv',test)

In [558]:
#redes neuronales que pueden ser interesantes para un dataset pequeño son: mlp, perceptron, mlpclassifier
from sklearn.neural_network import MLPClassifier, MLPRegressor
#import mlp y perceptron
from sklearn.linear_model import Perceptron

mlp = Pipeline([
    ("preprocessor", preprocessor),
    ("mlp", Perceptron()),
])

cv = KFold(n_splits=10, shuffle=True, random_state=42)
mlp = GridSearchCV(mlp, param_grid={"mlp__n_features_in_":[1,5,20]},cv=cv, scoring="f1_macro")
mlp.fit(X_train, y_train)

ValueError: Invalid parameter 'n_features_in_' for estimator Perceptron(). Valid parameters are: ['alpha', 'class_weight', 'early_stopping', 'eta0', 'fit_intercept', 'l1_ratio', 'max_iter', 'n_iter_no_change', 'n_jobs', 'penalty', 'random_state', 'shuffle', 'tol', 'validation_fraction', 'verbose', 'warm_start'].

In [556]:
f1_score(y_test, mlp.predict(X_test), average='macro')

0.7763275613275613

In [490]:
ada.fit(X_train, y_train)

In [491]:
f1_score(y_test, ada.predict(X_test), average='macro')

0.8017932614254072

Gaussian Naive Bayes

In [334]:
#GaussianNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline

# Supongamos que X_train es una matriz dispersa
X_train_dense = X_train.to_numpy()
gnb = Pipeline([
    ("preprocessor", preprocessor),
    ("gnb", GaussianNB())
])

#atributos de pipeline: steps, named_steps, get_params, set_params

AttributeError: 'Pipeline' object has no attribute 'preprocessor'

In [318]:
gnb.fit(X_train, y_train)

TypeError: np.matrix is not supported. Please convert to a numpy array with np.asarray. For more information see: https://numpy.org/doc/stable/reference/generated/numpy.matrix.html

In [262]:
#gridsearch con random forest

param_grid = {
    "rf__n_estimators": [10, 50, 100],
    "rf__max_features": ["auto", "sqrt", "log2"],
}

gd = GridSearchCV(rf, param_grid=param_grid, cv=5, n_jobs=-1)

In [263]:
gd.fit(X_train, y_train)

In [265]:
gd.best_params_

{'rf__max_features': 'sqrt', 'rf__n_estimators': 100}

In [None]:
RandomForestClas

In [271]:
# random forest regression
rf = Pipeline([
    ("preprocessor", preprocessor),
    ("rf", RandomForestClassifier(n_estimators=100, random_state=69))
    
])



In [272]:
rf.fit(X_train, y_train)

In [273]:
#adaboost
from sklearn.ensemble import AdaBoostClassifier

ada = Pipeline([
    ("preprocessor", preprocessor),
    ("ada", RandomForestClassifier())
    
])

ada.fit(X_train, y_train)




In [274]:
ada_pred = ada.predict(X_test)
f1_score(y_test, ada_pred, average='macro')

0.7558194875875032

In [143]:
bag.fit(X_train, y_train)



ValueError: Specifying the columns using strings is only supported for pandas DataFrames

In [137]:
f1_score(y_train, y_pred, average='macro')

0.6395502890291678

In [114]:
#quiero hacer un grid search para ver si puedo mejorar el modelo
param_grid = {
    'rf__n_estimators': [100, 200, 300, 1000],
    'rf__max_features': ['auto', 'sqrt', 'log2'],
    'rf__max_depth' : [4,5,6,7,8],
    'rf__criterion' :['gini', 'entropy']
}

grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 5, n_jobs = -1, verbose = 2)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 120 candidates, totalling 600 fits


In [115]:
grid_search.best_params_

{'rf__criterion': 'entropy',
 'rf__max_depth': 8,
 'rf__max_features': 'log2',
 'rf__n_estimators': 100}

In [116]:
#ahora que tengo los mejores parametros, voy a probar el modelo con ellos
rf = Pipeline([
    ("preprocessor", preprocessor),
    ("rf", RandomForestClassifier(n_estimators=100, random_state=69, max_depth=8, max_features='log2', criterion='entropy'))
])

rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

In [131]:
f1_score(y_test, y_pred, average='macro')

0.7859236889646701

In [238]:
# svm
svm = Pipeline([
    ("preprocessor", preprocessor),
    ("svm", SVC())
])
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

svm.fit(X_resampled, y_resampled).score(X_test, y_test)

svm.fit(X_train, y_train).score(X_test, y_test)


0.7457627118644068

In [106]:
f1_score(y_test, y_pred, average='macro')

0.775311184364853

In [107]:
#kkN
from sklearn.neighbors import KNeighborsClassifier
knn = Pipeline([
    ("preprocessor", preprocessor),
    ("knn", KNeighborsClassifier())
])

knn.fit(X_train, y_train).score(X_test, y_test)





0.7288135593220338

In [111]:
## random forest regression
rfr = Pipeline([
    ("preprocessor", preprocessor),
    ("rfr", RandomForestRegressor(n_estimators=100, random_state=69))
])

rfr.fit(X_train, y_train).score(X_test, y_test)


0.5305976629284697

In [239]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=69)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)


ValueError: could not convert string to float: 'Porsche'

In [304]:
clf.fit(X_resampled, y_resampled)

In [305]:
f1_score(y_test, clf.predict(X_test), average='macro')

0.8161055726638482

In [306]:
y_pred = clf.predict(train)

In [307]:
f1_score(smog, y_pred, average='macro')

0.9571675146193688

In [308]:
#confusion matrix
confusion_matrix(smog, y_pred)

array([[ 33,   0,   0,   0,   0],
       [  0, 116,   4,   0,   5],
       [  0,   3, 184,   7,   3],
       [  0,   0,   3, 107,   2],
       [  0,   0,   3,   0, 116]], dtype=int64)

In [229]:
predicciones = clf.predict(test_nolabel)

In [180]:
test_nolabel['id'] = id

In [230]:
csv(predicciones,'rf_svm__knn_logreg_todo.csv',test_nolabel)

In [159]:
#confusion matrix
confusion_matrix(smog, y_pred)

array([[ 31,   2,   0,   0,   0],
       [  6,  81,  22,   2,  14],
       [  7,  17, 158,   9,   6],
       [  0,   0,  12,  98,   2],
       [  0,   8,  12,   5,  94]], dtype=int64)

In [16]:
predicciones = rf.predict(test_nolabel)

KeyError: "['Smog'] not in index"

In [None]:
#knn-

In [644]:
index_0 = smog[smog==0].index
index_1 = smog[smog==1].index
index_2 = smog[smog==2].index
index_3 = smog[smog==3].index
index_4 = smog[smog==4].index


In [645]:
print(len(index_0))
print(len(index_1))
print(len(index_2))
print(len(index_3))
print(len(index_4))

33
125
197
112
119


In [648]:
0.60*125

75.0

In [701]:
#quiero coger valores aleatorios de index_0
import random
import numpy as np
random.seed(80)

index_0 = random.sample(list(index_0), 33)
index_1 = random.sample(list(index_1), int(np.round(125*0.60)))
index_2 = random.sample(list(index_2), int(np.round(197*0.60)))
index_3 = random.sample(list(index_3), int(np.round(112*0.60)))
index_4 = random.sample(list(index_4), int(np.round(119*0.60)))


In [696]:
#eliminar make
train = train.drop(['Make'], axis=1)

In [714]:
X_train = train.iloc[index_0+index_1+index_2+index_3+index_4]
y_train = smog.iloc[index_0+index_1+index_2+index_3+index_4]

In [703]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)#cuantos mas splits mejor ya 

In [704]:
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())

In [720]:
X_train

Unnamed: 0,Vehicle Class,Engine Size (L),Cylinders,Transmission,Fuel Type,Fuel Consumption City (L/100 km),Hwy (L/100 km),CO2 Emissions (g/km)
129,Two-seater,2.0,4,AM7,Z,11.0,8.7,233
278,Two-seater,2.0,4,AM7,Z,11.2,8.7,236
69,Two-seater,5.2,10,AM7,Z,17.9,12.1,356
348,Mid-size,6.2,8,A8,Z,18.6,11.3,359
337,Mid-size,3.8,8,A8,Z,17.4,11.9,348
...,...,...,...,...,...,...,...,...
493,Subcompact,1.5,3,AM7,Z,8.2,6.2,170
216,Mid-size,2.0,4,M5,X,10.1,7.5,209
508,SUV: Small,2.0,4,AS8,X,11.4,8.3,233
325,SUV: Small,1.5,4,A9,X,9.2,7.8,202


In [721]:
clf.fit(train,test_nolabel)

TypeError: '<' not supported between instances of 'int' and 'str'

In [688]:
X_test = train.drop(index_0+index_1+index_2+index_3+index_4)
y_test = smog.drop(index_0+index_1+index_2+index_3+index_4)

In [689]:
y_pred = clf.predict(X_test)

In [690]:
y_pred 

array([2, 4, 4, 4, 3, 3, 2, 2, 2, 3, 2, 2, 3, 2, 4, 1, 3, 3, 2, 3, 4, 1,
       1, 2, 2, 1, 1, 3, 1, 4, 1, 3, 4, 4, 1, 4, 4, 2, 2, 3, 0, 3, 2, 4,
       1, 2, 2, 3, 2, 3, 3, 1, 3, 2, 4, 4, 4, 1, 3, 4, 4, 1, 3, 2, 1, 4,
       2, 2, 2, 3, 1, 3, 4, 2, 2, 1, 2, 3, 4, 2, 1, 4, 2, 2, 4, 2, 4, 1,
       4, 3, 4, 1, 3, 2, 2, 2, 3, 4, 3, 4, 3, 3, 2, 4, 1, 2, 2, 1, 4, 1,
       2, 2, 1, 3, 2, 3, 1, 2, 2, 4, 4, 4, 1, 2, 3, 2, 2, 3, 2, 2, 3, 2,
       4, 2, 1, 4, 1, 1, 4, 2, 1, 2, 3, 0, 2, 2, 4, 2, 4, 2, 3, 4, 1, 3,
       3, 3, 1, 4, 3, 2, 2, 3, 2, 2, 2, 3, 2, 1, 2, 4, 2, 3, 2, 2, 2, 3,
       2, 2, 1, 2, 3, 1, 3, 2, 4, 2, 3, 2, 3, 1, 2, 2, 3, 1, 2, 1, 1, 1,
       2, 3, 4, 2, 4, 4, 2, 3, 2, 4, 1, 2, 3, 3, 2, 1, 1, 4, 3, 3, 4, 2,
       1, 1], dtype=int64)

In [691]:
f1_score(y_test, y_pred, average='macro')

0.5911142666094998

In [692]:
predicciones = clf.predict(test_nolabel)

In [694]:
test = test_nolabel.copy()
test['id'] = id

csv(predicciones,'balanceo.csv',test)

In [662]:
confusion_matrix(y_test, y_pred)    

array([[ 0,  0,  0,  0,  0],
       [ 2, 35, 10,  0,  3],
       [ 0,  6, 56, 10,  7],
       [ 0,  0,  2, 40,  3],
       [ 0,  2,  7,  3, 36]], dtype=int64)