In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score , recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


In [28]:
df = pd.read_csv('df_para_modelo.csv')

In [29]:
df.shape

(410000, 29)

In [30]:
X=df.iloc[:,:-1]
y=df.iloc[:,-1]

## Estandarizacion y  Reduccion de dimencion con PCA

In [31]:
#normalizamos los datos
scaler=StandardScaler()
scaler.fit(X) # calculo la media para poder hacer la transformacion
X_scaled=scaler.transform(X)# Ahora si, escalo los datos y los normalizo

# diminusco la dimencion percevando un 98% de varianza
pca = PCA(0.98)
X_reduc = pca.fit_transform(X_scaled) 

In [32]:
expl = len(pca.explained_variance_ratio_)
print(expl)

23


### Divido train y test

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X_reduc, y, test_size=0.30, random_state=42)

### Benchmark

In [34]:
print('El benchmark es:',y.sum()/y.count())

El benchmark es: 0.6229658536585366


Veo si se mantiene la distribucion

In [35]:
print('El benchmark es:',y_train.sum()/y_train.count())
print('El benchmark es:',y_test.sum()/y_test.count())

El benchmark es: 0.6232508710801393
El benchmark es: 0.6223008130081301


## Bagging

Elijo este modelo para poder prevenir el overfitting que pued generar la grna cantidad de varibles que tinen mi modelo

In [36]:
from sklearn.ensemble import BaggingClassifier

In [37]:
bagging = BaggingClassifier(base_estimator=DecisionTreeClassifier(), bootstrap_features=False, n_estimators= 100, n_jobs = -1)
bagging.fit(X_train, y_train)

## Evaluacion

In [40]:
def evaluar_modelo(model):
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    precision = recall_score(y_test, y_test_pred, average= 'micro')
    
    # Evaluamos
    print('Accuracy en train:',accuracy_score(y_train, y_train_pred))
    print('Accuracy en test:', accuracy_score(y_test, y_test_pred))
    
    print('Recall :',precision)

In [41]:
evaluar_modelo(bagging)

Accuracy en train: 0.9999930313588851
Accuracy en test: 0.6824959349593496
Recall : 0.6824959349593496


## Test

In [19]:
df_test = pd.read_csv('df_para_pred.csv')

In [20]:
df_test.shape

(90000, 37)

In [21]:
def crear_pred(modelo):
    scaler=StandardScaler()
    scaler.fit(df_test) # calculo la media para poder hacer la transformacion
    X_to_pred=scaler.transform(df_test)# Ahora si, escalo los datos y los normalizo
    
    pca_test = PCA(expl)#n_components= 28
    X_df_test = pca_test.fit_transform(X_to_pred)   

    pred = modelo.predict(X_df_test)
    pred = pd.DataFrame({'pred': pred})

    pred.to_csv('Heizhem.csv',index=False)

In [22]:
crear_pred(bagging)

# Modelos 2

## GridSearch

In [12]:
param_tree = {'ccp_alpha': [0.1, 0.01, 0.001],
                         'criterion': ['gini', 'entropy'],
                         'max_depth': [5, 6, 7, 8, 9],
                         'max_features': ['auto', 'sqrt', 'log2']}

In [13]:
param_svm = {'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}

In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(random_state=42) 
model = GridSearchCV(cv=5, error_score= np.nan,
             estimator=clf,
             param_grid=param_tree,
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=True)

In [49]:
model.fit(X_train, y_train)

Fitting 5 folds for each of 90 candidates, totalling 450 fits




In [50]:
model.best_estimator_

In [51]:
print("Mejores hiperparámetros: "+str(model.best_params_))
print("Mejor Score: "+str(model.best_score_)+'\n')

Mejores hiperparámetros: {'ccp_alpha': 0.001, 'criterion': 'gini', 'max_depth': 8, 'max_features': 'log2'}
Mejor Score: 0.6309860627177701



### Decision Tree

In [52]:
tree = DecisionTreeClassifier(ccp_alpha=0.001, max_depth=8, max_features='log2',
                       random_state=42)

In [53]:
tree.fit(X_train, y_train)

## Random Forest

In [56]:
from sklearn.ensemble import RandomForestClassifier

random_tre = RandomForestClassifier(n_estimators=100, max_features='sqrt', n_jobs=-1, oob_score = True, random_state = 42)
random_tre.fit(X_train,y_train)

## Booosting

In [39]:
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier

In [40]:
clf_xgb = xgb.XGBClassifier(n_estimators=150,
                            seed=42, use_label_encoder= False)
                            
# Entrenamos
clf_xgb.fit(X_train, y_train)



## SVM

In [47]:
from sklearn.svm import SVC

svm = SVC(kernel='linear', random_state=0)
svm.fit(X_train, y_train)

## KMeans

In [45]:
from sklearn.cluster import KMeans

km = KMeans(n_clusters=2, random_state=0)
km.fit(X_train)

## KNN

In [43]:
from sklearn.neighbors import KNeighborsClassifier
# Instanciamos un objeto de la clase KNeighborsClassifier

knn = KNeighborsClassifier()

# Entrenamos

knn.fit(X_train,y_train)