# Selección de variables


In [69]:
import pandas as pd
import numpy as np

import missingno as msno
import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split 
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
import datetime
from dateutil.parser import parse
from sklearn.tree import  DecisionTreeClassifier
import plotly.graph_objects as go
import umap
from imblearn.over_sampling import RandomOverSampler

In [70]:
# Seteo jupyter notebook
pd.set_option('display.max_columns', None)

In [71]:
Data=pd.read_csv("cleaned_05June2022.csv").drop(columns=["Unnamed: 0"]).fillna("")



Columns (36) have mixed types.Specify dtype option on import or set low_memory=False.



In [72]:
# datos categorical
categorical=list(Data.select_dtypes(include=["category","object"]).columns)

In [73]:
# Replace valores categoricos por númericos
for i in categorical:
    dic={}
    k=0
    for j in Data[i].unique():
        k=k+1
        dic[j]=k
    Data[i].replace(dic,inplace=True)

In [74]:
# Features - labels

features = Data.drop(columns=["Target"]).copy() 
labels = Data.loc[:,"Target"].copy()

# Generación de conjuntos de train y test
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.33,random_state=50)


**Normalización de variables**

In [75]:
# Columns transformer
preprocessing_transformer = ColumnTransformer(
    transformers=[
        ('MinMaxScaler',MinMaxScaler(),
         features.columns),
    ])

pipe = Pipeline(
    [("preprocesamiento", preprocessing_transformer)]
)

In [76]:
# Entrenamiento de pípeline
X_train=pd.DataFrame(pipe.fit_transform(X_train),columns=features.columns)

In [77]:
# Sobremuestreo: Aumentar numero de datos de la clase minoritaria
# ROS(Duplica clases):
ros=RandomOverSampler(random_state=0) # Random_state=0

In [78]:
# ROS:
X_train, y_train=ros.fit_resample(X_train,y_train)

In [79]:
# Según el aporte de información con el criterio gini de un arbol de decisión
fs = DecisionTreeClassifier(criterion='gini', max_depth=10, random_state=1)
fs.fit(X_train, y_train.astype(int))

DecisionTreeClassifier(max_depth=10, random_state=1)

In [80]:
var_num = 30
table = pd.DataFrame({'Tags': features.columns, 'Value': fs.feature_importances_})
table['Abs_Value'] = table['Value'].abs()
table = table[table['Abs_Value']>0].sort_values(by='Abs_Value', ascending=False, ignore_index=True)
tree_fs = table.loc[:var_num, 'Tags'].to_list()

In [81]:
# Plot
fig = go.Figure([go.Bar(x=table["Tags"], y=table["Abs_Value"])])

fig.update_layout(
    title_text="Importancia de variables",
)

fig.show()

 **Matriz de correlación**

In [82]:
correlation=(X_train.join(y_train)).corr()

In [83]:
aux_corr=pd.DataFrame(correlation["Target"].sort_values(ascending=False)).query("index!='Target'")

In [84]:
aux_corr

Unnamed: 0,Target
C-C,0.176361
Positivo,0.158486
D-F,0.149808
Principalidad,0.122645
Ciudad,0.115978
A-K,0.110405
lec_C-D,0.10333
num_comunicaciones,0.093075
B,0.087074
Frequency,0.081547


In [85]:
aux_corr.query("Target>0.1 and index!='Target'").index

Index(['C-C', 'Positivo', 'D-F', 'Principalidad', 'Ciudad', 'A-K', 'lec_C-D'], dtype='object')

In [86]:
# Corr
correlations = X_train.corr()

fig=px.imshow(correlations,
          labels=dict(x="", y="", color="Correlation"),
          x=X_train.columns,
          y=X_train.columns,
          zmin=-1,
          zmax=1,
          color_continuous_scale="Inferno")

fig.layout.height = 1000
fig.layout.width = 1000

fig.show()

In [87]:
def corrkill(dataframe, features, corr_cut=0.6):

    df = dataframe[features]
    dfcorr = pd.DataFrame(np.triu(df.corr()), columns=df.columns, index=df.columns)
    dfcorr = dfcorr.stack().reset_index()
    dfcorr.columns = ['Feat1','Feat2','Val']
    dfcorr = dfcorr[~dfcorr['Val'].isin([0,1])]
    dfcorr = dfcorr[dfcorr['Val'].abs()>corr_cut]
    
    return list(dfcorr['Feat2']), dfcorr

In [88]:
a,b=corrkill(X_train,X_train.columns,corr_cut=0.5)

In [89]:
b.sort_values('Val',ascending=False)

Unnamed: 0,Feat1,Feat2,Val
455,Frequency,Negativo,0.996173
161,lec_E-E,lec_D-E,0.895741
81,C,lec_B-B,0.827984
449,Frequency,A-K,0.650428
806,A-K,Negativo,0.641857
728,G-K,Negativo,0.638654
571,F-D,Positivo,0.631029
447,Frequency,G-K,0.62177
688,F-J,Positivo,0.585813
83,C,lec_D-E,0.570245


In [90]:
collinearity, table_cor=corrkill(features, tree_fs)
tag_select = [i for i in tree_fs if i not in collinearity]
tags_view_corr = features[tag_select].copy().reset_index(drop=True)

In [91]:
tag_select

['E-F',
 'D-F',
 'C-C',
 'A-K',
 'A-G',
 'num_comunicaciones',
 'Recency',
 'Positivo',
 'B',
 'Monto',
 'Periodicity',
 'Profesion',
 'lec_B-B',
 'Renta',
 'F-I',
 'Edad',
 'lec_A-A',
 'Comuna',
 'Sin Info',
 'Ciudad',
 'Duracion_Campaña',
 'Principalidad',
 'F-H',
 'lec_C-D']

In [68]:
# Corr
correlations = tags_view_corr.corr()

fig=px.imshow(correlations,
          labels=dict(x="", y="", color="Correlation"),
          x=tag_select,
          y=tag_select,
          zmin=-1,
          zmax=1,
          color_continuous_scale="Inferno")

fig.layout.height = 700
fig.layout.width = 700
fig.update_xaxes(
        tickangle = 45)

fig.show()

# Visualización de baja dimensionalidad

In [45]:
Data=pd.read_csv("cleaned_05June2022.csv").drop(columns=["Unnamed: 0"]).fillna("")[tag_select+["Target"]]


Columns (36) have mixed types.Specify dtype option on import or set low_memory=False.



In [46]:
# datos númericos
numerical=list(Data.select_dtypes(exclude=["category","object"]).columns)



# datos categorical
categorical=list(Data.select_dtypes(include=["category","object"]).columns)
categorical.remove("Target")


In [47]:
for i in categorical:
    Data[i]=Data[i].astype(str)

In [48]:
# Pipeline provisorio para pre-procesar los datos y luego realizar reducción de dimensionalidad

# Columns transformer
preprocessing_transformer = ColumnTransformer(
    transformers=[
        ('OneHotEncoder', OneHotEncoder(handle_unknown="ignore"),  
          categorical),
        ('StandardScaler', MinMaxScaler(),
         numerical),
    ])

#Pipelines

pipe_process_2d_umap = Pipeline(
    [("preprocesamiento", preprocessing_transformer),("reducción de dimensionalidad",
                                                      umap.UMAP(n_components=2, random_state=42))]
)

pipe_process_3d_umap = Pipeline(
    [("preprocesamiento", preprocessing_transformer),("reducción de dimensionalidad",
                                                      umap.UMAP(n_components=3, random_state=42))]
)

In [49]:
# Dataframes de baja dimensionalidad

data_process_2d_umap=pd.DataFrame(pipe_process_2d_umap.fit_transform(Data),columns=['x_umap','y_umap'])
data_process_3d_umap=pd.DataFrame(pipe_process_3d_umap.fit_transform(Data),columns=['x_umap','y_umap','z_umap'])


[1mThe TBB threading layer requires TBB version 2021 update 1 or later i.e., TBB_INTERFACE_VERSION >= 12010. Found TBB_INTERFACE_VERSION = 11103. The TBB threading layer is disabled.[0m



In [50]:
Data["Target"].replace("","Nada",inplace=True)

In [51]:
data_process_2d_umap["Target"]=Data["Target"]
data_process_3d_umap["Target"]=Data["Target"]


In [52]:
# estimated_sells
px.scatter(data_process_2d_umap, x='x_umap', y='y_umap',color="Target", title='2d UMAP')

In [53]:
# estimated_sells
fig = px.scatter_3d(
    data_process_3d_umap,
    x='x_umap',
    y='y_umap',
    z='z_umap',
    color="Target",
    title='3d UMAP'
)
fig.show()