In [1]:
import pandas as pd
import tensorflow as tf
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
def preprocess_dataframe(df):

  df.fillna(value=0, inplace=True) #Reemplazamos NAN por 0, ya que NAN rompe a Tensorflow

  #Renombramos las columnas que tienen caracteres que TensorFlow no acepta como validos.
  #Estos particularmente son whitespace, coma y parentesis por ejemplo.
  df.rename(columns={'ASP_(converted)':'ASP_converted','Pricing, Delivery_Terms_Quote_Appr':
                    'Pricing_Delivery_Terms_Quote_Appr','Pricing, Delivery_Terms_Approved':
                    'Pricing_Delivery_Terms_Approved','Source ':'Source'},inplace=True)

  df = df[df['Stage'].isin(['Closed Won', 'Closed Lost'])]
  df.loc[:, 'Stage'].replace({'Closed Won':1, 'Closed Lost':0}, inplace=True) #0 corresponde a que el caso fue Closed Lost, 1 a que fue Closed Won. Asi tenemos un problema de clasificacion binario que puede entender la red neuronal.

  df.loc[:, 'Planned_Delivery_Start_Date'] = pd.to_datetime(df['Planned_Delivery_Start_Date'], 'coerce',
                                                                  format='%m/%d/%Y')
  df.loc[:, 'Planned_Delivery_End_Date'] = pd.to_datetime(df['Planned_Delivery_End_Date'], 'coerce',
                                                                                      format='%m/%d/%Y')
  df = df[df['Opportunity_ID'] != 9773] #Hardcodeo este filtrado porque el id 9773 tiene mal cargada la fecha de delviery end, dando una diferencia de 200 anios xd"

  #Pongo .loc porque pandas me jode con warnings que son falsos positivos de slice copy"
  #Gracias Pandas!"

  #Creamos una nueva columna (Feature Engineering) que contiene la longitud en dias 
  #estimada de la operacion. En el informe habiamos encontrado que aparentaba haber
  #una relacion cuadratica de decrecimiento a medida que aumentaban los dias donde disminuia
  #la chance de completar la operacion.
  df['Delta_Time'] = df['Planned_Delivery_End_Date'] - df['Planned_Delivery_Start_Date']
  df.loc[:, 'Delta_Time'] = df['Delta_Time'].dt.days
  df['Delta_Time'] = df.groupby('Opportunity_ID')['Delta_Time'].transform('max')

  #Pasamos todo a dolares
  currency_conversion = {'AUD':0.707612, 'EUR':1.131064, 'GBP':1.318055, 'JPY':0.008987, 'USD':1.0}
  df['Total_Taxable_Amount_Currency'] = df[['Total_Taxable_Amount_Currency']].replace(currency_conversion)
  df['Total_Taxable_Amount'] = df['Total_Taxable_Amount_Currency'] * df['Total_Taxable_Amount']

  #Modifico la columna Brand para que en vez de decir que marca es, solo diga
  #si tiene o no marca
  df.loc[df['Brand'] == 'None', 'Brand'] = 0
  df.loc[df['Brand'] != 0, 'Brand'] = 1

  #Agrego una columna que indica si tiene o no numero de contrato
  df.loc[:, 'Sales_Contract_No'][df['Sales_Contract_No'] != 'None'] = 1
  df.loc[:, 'Sales_Contract_No'][df['Sales_Contract_No'] == 'None'] = 0
  df.rename(columns={'Sales_Contract_No':'Has_Contract_Number'}, inplace=True)

  #Agrego una columna que indique la cantidad de productos que tiene esa
  #oportunidad
  df['Product_Name'] = 1
  df['Product_Amount'] = df.groupby('Opportunity_ID')['Product_Name'].transform(lambda x: x.sum())

  #Agrego una columna que indica si el owner de la cuenta es el mismo que el de la oportunidad
  #o no
  df['Same_Owner'] = (df['Account_Owner'] == df['Opportunity_Owner'])
  df['Same_Owner'] = df['Same_Owner'].replace({False:0, True:1})


  #Cambio TRF por una columna que es la suma de los TRF de la oportunidad
  df["TRF"] = df.groupby("Opportunity_ID")["TRF"].transform("mean")


  #Pruebo volar duplicados, solo cambia el producto. Si el producto no importa
  #entonces volar duplicados no deberia importar. Obviamente vuelo el producto en el que
  #quede tambien.
  df.drop_duplicates('Opportunity_ID',inplace=True)
  df.drop(columns=['Product_Name','Product_Family','Opportunity_Name'],inplace=True)


  #Normalizo las columnas numericas
  normalized_columns = ['ASP_converted','TRF','Total_Taxable_Amount', 'Product_Amount']
  for column in normalized_columns:
    df[column] = (df[column] - df[column].mean()) / df[column].std()

  #Borro columnas que tengan el mismo dato en todas las entradas, o inconsecuentes como el ID / Opportunity_ID
  #Algunas columnas borradas son porque pienso que no tienen incidencia, ir viendo.
  #TODO: Analizar si el Sales_Contract_No no es que importe el numero en si, sino si tiene
  #o no tiene numero de contrato. Por ahora no lo meto como input.
  #TODO: Ver el mismo tema con la columna 'Price', la mayoria tiene None u Other
  #y solo unos pocos tienen precio numerico. Quiza importe que tenga precio o no tenga,
  #o si no tiene precio quiza importe si es None u Other. Por ahora no lo pongo
  #como input.
  df.drop(columns=['Submitted_for_Approval', 'Last_Activity', 'ASP_(converted)_Currency', 
                  'Prod_Category_A', 'ID', 'Opportunity_ID'],inplace=True)

  #Drop columnas que quiza podamos usar pero por ahora no las uso
  df.drop(columns=['Account_Created_Date','Opportunity_Created_Date',
                  'Quote_Expiry_Date','Last_Modified_Date',
                  'Planned_Delivery_Start_Date','Planned_Delivery_End_Date',
                  'Month','Delivery_Quarter', 'Delivery_Year', 'Actual_Delivery_Date',
                  'Price','ASP','ASP_Currency','Total_Amount_Currency',
                  'Total_Amount','Total_Taxable_Amount_Currency','Currency',
                   'Product_Category_B','Last_Modified_By', 'Account_Owner',
                   'Opportunity_Owner','Account_Name','Product_Type','Size',
                   'Territory', 'Billing_Country'],
                   inplace=True)

  #Definimos que tipo de feature es cada columna

  #Debemos separar algunos de los registros para armar un set de test propio (no el de la catedra). De esta forma sabremos rapidamente
  #si nuestro modelo esta dando resultados optimos o no sin necesidad de estar subiendo el TP a Kaggle constantemente.
  #Sin embargo, no queremos usar tantos registros ya que estariamos disminuyendo el set de entrenamiento considerablemente.
  #Podemos empezar reservando 2000 registros para el test de prueba y ver que onda. Pasariamos de tener 16 mil a 14 mil 
  #registros para el set de entrenamiento, no es una perdida importantisima creo en principio, asi que arrancamos con eso.

  #Por otro lado, nuestro test de prueba deberia tener un 50 50 de Closed Won y Closed Lost, por lo que no podemos elegir asi nomas
  #al azar.

  return df

In [3]:
#Aca hago el "train" sin train/test split

df = pd.read_csv("Train_TP2_Datos_2020-2C.csv") 
df = preprocess_dataframe(df)
df.head(10)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,Region,Pricing_Delivery_Terms_Quote_Appr,Pricing_Delivery_Terms_Approved,Bureaucratic_Code_0_Approval,Bureaucratic_Code_0_Approved,Bureaucratic_Code,Source,Has_Contract_Number,Account_Type,Opportunity_Type,Quote_Type,Delivery_Terms,Brand,ASP_converted,TRF,Total_Taxable_Amount,Stage,Delta_Time,Product_Amount,Same_Owner
0,EMEA,1,1,1,1,Bureaucratic_Code_4,,0,Account_Type_2,Opportunity_Type_1,Non Binding,Delivery_Terms_2,0,0.215056,0.482815,0.547199,0,60.0,-0.427738,0
1,EMEA,0,0,0,0,Bureaucratic_Code_4,,1,Account_Type_2,Opportunity_Type_1,Non Binding,Delivery_Terms_2,0,0.227577,-0.204252,-0.172044,1,2.0,-0.427738,0
2,Americas,0,0,0,0,Bureaucratic_Code_4,Source_7,1,Account_Type_5,Opportunity_Type_1,Non Binding,Delivery_Terms_4,0,0.095303,-0.204252,-0.168476,1,0.0,-0.427738,0
3,Americas,1,0,1,0,Bureaucratic_Code_5,Source_11,0,Account_Type_5,Opportunity_Type_19,Non Binding,Delivery_Terms_1,1,0.150657,0.757642,0.724658,0,58.0,-0.427738,1
4,Americas,1,0,1,0,Bureaucratic_Code_5,Source_11,0,Account_Type_5,Opportunity_Type_19,Non Binding,Delivery_Terms_1,1,0.150657,1.513417,1.447064,0,27.0,-0.427738,1
5,Americas,1,0,1,0,Bureaucratic_Code_5,Source_11,0,Account_Type_5,Opportunity_Type_19,Non Binding,Delivery_Terms_1,1,0.150657,1.719537,1.627332,0,58.0,-0.427738,1
6,Americas,1,0,1,0,Bureaucratic_Code_5,Source_11,0,Account_Type_5,Opportunity_Type_19,Non Binding,Delivery_Terms_4,0,-0.015405,0.276695,0.145012,0,89.0,-0.427738,1
7,Americas,1,1,1,1,Bureaucratic_Code_4,Source_7,0,Account_Type_5,Opportunity_Type_1,Non Binding,Delivery_Terms_4,0,0.095303,-0.204252,-0.172514,1,0.0,-0.427738,0
8,Japan,1,0,0,0,Bureaucratic_Code_5,,0,Account_Type_2,Opportunity_Type_7,Non Binding,Delivery_Terms_4,0,-0.436097,-0.204252,-0.178169,0,0.0,1.934778,1
13,Japan,1,0,0,0,Bureaucratic_Code_5,,0,Account_Type_2,Opportunity_Type_7,Non Binding,Delivery_Terms_4,0,-0.436097,-0.204252,-0.178072,0,0.0,1.934778,1


In [4]:
#creating labelEncoder
#le = preprocessing.LabelEncoder()
categorical_columns = ["Region", "Bureaucratic_Code", "Source", "Account_Type",
                       "Opportunity_Type", "Quote_Type", "Delivery_Terms"]

#for feature in categorical_columns:
#  df[feature] = le.fit_transform(df[feature])

#df.head(10)

In [5]:
#OneHot encoding

enc = preprocessing.OneHotEncoder()#handle_unknown='ignore')
#Creo un df de cada columna categorica y lo joineo con el df original

no_enc_df = df.copy()

for feature in categorical_columns:
  enc.fit(df[[feature]])
  enc_df = pd.DataFrame(enc.transform(df[[feature]]).toarray())
  enc_df = enc_df.add_prefix(feature + "_")
  df = df.join(enc_df)

df.drop(columns=categorical_columns,inplace=True)

df.fillna(0, inplace=True)

df.head(10)

Unnamed: 0,Pricing_Delivery_Terms_Quote_Appr,Pricing_Delivery_Terms_Approved,Bureaucratic_Code_0_Approval,Bureaucratic_Code_0_Approved,Has_Contract_Number,Brand,ASP_converted,TRF,Total_Taxable_Amount,Stage,Delta_Time,Product_Amount,Same_Owner,Region_0,Region_1,Region_2,Region_3,Region_4,Bureaucratic_Code_0,Bureaucratic_Code_1,Bureaucratic_Code_2,Bureaucratic_Code_3,Bureaucratic_Code_4,Bureaucratic_Code_5,Bureaucratic_Code_6,Source_0,Source_1,Source_2,Source_3,Source_4,Source_5,Source_6,Source_7,Source_8,Source_9,Source_10,Source_11,Source_12,Source_13,Account_Type_0,...,Account_Type_3,Account_Type_4,Account_Type_5,Account_Type_6,Opportunity_Type_0,Opportunity_Type_1,Opportunity_Type_2,Opportunity_Type_3,Opportunity_Type_4,Opportunity_Type_5,Opportunity_Type_6,Opportunity_Type_7,Opportunity_Type_8,Opportunity_Type_9,Opportunity_Type_10,Opportunity_Type_11,Opportunity_Type_12,Opportunity_Type_13,Opportunity_Type_14,Opportunity_Type_15,Opportunity_Type_16,Opportunity_Type_17,Opportunity_Type_18,Opportunity_Type_19,Opportunity_Type_20,Opportunity_Type_21,Opportunity_Type_22,Opportunity_Type_23,Opportunity_Type_24,Quote_Type_0,Quote_Type_1,Delivery_Terms_0,Delivery_Terms_1,Delivery_Terms_2,Delivery_Terms_3,Delivery_Terms_4,Delivery_Terms_5,Delivery_Terms_6,Delivery_Terms_7,Delivery_Terms_8
0,1,1,1,1,0,0,0.215056,0.482815,0.547199,0,60.0,-0.427738,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,0,0,1,0,0.227577,-0.204252,-0.172044,1,2.0,-0.427738,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0,0,0,1,0,0.095303,-0.204252,-0.168476,1,0.0,-0.427738,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,1,0,1,0,0,1,0.150657,0.757642,0.724658,0,58.0,-0.427738,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,0,1,0,0,1,0.150657,1.513417,1.447064,0,27.0,-0.427738,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1,0,1,0,0,1,0.150657,1.719537,1.627332,0,58.0,-0.427738,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1,0,1,0,0,0,-0.015405,0.276695,0.145012,0,89.0,-0.427738,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7,1,1,1,1,0,0,0.095303,-0.204252,-0.172514,1,0.0,-0.427738,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
8,1,0,0,0,0,0,-0.436097,-0.204252,-0.178169,0,0.0,1.934778,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
13,1,0,0,0,0,0,-0.436097,-0.204252,-0.178072,0,0.0,1.934778,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [6]:
target = df["Stage"]
features = df.values

#Estas lineas hacen la estandarizacion a TODAS las columnas, por lo que esta mal, pero por alguna razon dan mejor que si solo normalizo las numericas
scaler = StandardScaler()
scaler.fit(features)
features = scaler.transform(features)
#X_test = scaler.transform(X_test)

model = KNeighborsClassifier(n_neighbors=98, algorithm="kd_tree", weights="distance")#Ver que algoritmo usar y los otros parametros
model.fit(features,target)

KNeighborsClassifier(algorithm='kd_tree', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=98, p=2,
                     weights='distance')

In [7]:
frio_test_df = pd.read_csv('Test_TP2_Datos_2020-2C.csv')
frio_test_df['Stage'] = 'Closed Won' #Esto esta solo para que funque todo, no lo uso. No se bien como armarlo sin los labels de Stage. TODO: Averiguar como es!
aux_df = frio_test_df[['Opportunity_ID']] #Esta columna la vuela el preprocesado sino
aux_df.drop_duplicates(subset='Opportunity_ID', inplace=True)

frio_test_df = preprocess_dataframe(frio_test_df)

frio_test_df.reset_index(inplace = True)
frio_test_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,index,Region,Pricing_Delivery_Terms_Quote_Appr,Pricing_Delivery_Terms_Approved,Bureaucratic_Code_0_Approval,Bureaucratic_Code_0_Approved,Bureaucratic_Code,Source,Has_Contract_Number,Account_Type,Opportunity_Type,Quote_Type,Delivery_Terms,Brand,ASP_converted,TRF,Total_Taxable_Amount,Stage,Delta_Time,Product_Amount,Same_Owner
0,0,EMEA,1,1,1,1,Bureaucratic_Code_4,Source_7,1,Account_Type_0,Opportunity_Type_8,Non Binding,Delivery_Terms_2,0,0.005849,-0.085748,-0.063155,1,30.0,0.836994,1
1,3,EMEA,1,1,1,1,Bureaucratic_Code_4,Source_7,1,Account_Type_0,Opportunity_Type_8,Non Binding,Delivery_Terms_2,0,0.005849,-0.080264,-0.038493,1,28.0,2.057060,1
2,8,Americas,1,1,0,0,Bureaucratic_Code_4,Source_9,1,Account_Type_2,Opportunity_Type_7,Non Binding,Delivery_Terms_4,0,0.061825,-0.085748,-0.085193,1,0.0,-0.383071,1
3,9,Americas,1,1,1,0,Bureaucratic_Code_2,Source_11,1,Account_Type_0,Opportunity_Type_1,Non Binding,Delivery_Terms_4,0,0.061825,-0.058329,0.034793,1,0.0,2.667092,1
4,15,Americas,0,0,0,0,Bureaucratic_Code_4,Source_9,1,Account_Type_0,Opportunity_Type_7,Non Binding,Delivery_Terms_4,0,0.027687,-0.085748,-0.086047,1,0.0,-0.383071,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1562,2545,EMEA,1,1,0,0,Bureaucratic_Code_4,Source_3,1,Account_Type_0,Opportunity_Type_1,Non Binding,Delivery_Terms_7,0,-0.060095,-0.085748,-0.078115,1,9.0,0.226962,1
1563,2547,Americas,1,1,0,0,Bureaucratic_Code_4,,1,Account_Type_2,Opportunity_Type_1,Non Binding,Delivery_Terms_4,0,0.047195,-0.085748,-0.083852,1,91.0,-0.383071,1
1564,2548,Americas,1,1,1,1,Bureaucratic_Code_4,,1,Account_Type_2,Opportunity_Type_1,Non Binding,Delivery_Terms_4,0,0.047195,-0.085748,-0.080776,1,91.0,-0.383071,1
1565,2549,Americas,1,1,0,0,Bureaucratic_Code_4,,0,Account_Type_2,Opportunity_Type_7,Non Binding,Delivery_Terms_4,0,0.134977,-0.085748,-0.078369,1,0.0,-0.383071,1


In [8]:
#OneHot
#enc = preprocessing.OneHotEncoder(handle_unknown='ignore')
#Creo un df de cada columna categorica y lo joineo con el df original

for feature in categorical_columns:
  enc.fit(no_enc_df[[feature]])
  enc_df = pd.DataFrame(enc.transform(frio_test_df[[feature]]).toarray())
  enc_df = enc_df.add_prefix(feature + "_")
  frio_test_df = frio_test_df.join(enc_df)

frio_test_df.fillna(0,inplace=True)

frio_test_df.drop(columns=categorical_columns + ["index"],inplace=True)#Ver si dropeo brand

frio_test_df

Unnamed: 0,Pricing_Delivery_Terms_Quote_Appr,Pricing_Delivery_Terms_Approved,Bureaucratic_Code_0_Approval,Bureaucratic_Code_0_Approved,Has_Contract_Number,Brand,ASP_converted,TRF,Total_Taxable_Amount,Stage,Delta_Time,Product_Amount,Same_Owner,Region_0,Region_1,Region_2,Region_3,Region_4,Bureaucratic_Code_0,Bureaucratic_Code_1,Bureaucratic_Code_2,Bureaucratic_Code_3,Bureaucratic_Code_4,Bureaucratic_Code_5,Bureaucratic_Code_6,Source_0,Source_1,Source_2,Source_3,Source_4,Source_5,Source_6,Source_7,Source_8,Source_9,Source_10,Source_11,Source_12,Source_13,Account_Type_0,...,Account_Type_3,Account_Type_4,Account_Type_5,Account_Type_6,Opportunity_Type_0,Opportunity_Type_1,Opportunity_Type_2,Opportunity_Type_3,Opportunity_Type_4,Opportunity_Type_5,Opportunity_Type_6,Opportunity_Type_7,Opportunity_Type_8,Opportunity_Type_9,Opportunity_Type_10,Opportunity_Type_11,Opportunity_Type_12,Opportunity_Type_13,Opportunity_Type_14,Opportunity_Type_15,Opportunity_Type_16,Opportunity_Type_17,Opportunity_Type_18,Opportunity_Type_19,Opportunity_Type_20,Opportunity_Type_21,Opportunity_Type_22,Opportunity_Type_23,Opportunity_Type_24,Quote_Type_0,Quote_Type_1,Delivery_Terms_0,Delivery_Terms_1,Delivery_Terms_2,Delivery_Terms_3,Delivery_Terms_4,Delivery_Terms_5,Delivery_Terms_6,Delivery_Terms_7,Delivery_Terms_8
0,1,1,1,1,1,0,0.005849,-0.085748,-0.063155,1,30.0,0.836994,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,1,1,1,1,0,0.005849,-0.080264,-0.038493,1,28.0,2.057060,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,1,0,0,1,0,0.061825,-0.085748,-0.085193,1,0.0,-0.383071,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,1,1,1,0,1,0,0.061825,-0.058329,0.034793,1,0.0,2.667092,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0,0,0,0,1,0,0.027687,-0.085748,-0.086047,1,0.0,-0.383071,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1562,1,1,0,0,1,0,-0.060095,-0.085748,-0.078115,1,9.0,0.226962,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1563,1,1,0,0,1,0,0.047195,-0.085748,-0.083852,1,91.0,-0.383071,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1564,1,1,1,1,1,0,0.047195,-0.085748,-0.080776,1,91.0,-0.383071,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1565,1,1,0,0,0,0,0.134977,-0.085748,-0.078369,1,0.0,-0.383071,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [9]:
features2 = frio_test_df.values

scaler.fit(features2)
features2 = scaler.transform(features2)

predictions = model.predict_proba(features2)

aux_df['Target'] = predictions[:,1]

aux_df.to_csv('prediccionesFrioFrioSinTest.csv', index=False)

In [10]:
aux_df

Unnamed: 0,Opportunity_ID,Target
0,10689,0.660109
3,10690,0.724120
8,10691,0.968648
9,10692,0.623974
15,10693,0.925301
...,...,...
2545,12364,0.575878
2547,12365,0.750203
2548,12366,0.666440
2549,12367,0.603004
