In [1]:
import pandas as pd
import tensorflow as tf
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

In [2]:
def preprocess_dataframe(df):

  df.fillna(value=0, inplace=True) #Reemplazamos NAN por 0, ya que NAN rompe a Tensorflow

  #Renombramos las columnas que tienen caracteres que TensorFlow no acepta como validos.
  #Estos particularmente son whitespace, coma y parentesis por ejemplo.
  df.rename(columns={'ASP_(converted)':'ASP_converted','Pricing, Delivery_Terms_Quote_Appr':
                    'Pricing_Delivery_Terms_Quote_Appr','Pricing, Delivery_Terms_Approved':
                    'Pricing_Delivery_Terms_Approved','Source ':'Source'},inplace=True)

  df = df[df['Stage'].isin(['Closed Won', 'Closed Lost'])]
  df.loc[:, 'Stage'].replace({'Closed Won':1, 'Closed Lost':0}, inplace=True) #0 corresponde a que el caso fue Closed Lost, 1 a que fue Closed Won. Asi tenemos un problema de clasificacion binario que puede entender la red neuronal.

  df.loc[:, 'Planned_Delivery_Start_Date'] = pd.to_datetime(df['Planned_Delivery_Start_Date'], 'coerce',
                                                                  format='%m/%d/%Y')
  df.loc[:, 'Planned_Delivery_End_Date'] = pd.to_datetime(df['Planned_Delivery_End_Date'], 'coerce',
                                                                                      format='%m/%d/%Y')
  df = df[df['Opportunity_ID'] != 9773] #Hardcodeo este filtrado porque el id 9773 tiene mal cargada la fecha de delivery end, dando una diferencia de 200 anios xd"

  #Pongo .loc porque pandas me jode con warnings que son falsos positivos de slice copy"
  #Gracias Pandas!"

  #Creamos una nueva columna (Feature Engineering) que contiene la longitud en dias 
  #estimada de la operacion. En el informe habiamos encontrado que aparentaba haber
  #una relacion cuadratica de decrecimiento a medida que aumentaban los dias donde disminuia
  #la chance de completar la operacion.
  df['Delta_Time'] = df['Planned_Delivery_End_Date'] - df['Planned_Delivery_Start_Date']
  df.loc[:, 'Delta_Time'] = df['Delta_Time'].dt.days
  df['Delta_Time'] = df['Delta_Time'].replace({np.nan:10.0}) #Reemplazo con 10 porque los que no tienen fecha final ganan el 60%, y el analisis de los datos da que el 60% es maso a los 10 dias. Asi no jodo el resto de los datos
  df['Delta_Time'] = df.groupby('Opportunity_ID')['Delta_Time'].transform('max')

  #Pasamos todo a dolares
  currency_conversion = {'AUD':0.707612, 'EUR':1.131064, 'GBP':1.318055, 'JPY':0.008987, 'USD':1.0}
  df['Total_Taxable_Amount_Currency'] = df[['Total_Taxable_Amount_Currency']].replace(currency_conversion)
  df['Total_Taxable_Amount'] = df['Total_Taxable_Amount_Currency'] * df['Total_Taxable_Amount']

  #Modifico la columna Brand para que en vez de decir que marca es, solo diga
  #si tiene o no marca. Es importante aclarar que verificamos que siempre que una oportunidad
  #tiene un producto con marca entonces todos sus productos tienen marca. Esto se cumple
  #tanto en el set de entrenamiento como en el de test, por lo tanto al hacer drop_duplicates
  #no nos va a pasar nunca el caso donde nos pudieramos quedar con una entrada de producto
  #sin marca mientras que algun otro producto si tuviera, ya que confirmamos que o todos tienen
  #marca o ninguno tiene.
  df.loc[df['Brand'] == 'None', 'Brand'] = 0
  df.loc[df['Brand'] != 0, 'Brand'] = 1

  #Agrego una columna que indica si tiene o no numero de contrato
  df.loc[:, 'Sales_Contract_No'][df['Sales_Contract_No'] != 'None'] = 1
  df.loc[:, 'Sales_Contract_No'][df['Sales_Contract_No'] == 'None'] = 0
  df.rename(columns={'Sales_Contract_No':'Has_Contract_Number'}, inplace=True)

  #Agrego una columna que indique la cantidad de productos que tiene esa
  #oportunidad
  df['Product_Name'] = 1
  df['Product_Amount'] = df.groupby('Opportunity_ID')['Product_Name'].transform(lambda x: x.sum())

  #Agrego una columna que indica si el owner de la cuenta es el mismo que el de la oportunidad
  #o no
  df['Same_Owner'] = (df['Account_Owner'] == df['Opportunity_Owner'])
  df['Same_Owner'] = df['Same_Owner'].replace({False:0, True:1})

  #Agrego una columna que indica si tiene o no fecha de expiracion
  df['Quote_Expiry_Date'] = (df['Quote_Expiry_Date'] != 'NaT')
  df.rename(columns={'Quote_Expiry_Date':'Has_Expiry_Date'}, inplace=True)
  df['Has_Expiry_Date'] = df['Has_Expiry_Date'].replace({True:1,False:0})

  #Reemplazo las 4 columnas de aprobacion por solo 2 columnas que indiquen si tuvo la aprobacion
  #de delivery y burocratica o no. Recalco que si nunca la necesito seria equivalente a si
  #la necesito y la consiguio.
  df['Delivery_Approved'] = df['Pricing_Delivery_Terms_Quote_Appr'] + df['Pricing_Delivery_Terms_Approved']
  df['Delivery_Approved'] = df['Delivery_Approved'].replace({0:1, 1:0, 2:1})
  df['Bureaucratic_Code_Approved'] = df['Bureaucratic_Code_0_Approval'] + df['Bureaucratic_Code_0_Approved']
  df['Bureaucratic_Code_Approved'] = df['Bureaucratic_Code_Approved'].replace({0:1, 1:0, 2:1})
  df['Approved'] = df['Delivery_Approved'] | df['Bureaucratic_Code_Approved']

  #Cambio TRF por una columna que es el valor medio de los TRF de la oportunidad
  df["TRF"] = df.groupby("Opportunity_ID")["TRF"].transform("mean")

  #Pruebo volar duplicados, solo cambia el producto. Si el producto no importa
  #entonces volar duplicados no deberia importar. Obviamente vuelo el producto en el que
  #quede tambien.
  df.drop_duplicates('Opportunity_Name',inplace=True)
  df.drop(columns=['Product_Name','Product_Family','Opportunity_Name'],inplace=True)

  #Normalizo las columnas numericas
  normalized_columns = ['ASP_converted','TRF','Total_Taxable_Amount', 'Product_Amount', 'Delta_Time']
  for column in normalized_columns:
    df[column] = (df[column] - df[column].mean()) / df[column].std()

  #Borro columnas que tengan el mismo dato en todas las entradas, o inconsecuentes como el ID / Opportunity_ID
  #Algunas columnas borradas son porque pienso que no tienen incidencia, ir viendo.
  #TODO: Analizar si el Sales_Contract_No no es que importe el numero en si, sino si tiene
  #o no tiene numero de contrato. Por ahora no lo meto como input.
  #TODO: Ver el mismo tema con la columna 'Price', la mayoria tiene None u Other
  #y solo unos pocos tienen precio numerico. Quiza importe que tenga precio o no tenga,
  #o si no tiene precio quiza importe si es None u Other. Por ahora no lo pongo
  #como input.
  df.drop(columns=['Submitted_for_Approval', 'Last_Activity', 'ASP_(converted)_Currency', 
                  'Prod_Category_A', 'ID', 'Opportunity_ID', 'Actual_Delivery_Date'],inplace=True)

  #Drop columnas que quiza podamos usar pero por ahora no las uso
  df.drop(columns=['Account_Created_Date','Opportunity_Created_Date',
                  'Last_Modified_Date',
                  'Planned_Delivery_Start_Date','Planned_Delivery_End_Date',
                  'Month',
                   'Delivery_Year','Region',
                  'Price','ASP','ASP_Currency','Total_Amount_Currency',
                  'Total_Amount','Total_Taxable_Amount_Currency','Currency',
                   'Product_Category_B','Last_Modified_By', 'Account_Owner',
                   'Opportunity_Owner','Account_Name','Product_Type','Size',
                   'Territory', 'Billing_Country', 'Pricing_Delivery_Terms_Quote_Appr',
                   'Pricing_Delivery_Terms_Approved', 'Bureaucratic_Code_0_Approval',
                   'Bureaucratic_Code_0_Approved',
                  'Delivery_Approved','Bureaucratic_Code_Approved',
                   'Same_Owner','Total_Taxable_Amount','Product_Amount','ASP_converted',
                   'Has_Expiry_Date','Delivery_Quarter','Quote_Type','Approved','TRF']
                   ,inplace=True)

  #Definimos que tipo de feature es cada columna

  #Debemos separar algunos de los registros para armar un set de test propio (no el de la catedra). De esta forma sabremos rapidamente
  #si nuestro modelo esta dando resultados optimos o no sin necesidad de estar subiendo el TP a Kaggle constantemente.
  #Sin embargo, no queremos usar tantos registros ya que estariamos disminuyendo el set de entrenamiento considerablemente.
  #Podemos empezar reservando 2000 registros para el test de prueba y ver que onda. Pasariamos de tener 16 mil a 14 mil 
  #registros para el set de entrenamiento, no es una perdida importantisima creo en principio, asi que arrancamos con eso.

  #Por otro lado, nuestro test de prueba deberia tener un 50 50 de Closed Won y Closed Lost, por lo que no podemos elegir asi nomas
  #al azar.

  return df

In [3]:
df = pd.read_csv("Train_TP2_Datos_2020-2C.csv") 
df = preprocess_dataframe(df)
df.head(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,Bureaucratic_Code,Source,Has_Contract_Number,Account_Type,Opportunity_Type,Delivery_Terms,Brand,Stage,Delta_Time
0,Bureaucratic_Code_4,,0,Account_Type_2,Opportunity_Type_1,Delivery_Terms_2,0,0,0.762596
1,Bureaucratic_Code_4,,1,Account_Type_2,Opportunity_Type_1,Delivery_Terms_2,0,1,-0.430272
2,Bureaucratic_Code_4,Source_7,1,Account_Type_5,Opportunity_Type_1,Delivery_Terms_4,0,1,-0.471406
3,Bureaucratic_Code_5,Source_11,0,Account_Type_5,Opportunity_Type_19,Delivery_Terms_1,1,0,0.721462
4,Bureaucratic_Code_5,Source_11,0,Account_Type_5,Opportunity_Type_19,Delivery_Terms_1,1,0,0.083895
5,Bureaucratic_Code_5,Source_11,0,Account_Type_5,Opportunity_Type_19,Delivery_Terms_1,1,0,0.721462
6,Bureaucratic_Code_5,Source_11,0,Account_Type_5,Opportunity_Type_19,Delivery_Terms_4,0,0,1.35903
7,Bureaucratic_Code_4,Source_7,0,Account_Type_5,Opportunity_Type_1,Delivery_Terms_4,0,1,-0.471406
8,Bureaucratic_Code_5,,0,Account_Type_2,Opportunity_Type_7,Delivery_Terms_4,0,0,-0.471406
13,Bureaucratic_Code_5,,0,Account_Type_2,Opportunity_Type_7,Delivery_Terms_4,0,0,-0.471406


In [4]:
#Setup para el Label Encoding

le = preprocessing.LabelEncoder()
categorical_columns = ["Bureaucratic_Code", "Source", "Account_Type",
                       "Opportunity_Type", "Delivery_Terms"]

no_enc_df = df.copy() #Creo este aux para mantener el encoding para las predicciones

In [5]:
#Le aplico la codificacion a las columnas categoricas

for feature in categorical_columns:
  le.fit(df[feature]) #Calcula los codigos de cada valor de la serie que recibe
  df[feature] = le.transform(df[feature]) #Le asigno al df los valores codificados

#df.head(10)

In [6]:
target = df["Stage"].copy()
df.drop(columns=["Stage"],inplace=True)

X_train, X_test, y_train, y_test = train_test_split(df.values, target, test_size=0.04) #Separa el df en sets de "training" y validacion

#Estas lineas hacen la estandarizacion a TODAS las columnas, por lo que esta mal, pero por alguna razon dan mejor que si solo normalizo las numericas
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [7]:
log_loss_min = 100
curr_log_loss = 0
best_accuracy = 0
curr_accuracy = 0
best_n = 0

model = KNeighborsClassifier()
best_model = model #Creo este aux para despues hacer las predicciones, ya que sino model queda con el ultimo i

for n in range(95,110):
  print("n =", n)
  model = KNeighborsClassifier(n_neighbors=n, algorithm="kd_tree", weights="distance")#Ver que algoritmo usar y los otros parametros
  model.fit(X_train,y_train)
  y_pred = model.predict(X_test)
  curr_log_loss = metrics.log_loss(y_test, y_pred)
  curr_accuracy = metrics.accuracy_score(y_test, y_pred)
  print(curr_log_loss, curr_accuracy)
  if (curr_log_loss < log_loss_min):
        log_loss_min = curr_log_loss
        best_accuracy = curr_accuracy
        best_n = n
        best_model = model


print(best_n, log_loss_min, best_accuracy)
    

n = 95
1.2335481262832955 0.9642857142857143
n = 96
1.2335481262832955 0.9642857142857143
n = 97
1.2335481262832955 0.9642857142857143
n = 98
1.2335481262832955 0.9642857142857143
n = 99
1.2335481262832955 0.9642857142857143
n = 100
1.2335481262832955 0.9642857142857143
n = 101
1.2335481262832955 0.9642857142857143
n = 102
1.2335481262832955 0.9642857142857143
n = 103
1.2335481262832955 0.9642857142857143
n = 104
1.2335481262832955 0.9642857142857143
n = 105
1.2335481262832955 0.9642857142857143
n = 106
1.2335481262832955 0.9642857142857143
n = 107
1.2335481262832955 0.9642857142857143
n = 108
1.2335481262832955 0.9642857142857143
n = 109
1.2335481262832955 0.9642857142857143
95 1.2335481262832955 0.9642857142857143


In [8]:
#Aca arranca la parte de las predicciones

frio_test_df = pd.read_csv('Test_TP2_Datos_2020-2C.csv')
frio_test_df['Stage'] = 'Closed Won' #Esto esta solo para que funque todo, no lo uso. No se bien como armarlo sin los labels de Stage. TODO: Averiguar como es!
pred_df = frio_test_df[['Opportunity_ID']] #Esta columna la vuela el preprocesado sino
pred_df.drop_duplicates(subset='Opportunity_ID', inplace=True)
frio_test_df = preprocess_dataframe(frio_test_df)

#frio_test_df.head(15)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [9]:
#Codifico las columnas categoricas

for feature in categorical_columns:
  le.fit(no_enc_df[feature])
  frio_test_df[feature] = le.transform(frio_test_df[feature])

frio_test_df.head(15)

Unnamed: 0,Bureaucratic_Code,Source,Has_Contract_Number,Account_Type,Opportunity_Type,Delivery_Terms,Brand,Stage,Delta_Time
0,4,12,1,0,23,2,0,1,0.245481
3,4,12,1,0,23,2,0,1,0.192431
8,4,13,1,2,22,4,0,1,-0.550276
9,2,4,1,0,1,4,0,1,-0.550276
15,4,13,1,0,22,4,0,1,-0.550276
16,4,13,1,0,22,4,0,1,-0.550276
17,2,6,0,0,23,1,0,1,1.890046
20,2,6,0,0,23,1,0,1,1.890046
24,4,6,0,0,23,1,0,1,1.890046
29,4,12,0,0,22,1,0,1,1.890046


In [10]:
target = frio_test_df["Stage"].copy()
frio_test_df.drop(columns=["Stage"],inplace=True)

features = frio_test_df.values

#Escalo todas las columnas. Como ya puse antes, en teoria esta mal, pero da mejor
scaler.fit(features)
features = scaler.transform(features)

predictions = best_model.predict_proba(features)

pred_df['Target'] = predictions[:,1]

pred_df.to_csv('prediccionesFrioFrio.csv', index=False)

In [11]:
#pred_df

In [12]:
#Aca hago el "train" sin train/test split

df = pd.read_csv("Train_TP2_Datos_2020-2C.csv") 
df = preprocess_dataframe(df)

no_enc_df = df.copy()

for feature in categorical_columns:
  le.fit(df[feature])
  df[feature] = le.transform(df[feature])

#df.head(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [13]:
target = df["Stage"].copy()
df.drop(columns=["Stage"],inplace=True)

features = df.values

#Estas lineas hacen la estandarizacion a TODAS las columnas, por lo que esta mal, pero por alguna razon dan mejor que si solo normalizo las numericas
scaler = StandardScaler()
scaler.fit(features)
features = scaler.transform(features)

best_model.fit(features,target)

KNeighborsClassifier(algorithm='kd_tree', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=95, p=2,
                     weights='distance')

In [14]:
frio_test_df = pd.read_csv('Test_TP2_Datos_2020-2C.csv')
frio_test_df['Stage'] = 'Closed Won' #Esto esta solo para que funque todo, no lo uso. No se bien como armarlo sin los labels de Stage. TODO: Averiguar como es!
aux_df = frio_test_df[['Opportunity_ID']] #Esta columna la vuela el preprocesado sino
aux_df.drop_duplicates(subset='Opportunity_ID', inplace=True)

frio_test_df = preprocess_dataframe(frio_test_df)

for feature in categorical_columns:
  le.fit(no_enc_df[feature])
  frio_test_df[feature] = le.transform(frio_test_df[feature])

#frio_test_df.head(15)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [15]:
target = frio_test_df["Stage"].copy()
frio_test_df.drop(columns=["Stage"],inplace=True)
test_features = frio_test_df.values

scaler = StandardScaler()
scaler.fit(test_features)
test_features = scaler.transform(test_features)

predictions = best_model.predict_proba(test_features)

aux_df['Target'] = predictions[:,1]

aux_df.to_csv('prediccionesFrioFrioSinTest.csv', index=False)

In [16]:
aux_df

Unnamed: 0,Opportunity_ID,Target
0,10689,0.969865
3,10690,0.970079
8,10691,0.954997
9,10692,0.927974
15,10693,0.926891
...,...,...
2545,12364,0.990232
2547,12365,0.835656
2548,12366,0.835656
2549,12367,0.101387
