In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

In [19]:
# Read the 'random_forest' sheet of the source Excel workbook into `df`.
caminho_planilha = r'\\srv-ameixa\Setores2\Gerenciamento de Categorias\00. NOVO GC\7. Novos Produtos\Desempenho Novos Produtos\202306\Análise Novos Produtos_202206.xlsx'
nome_aba = 'random_forest'
df = pd.read_excel(caminho_planilha, sheet_name=nome_aba)

# Tratamento

In [20]:
# Data preparation: drop unlabeled rows, set column dtypes, fill selected nulls.

# Keep only rows that have a TARGET value; every other row is dropped.
df = df.dropna(subset=['TARGET'])

# Columns stored as pandas Categorical (TARGET included).
colunas_categoricas = ['Produto_novo','UF','DEPTO','SECAO','CATE',
                       'SUBCATE','TARGET']
for col in colunas_categoricas:
    df[col] = pd.Categorical(df[col])

# Columns stored as float32.
cols_numericas = ['LOJAS', 'RECEITA_POR_LOJA', 'RECEITA_POR_LOJA_SUBCATE', '%_PERDA', '%_PERDA_SUBCATE',
              'NOTA_FINAL', 'NOTA_LB_%', 'NOTA_%_PERDA', 'NOTA_CRESCIMENTO_RECEITA', 'NOTA_VENDA_VS_SUBCATE']

df[cols_numericas] = df[cols_numericas].astype('float32')

df['COD'] = df['COD'].astype('int32')

# Replace nulls with 0 in these four columns only.
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: that chained in-place form is deprecated and does not modify `df`
# when pandas Copy-on-Write is enabled.
cols_preencher_zero = ['NOTA_FINAL', '%_PERDA', 'RECEITA_POR_LOJA', '%_PERDA_SUBCATE']
df[cols_preencher_zero] = df[cols_preencher_zero].fillna(0)

# Modelagem

In [21]:
# Work on a copy so `df` keeps the original category labels.
df_processed = df.copy()
target_values = df['TARGET'].unique()
target_translation = pd.DataFrame({'TARGET': target_values})

# Categorical features with more distinct values than this are left out of the model.
MAX_VALORES_DISTINTOS = 500

# Integer-encode the categorical feature columns.
# TARGET is deliberately not re-coded here: it is encoded once, by LabelEncoder,
# so that inverse_transform can return the original labels.
colunas_ignoradas = []
for col in colunas_categoricas:
    if col == 'TARGET':
        continue

    if len(df[col].unique()) > MAX_VALORES_DISTINTOS:
        print(f"Skipping column '{col}' due to exceeding maximum distinct values.")
        # A skipped column is still unencoded, so it must not reach the classifier.
        colunas_ignoradas.append(col)
        continue

    # pd.factorize numbers values by order of first appearance (0-based, missing
    # values get -1); adding 1 yields codes 1..k, with 0 for missing values.
    # All codes are assigned in one pass: a value-by-value replacement can
    # re-match codes written by earlier iterations when the raw values are
    # themselves small integers, which merges distinct categories.
    codigos, _ = pd.factorize(df[col])
    df_processed[col] = (codigos + 1).astype('int32')

# Feature matrix: everything except the label and the skipped columns.
# NOTE(review): COD stays in the feature set, as before; if it is only an
# identifier it may be worth excluding — confirm with the data owner.
X_processed = df_processed.drop(columns=['TARGET'] + colunas_ignoradas)
y_processed = df_processed['TARGET']

# Encode the target labels as integers with a dedicated encoder.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_processed.to_numpy())

# Record which integer code corresponds to each original TARGET label.
target_translation['ENCODED'] = label_encoder.transform(np.asarray(target_values))

# Train-test split on the processed data (30% held out, fixed seed).
# NOTE(review): the split is not stratified; consider stratify=y_encoded if the
# classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(X_processed, y_encoded, test_size=0.3, random_state=42)

# Create and fit the random forest classifier
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Make predictions on the test data
y_pred = rf.predict(X_test)

# Decode predicted and true labels back to the original TARGET values
y_pred_decoded = label_encoder.inverse_transform(y_pred)
y_test_decoded = label_encoder.inverse_transform(y_test)

# Avaliação do Modelo

In [22]:
# Score the test-set predictions against the encoded true labels.
accuracy = accuracy_score(y_test, y_pred)
# Precision, recall and F1 all use the same weighted average across classes.
precision, recall, f1 = (
    metrica(y_test, y_pred, average='weighted')
    for metrica in (precision_score, recall_score, f1_score)
)
classification_rep = classification_report(y_test, y_pred)
confusion_mtx = confusion_matrix(y_test, y_pred)

# Print every metric as "<label> <value>", in the order computed above.
for rotulo, valor in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
    ("Classification Report:\n", classification_rep),
    ("Confusion Matrix:\n", confusion_mtx),
):
    print(rotulo, valor)

Accuracy: 0.9395973154362416
Precision: 0.9373474988452027
Recall: 0.9395973154362416
F1 Score: 0.9377400059744728
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.73      0.78        22
           1       0.95      0.98      0.96       127

    accuracy                           0.94       149
   macro avg       0.90      0.85      0.87       149
weighted avg       0.94      0.94      0.94       149

Confusion Matrix:
 [[ 16   6]
 [  3 124]]


In [23]:
# Attach the model's predicted class to each test row as 'Sugestao'.
# Use the predictions (y_pred_decoded), not y_test_decoded: the latter holds the
# already-known labels, so the exported suggestion would not reflect the model.
# .assign returns a new frame rather than writing into the object handed back
# by train_test_split.
X_test = X_test.assign(Sugestao=y_pred_decoded)

In [24]:
# Write the test rows (including the 'Sugestao' column) to an Excel workbook.
caminho_saida = r'C:\Users\mateus.craveiro\OneDrive - Hortifruti Natural da Terra\Área de Trabalho\RF.xlsx'
X_test.to_excel(caminho_saida)