In [1]:

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import random
random.seed(42)
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score


In [3]:
df_train = pd.read_csv("/content/drive/My Drive/intermediate/train.csv")
df_test = pd.read_csv("/content/drive/My Drive/intermediate/test.csv")

In [4]:
df_train

Unnamed: 0.1,Unnamed: 0,national_inv,lead_time,in_transit_qty,forecast_3_month,sales_1_month,min_bank,potential_issue,pieces_past_due,perf_6_month_avg,local_bo_qty,deck_risk,oe_constraint,ppap_risk,stop_auto_buy,rev_stop,went_on_backorder
0,1,0.477121,9.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.99,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,5,1.146128,8.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.82,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,9,0.698970,8.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.82,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,10,0.000000,2.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.91,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,15,1.079181,8.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.82,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
648016,1929905,0.477121,8.0,0.0,0.0,0.00000,2.0,0.0,0.0,0.90,0.0,0.0,0.0,0.0,1.0,0.0,0.0
648017,1929908,1.255273,4.0,0.0,0.0,0.30103,0.0,0.0,0.0,0.73,0.0,0.0,0.0,0.0,1.0,0.0,0.0
648018,1929917,1.792392,4.0,0.0,0.0,0.30103,3.0,0.0,0.0,0.73,0.0,0.0,0.0,0.0,1.0,0.0,0.0
648019,1929918,0.602060,4.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.73,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [5]:
df_train.drop(columns=["Unnamed: 0"], inplace=True, errors='ignore')
df_test.drop(columns=["Unnamed: 0"], inplace=True, errors='ignore')
df_train = df_train.replace([np.inf, -np.inf], np.nan)
df_test = df_test.replace([np.inf, -np.inf], np.nan)
df_train = df_train.dropna()
df_test = df_test.dropna()
df_train = df_train.reset_index()
df_test = df_test.reset_index()

In [6]:
X_train = df_train.drop(columns=['went_on_backorder'])
y_train = df_train['went_on_backorder']
X_test = df_test.drop(columns=['went_on_backorder'])
y_test = df_test['went_on_backorder']

In [7]:
X_train

Unnamed: 0,index,national_inv,lead_time,in_transit_qty,forecast_3_month,sales_1_month,min_bank,potential_issue,pieces_past_due,perf_6_month_avg,local_bo_qty,deck_risk,oe_constraint,ppap_risk,stop_auto_buy,rev_stop
0,0,0.477121,9.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.99,0.0,0.0,0.0,0.0,1.0,0.0
1,1,1.146128,8.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.82,0.0,0.0,0.0,0.0,1.0,0.0
2,2,0.698970,8.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.82,0.0,0.0,0.0,0.0,1.0,0.0
3,3,0.000000,2.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.91,0.0,0.0,0.0,0.0,1.0,0.0
4,4,1.079181,8.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.82,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
647946,648016,0.477121,8.0,0.0,0.0,0.00000,2.0,0.0,0.0,0.90,0.0,0.0,0.0,0.0,1.0,0.0
647947,648017,1.255273,4.0,0.0,0.0,0.30103,0.0,0.0,0.0,0.73,0.0,0.0,0.0,0.0,1.0,0.0
647948,648018,1.792392,4.0,0.0,0.0,0.30103,3.0,0.0,0.0,0.73,0.0,0.0,0.0,0.0,1.0,0.0
647949,648019,0.602060,4.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.73,0.0,0.0,0.0,0.0,1.0,0.0


In [8]:
K = 10

for fold in range(0, K):

  # Instantiate algorithm
  model = RandomForestClassifier(random_state = 42)
  scaler = MinMaxScaler()

  # Scale X data, we scale the data because it helps the algorithm to converge
  # and helps the algorithm to not be greedy with large values
  scaler.fit(X_train)
  X_train = scaler.transform(X_train)
  X_test = scaler.transform(X_test)

  # Train model
  trained_model = model.fit(X_train, y_train)

  # Generate predictions on test sample
  y_pred = trained_model.predict(X_test)

  # Compute accuracy
print ("Accuracy : ", accuracy_score(y_test, y_pred))


Accuracy :  1.0
