# **ML Regression Model**

In [1]:
# importing the libraries
import pandas as pd
import pickle
from collections import Counter

In [2]:
# reading the cleaned data csv file
df1 = pd.read_csv("cleaned_copper_dataset.csv")

In [3]:
# features
df1.columns

Index(['id', 'item_date', 'quantity tons', 'customer', 'country', 'status',
       'item type', 'application', 'thickness', 'width', 'material_ref',
       'product_ref', 'delivery date', 'selling_price', 'item_date_year',
       'item_date_month', 'item_date_day', 'delivery_date_year',
       'delivery_date_month', 'delivery_date_day'],
      dtype='object')

In [4]:
# input features(X)
X = df1[['quantity tons', 'customer', 'country',"status", "item type",
        'application', 'thickness', 'width',
       'product_ref','item_date_year',
       'item_date_month', 'item_date_day', 'delivery_date_year',
       'delivery_date_month', 'delivery_date_day']]
# output feature(Y)
Y = df1["selling_price"]

In [5]:
# shape of the dataset
X.shape

(181620, 15)

In [6]:
# encoding categorical variables
status_ohe = pd.get_dummies(X["status"])
item_type_ohe = pd.get_dummies(X["item type"])

In [7]:
# Concatenating the encoded variables to the dataframe and dropping the original categorical features
X = pd.concat([X, status_ohe, item_type_ohe], axis = 1)
X.drop(columns = ["status", "item type"], axis = 1, inplace = True)
X.head()

Unnamed: 0,quantity tons,customer,country,application,thickness,width,product_ref,item_date_year,item_date_month,item_date_day,...,To be approved,Won,Wonderful,IPL,Others,PL,S,SLAWR,W,WI
0,3.991779,30156308.0,28.0,2.302585,0.693147,7.31322,1670798778,2021,4,1,...,False,True,False,False,False,False,False,False,True,False
1,6.643822,30202938.0,25.0,3.713572,-0.223144,7.098376,1668701718,2021,4,1,...,False,True,False,False,False,False,False,False,True,False
2,5.956169,30153963.0,30.0,3.332205,-0.967584,6.858565,628377,2021,4,1,...,False,True,False,False,False,False,False,False,False,True
3,5.310301,30349574.0,32.0,4.077537,0.832909,7.183112,1668701718,2021,4,1,...,False,True,False,False,False,False,True,False,False,False
4,6.666354,30211560.0,28.0,2.302585,1.386294,7.600902,640665,2021,4,1,...,False,True,False,False,False,False,False,False,True,False


In [8]:
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor

# train test split(train - 70% and test - 30%)
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.3, random_state = 42)

# algorithms
algorithms = [XGBRegressor, RandomForestRegressor, ExtraTreesRegressor, 
              GradientBoostingRegressor, HistGradientBoostingRegressor,]

for i in algorithms:
    model = i()
    model.fit(X_train, Y_train)

    pred_y_train = model.predict(X_train)
    pred_y_test = model.predict(X_test)

    print("Algorithm:", i.__name__)
    
    from sklearn.metrics import mean_squared_error
    print("Train MSE:", mean_squared_error(Y_train, pred_y_train),"----->", "Test MSE:", mean_squared_error(Y_test, pred_y_test))         

    from sklearn.metrics import r2_score
    print("Train r2_score:", r2_score(Y_train, pred_y_train),"-->", "Test r2_score:", r2_score(Y_test, pred_y_test))
    print("\n")

Algorithm: XGBRegressor
Train MSE: 0.004743554324163595 -----> Test MSE: 0.005093367061423735
Train r2_score: 0.9849932691177563 --> Test r2_score: 0.9842296106392963


Algorithm: RandomForestRegressor
Train MSE: 0.001051588027123614 -----> Test MSE: 0.00413825160866528
Train r2_score: 0.9966731911466374 --> Test r2_score: 0.9871868965353996


Algorithm: ExtraTreesRegressor
Train MSE: 6.692788957008762e-29 -----> Test MSE: 0.004554863897446539
Train r2_score: 1.0 --> Test r2_score: 0.9858969565158995


Algorithm: GradientBoostingRegressor
Train MSE: 0.010236527750697713 -----> Test MSE: 0.008687958056915449
Train r2_score: 0.9676156724208219 --> Test r2_score: 0.9730998218556193


Algorithm: HistGradientBoostingRegressor
Train MSE: 0.007770047045918459 -----> Test MSE: 0.005966665931074346
Train r2_score: 0.9754186424372759 --> Test r2_score: 0.9815256501674581




##  out of all the five models, random forest regressor is the best model as it is having lower mse and higher r2 score.

In [9]:
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.3, random_state = 42)

rf = RandomForestRegressor(min_samples_split = 4)  
rf.fit(X_train, Y_train)

pred_y_train = rf.predict(X_train)
pred_y_test = rf.predict(X_test)                                                           
                                                                        
from sklearn.metrics import mean_squared_error
print("Train MSE:", mean_squared_error(Y_train, pred_y_train))       
print("Test MSE :", mean_squared_error(Y_test, pred_y_test))               
print("\n")
                                                                                                                
from sklearn.metrics import r2_score                                                     
print("Train r2_score:", r2_score(Y_train, pred_y_train))                                 
print("Test r2_score :", r2_score(Y_test, pred_y_test))

Train MSE: 0.0015836737610479348
Test MSE : 0.004131089471263345


Train r2_score: 0.994989882203677
Test r2_score : 0.9872090723758842


## saving the best model (random forest)

In [12]:
with open("regression_model.pkl", "wb") as file:
    pickle.dump(rf, file)

# **ML Classification Model**

In [13]:
# reading the cleaned dataset csv file
df2 = pd.read_csv("cleaned_copper_dataset.csv")
df2.head()

Unnamed: 0,id,item_date,quantity tons,customer,country,status,item type,application,thickness,width,material_ref,product_ref,delivery date,selling_price,item_date_year,item_date_month,item_date_day,delivery_date_year,delivery_date_month,delivery_date_day
0,EC06F063-9DF0-440C-8764-0B0C05A4F6AE,20210401.0,3.991779,30156308.0,28.0,Won,W,2.302585,0.693147,7.31322,DEQ1 S460MC,1670798778,20210701.0,6.749931,2021,4,1,2021,7,1
1,4E5F4B3D-DDDF-499D-AFDE-A3227EC49425,20210401.0,6.643822,30202938.0,25.0,Won,W,3.713572,-0.223144,7.098376,0000000000000000000000000000000000104991,1668701718,20210401.0,6.953684,2021,4,1,2021,4,1
2,E140FF1B-2407-4C02-A0DD-780A093B1158,20210401.0,5.956169,30153963.0,30.0,Won,WI,3.332205,-0.967584,6.858565,S0380700,628377,20210101.0,6.468211,2021,4,1,2021,1,1
3,F8D507A0-9C62-4EFE-831E-33E1DA53BB50,20210401.0,5.310301,30349574.0,32.0,Won,S,4.077537,0.832909,7.183112,DX51D+ZM310MAO 2.3X1317,1668701718,20210101.0,6.64379,2021,4,1,2021,1,1
4,4E1C4E78-152B-430A-8094-ADD889C9D0AD,20210401.0,6.666354,30211560.0,28.0,Won,W,2.302585,1.386294,7.600902,2_S275JR+AR-CL1,640665,20210301.0,6.357842,2021,4,1,2021,3,1


In [14]:
# getting only those rows which are having "Won" and "Lost" values in status feature(target feature)
new_df = df2.loc[(df2["status"] == "Won") | (df2["status"] == "Lost")]
new_df.head()

Unnamed: 0,id,item_date,quantity tons,customer,country,status,item type,application,thickness,width,material_ref,product_ref,delivery date,selling_price,item_date_year,item_date_month,item_date_day,delivery_date_year,delivery_date_month,delivery_date_day
0,EC06F063-9DF0-440C-8764-0B0C05A4F6AE,20210401.0,3.991779,30156308.0,28.0,Won,W,2.302585,0.693147,7.31322,DEQ1 S460MC,1670798778,20210701.0,6.749931,2021,4,1,2021,7,1
1,4E5F4B3D-DDDF-499D-AFDE-A3227EC49425,20210401.0,6.643822,30202938.0,25.0,Won,W,3.713572,-0.223144,7.098376,0000000000000000000000000000000000104991,1668701718,20210401.0,6.953684,2021,4,1,2021,4,1
2,E140FF1B-2407-4C02-A0DD-780A093B1158,20210401.0,5.956169,30153963.0,30.0,Won,WI,3.332205,-0.967584,6.858565,S0380700,628377,20210101.0,6.468211,2021,4,1,2021,1,1
3,F8D507A0-9C62-4EFE-831E-33E1DA53BB50,20210401.0,5.310301,30349574.0,32.0,Won,S,4.077537,0.832909,7.183112,DX51D+ZM310MAO 2.3X1317,1668701718,20210101.0,6.64379,2021,4,1,2021,1,1
4,4E1C4E78-152B-430A-8094-ADD889C9D0AD,20210401.0,6.666354,30211560.0,28.0,Won,W,2.302585,1.386294,7.600902,2_S275JR+AR-CL1,640665,20210301.0,6.357842,2021,4,1,2021,3,1


In [15]:
# the shape of the data after doing the above step
new_df.shape

(150427, 20)

In [16]:
# mapping "won" with 1 and "lost" with 0
new_df.loc[:, "status"] = new_df.loc[:,"status"].map({"Won": 1, "Lost": 0})

In [17]:
new_df["status"] = new_df["status"].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["status"] = new_df["status"].astype(int)


In [18]:
new_df.reset_index(inplace =  True, drop = True)

In [19]:
# input features(X)
X = new_df[['quantity tons', 'customer', 'country', "item type",
        'application', 'thickness', 'width',
       'product_ref','item_date_year',
       'item_date_month', 'item_date_day', 'delivery_date_year',
       'delivery_date_month', 'delivery_date_day', "selling_price"]]
# output feature(Y)
Y = new_df["status"]

In [20]:
# categorical encoding
item_type_ohe = pd.get_dummies(X["item type"])

In [21]:
# concatenating the encoded feature to the dataframe
X = pd.concat([X, item_type_ohe], axis = 1)
X.drop(columns = ["item type"], axis = 1, inplace = True)
X.head()

Unnamed: 0,quantity tons,customer,country,application,thickness,width,product_ref,item_date_year,item_date_month,item_date_day,...,delivery_date_month,delivery_date_day,selling_price,IPL,Others,PL,S,SLAWR,W,WI
0,3.991779,30156308.0,28.0,2.302585,0.693147,7.31322,1670798778,2021,4,1,...,7,1,6.749931,False,False,False,False,False,True,False
1,6.643822,30202938.0,25.0,3.713572,-0.223144,7.098376,1668701718,2021,4,1,...,4,1,6.953684,False,False,False,False,False,True,False
2,5.956169,30153963.0,30.0,3.332205,-0.967584,6.858565,628377,2021,4,1,...,1,1,6.468211,False,False,False,False,False,False,True
3,5.310301,30349574.0,32.0,4.077537,0.832909,7.183112,1668701718,2021,4,1,...,1,1,6.64379,False,False,False,True,False,False,False
4,6.666354,30211560.0,28.0,2.302585,1.386294,7.600902,640665,2021,4,1,...,3,1,6.357842,False,False,False,False,False,True,False


## Imbalanced data

In [22]:
print("Won -->", Y.value_counts().__getitem__(1)/Y.shape[0]*100, "%")
print("Lost-->", Y.value_counts().__getitem__(0)/Y.shape[0]*100, "%")

Won --> 77.11115690667233 %
Lost--> 22.888843093327658 %


In [23]:
# using SMOTE(Synthetic Minority OverSampling TechniquE) to handle imbalanced data
print('Before:', Counter(Y))
from imblearn.over_sampling import SMOTE
sm = SMOTE(sampling_strategy = 1, k_neighbors = 3, random_state = 1)
resampled_X, resampled_Y = sm.fit_resample(X, Y)
print('Before:',Counter(resampled_Y))

Before: Counter({1: 115996, 0: 34431})
Before: Counter({1: 115996, 0: 115996})


In [24]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier

# train test split(train - 70% and test - 30%)
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(resampled_X, resampled_Y, test_size=.3, random_state = 42)

# algorithms
algorithms = [XGBClassifier, RandomForestClassifier, ExtraTreesClassifier, 
              GradientBoostingClassifier, HistGradientBoostingClassifier,]

for i in algorithms:
    model = i()
    model.fit(X_train, Y_train)

    pred_y_train = model.predict(X_train)
    pred_y_test = model.predict(X_test)

    print("Algorithm:", i.__name__)
    from sklearn.metrics import accuracy_score
    print("Train accuracy:", accuracy_score(Y_train, pred_y_train))            
    print("Test accuracy:", accuracy_score(Y_test, pred_y_test))  
    print("\n")                  

Algorithm: XGBClassifier
Train accuracy: 0.9501644149414388
Test accuracy: 0.940831633092905


Algorithm: RandomForestClassifier
Train accuracy: 1.0
Test accuracy: 0.9784189200839104


Algorithm: ExtraTreesClassifier
Train accuracy: 1.0
Test accuracy: 0.9830167533549815


Algorithm: GradientBoostingClassifier
Train accuracy: 0.8161508430114414
Test accuracy: 0.8143624816805081


Algorithm: HistGradientBoostingClassifier
Train accuracy: 0.9060679581757947
Test accuracy: 0.9009597976953361




## out of five models, extra trees classifier is the best model having higher accuracy

In [25]:
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(resampled_X, resampled_Y, test_size=.3, random_state = 42)


et = ExtraTreesClassifier()  
et.fit(X_train, Y_train)

pred_y_train = et.predict(X_train)
pred_y_test = et.predict(X_test)                                                          
                                                                                                
from sklearn.metrics import accuracy_score
print("Train accuracy:", accuracy_score(Y_train, pred_y_train))        
print("Test accuracy :", accuracy_score(Y_test, pred_y_test))               

Train accuracy: 1.0
Test accuracy : 0.9830167533549815


In [26]:
# confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(Y_test, pred_y_test)

array([[34721,   323],
       [  859, 33695]], dtype=int64)

In [27]:
# classification metrics
from sklearn.metrics import classification_report
print(classification_report(Y_test, pred_y_test))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98     35044
           1       0.99      0.98      0.98     34554

    accuracy                           0.98     69598
   macro avg       0.98      0.98      0.98     69598
weighted avg       0.98      0.98      0.98     69598



## saving the best model (extra trees classifier model)

In [28]:
with open("classification_model.pkl", "wb") as file:
    pickle.dump(et, file)