# Modeling

### Preprocessing

In [42]:
# Importing libraries
# Importing alias for easy reading

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [43]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
import tensorflow as tf

In [44]:
# Reading data from csv
df = pd.read_csv('data.csv')

In [45]:
df.head(3)

Unnamed: 0,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,State,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit,Profit Percentage,Year Month
0,CA-2016-152156,2016-11-08,11/11/2016,Standard Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420.0,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,230.864435,2.0,0.0,41.9136,18.15507,2016-11
1,CA-2017-100111,2016-11-08,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,California,42420.0,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3.0,0.0,219.582,30.0,2016-11
2,CA-2017-100111,2016-06-12,6/16/2016,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,California,90036.0,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2.0,0.0,6.8714,47.0,2016-06


In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Order ID           9994 non-null   object 
 1   Order Date         9994 non-null   object 
 2   Ship Date          9994 non-null   object 
 3   Ship Mode          9994 non-null   object 
 4   Customer ID        9994 non-null   object 
 5   Customer Name      9994 non-null   object 
 6   Segment            9994 non-null   object 
 7   Country            9994 non-null   object 
 8   City               9994 non-null   object 
 9   State              9994 non-null   object 
 10  Postal Code        9994 non-null   float64
 11  Region             9994 non-null   object 
 12  Product ID         9994 non-null   object 
 13  Category           9994 non-null   object 
 14  Sub-Category       9994 non-null   object 
 15  Product Name       9994 non-null   object 
 16  Sales              9994 

In [47]:
def encode_dates(df, column):
    df = df.copy()

    df[column] = pd.to_datetime(df[column])
    df[column +'_year'] = df[column].apply(lambda x: x.year)
    df[column +'_month'] = df[column].apply(lambda x: x.month)
    df[column +'_day'] = df[column].apply(lambda x: x.day)

    df = df.drop(column, axis=1)
    return df

def onehot_encode(df, column):
    df = df.copy()

    dummies = pd.get_dummies(df[column], prefix=column)
    df = pd.concat([df, dummies], axis=1)
    df = df.drop(column, axis=1)
    return df

In [48]:
def preprocess_input(df):
    df = df.copy()

    # Dropping Unnecessary Columns
    df = df.drop(['Country', 'Customer Name', 'Product Name', 'Year Month', 'Profit Percentage'], axis=1)

    # Dropping customer specific features
    df = df.drop(['Order ID', 'Customer ID'], axis = 1)

    # Date Feature Engineering - Extraction
    df = encode_dates(df, column = 'Order Date')
    df = encode_dates(df, column = 'Ship Date')

    #Handling Categorical Features - onehot encoding
    categorical_cols = df.select_dtypes(include='object')
    for column in categorical_cols:
        df = onehot_encode(df, column = column)

    # Target(y) and Predictive Features(X)
    y = df['Sales']
    X = df.drop('Sales', axis=1)

    # Train Test Split!
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, shuffle=True, random_state=1)

    # Scale X
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), columns=X.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

    return X_train, X_test, y_train, y_test

In [49]:
X_train, X_test, y_train, y_test = preprocess_input(df)

In [50]:
X_train.head(3)

Unnamed: 0,Postal Code,Quantity,Discount,Profit,Order Date_year,Order Date_month,Order Date_day,Ship Date_year,Ship Date_month,Ship Date_day,Ship Mode_First Class,Ship Mode_Same Day,Ship Mode_Second Class,Ship Mode_Standard Class,Segment_Consumer,Segment_Corporate,Segment_Home Office,City_Aberdeen,City_Abilene,City_Akron,City_Albuquerque,City_Alexandria,City_Allen,City_Allentown,City_Altoona,City_Amarillo,City_Anaheim,City_Andover,City_Ann Arbor,City_Antioch,City_Apopka,City_Apple Valley,City_Appleton,City_Arlington,City_Arlington Heights,City_Arvada,City_Asheville,City_Athens,City_Atlanta,City_Atlantic City,...,Product ID_TEC-PH-10004522,Product ID_TEC-PH-10004531,Product ID_TEC-PH-10004536,Product ID_TEC-PH-10004539,Product ID_TEC-PH-10004586,Product ID_TEC-PH-10004614,Product ID_TEC-PH-10004667,Product ID_TEC-PH-10004700,Product ID_TEC-PH-10004774,Product ID_TEC-PH-10004830,Product ID_TEC-PH-10004833,Product ID_TEC-PH-10004875,Product ID_TEC-PH-10004896,Product ID_TEC-PH-10004897,Product ID_TEC-PH-10004908,Product ID_TEC-PH-10004912,Product ID_TEC-PH-10004922,Product ID_TEC-PH-10004924,Product ID_TEC-PH-10004959,Product ID_TEC-PH-10004977,Category_Furniture,Category_Office Supplies,Category_Technology,Sub-Category_Accessories,Sub-Category_Appliances,Sub-Category_Art,Sub-Category_Binders,Sub-Category_Bookcases,Sub-Category_Chairs,Sub-Category_Copiers,Sub-Category_Envelopes,Sub-Category_Fasteners,Sub-Category_Furnishings,Sub-Category_Labels,Sub-Category_Machines,Sub-Category_Paper,Sub-Category_Phones,Sub-Category_Storage,Sub-Category_Supplies,Sub-Category_Tables
0,0.34457,0.574248,-0.800968,-0.082708,1.177163,0.340322,-0.505824,1.231639,0.24008,-0.109235,-0.406749,-0.228811,2.176766,-1.316215,0.87062,-0.605154,-0.441486,-0.011957,-0.011957,-0.035893,-0.033838,-0.033838,-0.016912,-0.020714,-0.016912,-0.026745,-0.054874,-0.020714,-0.020714,0.0,-0.026745,-0.02392,-0.016912,-0.077721,-0.011957,-0.020714,-0.020714,-0.0293,-0.05989,-0.011957,...,-0.011957,-0.020714,-0.011957,-0.026745,-0.020714,-0.026745,-0.016912,-0.016912,-0.016912,-0.011957,-0.016912,0.0,-0.020714,-0.02392,-0.026745,-0.016912,-0.02392,-0.011957,0.0,-0.0293,-0.489254,0.745074,-0.442641,-0.274758,-0.202676,-0.281322,-0.55142,-0.149039,-0.243653,-0.083991,-0.155911,-0.144979,-0.309367,-0.180623,-0.096093,2.667367,-0.289814,-0.293565,-0.132116,-0.174193
1,1.226612,1.99361,-0.800968,-0.006431,0.242726,1.299192,0.842762,0.314238,1.121172,1.439458,-0.406749,-0.228811,-0.459397,0.759754,0.87062,-0.605154,-0.441486,-0.011957,-0.011957,-0.035893,-0.033838,-0.033838,-0.016912,-0.020714,-0.016912,-0.026745,-0.054874,-0.020714,-0.020714,0.0,-0.026745,-0.02392,-0.016912,-0.077721,-0.011957,-0.020714,-0.020714,-0.0293,-0.05989,-0.011957,...,-0.011957,-0.020714,-0.011957,-0.026745,-0.020714,-0.026745,-0.016912,-0.016912,-0.016912,-0.011957,-0.016912,0.0,-0.020714,-0.02392,-0.026745,-0.016912,-0.02392,-0.011957,0.0,-0.0293,2.043929,-1.342148,-0.442641,-0.274758,-0.202676,-0.281322,1.813499,-0.149039,-0.243653,-0.083991,-0.155911,-0.144979,-0.309367,-0.180623,-0.096093,-0.374901,-0.289814,-0.293565,-0.132116,-0.174193
2,1.142682,0.101128,0.236605,-0.062887,-1.626147,1.299192,0.505615,-1.520565,1.121172,0.605546,-0.406749,-0.228811,-0.459397,0.759754,-1.148607,1.652473,-0.441486,-0.011957,-0.011957,-0.035893,-0.033838,-0.033838,-0.016912,-0.020714,-0.016912,-0.026745,-0.054874,-0.020714,-0.020714,0.0,-0.026745,-0.02392,-0.016912,-0.077721,-0.011957,-0.020714,-0.020714,-0.0293,-0.05989,-0.011957,...,-0.011957,-0.020714,-0.011957,-0.026745,-0.020714,-0.026745,-0.016912,-0.016912,-0.016912,-0.011957,-0.016912,0.0,-0.020714,-0.02392,-0.026745,-0.016912,-0.02392,-0.011957,0.0,-0.0293,-0.489254,-1.342148,2.259168,-0.274758,-0.202676,-0.281322,-0.55142,-0.149039,-0.243653,-0.083991,-0.155911,-0.144979,-0.309367,-0.180623,-0.096093,-0.374901,3.450493,-0.293565,-0.132116,-0.174193


### Model Training

In [51]:
inputs = X_train.shape[1]

In [52]:
from keras.models import Sequential
from keras.layers import Dense

# Defining model architecture
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(inputs,)))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(4, activation='relu'))
model.add(Dense(2, activation='relu'))
model.add(Dense(1, activation='linear'))

print(model.summary())


Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_10 (Dense)            (None, 128)               315392    
                                                                 
 dense_11 (Dense)            (None, 64)                8256      
                                                                 
 dense_12 (Dense)            (None, 32)                2080      
                                                                 
 dense_13 (Dense)            (None, 16)                528       
                                                                 
 dense_14 (Dense)            (None, 8)                 136       
                                                                 
 dense_15 (Dense)            (None, 4)                 36        
                                                                 
 dense_16 (Dense)            (None, 2)                

In [53]:
model.compile(
    optimizer = 'adam',
    loss = 'mse',
    metrics = 'accuracy'
)

In [54]:
model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=100,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=10,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau()
    ]
)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100


<keras.src.callbacks.History at 0x244864e5e90>

In [55]:
test_loss = model.evaluate(X_test, y_test)

print("Test Loss: {:.5f}".format(test_loss[0]))

Test Loss: 220839.67188


In [56]:
y_pred = np.squeeze(model.predict(X_test))
test_r2 = r2_score(y_test, y_pred)

print("Test R2 score: {:.5f}".format(test_r2))

Test R2 score: 0.24554
