# **Modeling**

# **Imports**

In [1]:
#Numpy
import numpy as np
from numpy import median

#Pandas
import pandas as pd

#Seaborn
import seaborn as sns

#matplotlib
import matplotlib.pyplot as plt
import plotly

#Sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder 
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

#Warnings
import warnings
warnings.filterwarnings("ignore")

# **Loading Data**

In [2]:
#Loading in the data from the previous notebook
fd = pd.read_csv('fd')

In [3]:
fd.head()

Unnamed: 0,Order #,Transaction Type,Order Time,Order Date,Delivery Date_x,Product Total,Nontaxable Delivery,Wire Out Fee,Gift Cards,Grand Total_x,...,Order Type,Order Method,Order Source,Delivery Method,Product Name,Occasion,Delivery Street,Delivery City,Delivery Zip,Subtotal
0,100017453.0,Sale,2023-07-22 15:28:16,2018-12-29,2018-12-29,815.0,0.0,0.0,0.0,894.46,...,Taken,Phone,Premium Site,Local Delivery,Vivid Daydream,Birthday,2179 Hillsboro Road,Franklin,37064.0,75.0
1,100017452.0,Sale,2023-07-22 15:16:50,2018-12-29,2018-12-29,249.9,0.0,0.0,0.0,137.13,...,Taken,Walk-In,Premium Site,Local Delivery,Vivid Daydream,Birthday,2179 Hillsboro Road,Franklin,37064.0,75.0
2,100017451.0,Sale,2023-07-22 14:58:53,2018-12-29,2018-12-29,24.95,0.0,0.0,0.0,27.38,...,Taken,Walk-In,Premium Site,Local Delivery,Vivid Daydream,Birthday,2179 Hillsboro Road,Franklin,37064.0,75.0
3,100017450.0,Sale,2023-07-22 14:54:45,2018-12-29,2018-12-29,635.0,0.0,0.0,0.0,696.91,...,Taken,Phone,Premium Site,Local Delivery,Vivid Daydream,Birthday,2179 Hillsboro Road,Franklin,37064.0,75.0
4,100017202.0,Sale,2023-07-22 14:48:15,2018-12-29,2018-12-23,702.0,25.0,0.0,0.0,795.45,...,Delivery,Walk-In,Premium Site,Local Delivery,Vivid Daydream,Birthday,2179 Hillsboro Road,Franklin,37064.0,75.0


In [4]:
fd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51511 entries, 0 to 51510
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Order #              51511 non-null  float64
 1   Transaction Type     51511 non-null  object 
 2   Order Time           51511 non-null  object 
 3   Order Date           51511 non-null  object 
 4   Delivery Date_x      51511 non-null  object 
 5   Product Total        51511 non-null  float64
 6   Nontaxable Delivery  51511 non-null  float64
 7   Wire Out Fee         51511 non-null  float64
 8   Gift Cards           51511 non-null  float64
 9   Grand Total_x        51511 non-null  float64
 10  Payment Method_x     51511 non-null  object 
 11  Order Type           51511 non-null  object 
 12  Order Method         51511 non-null  object 
 13  Order Source         51511 non-null  object 
 14  Delivery Method      51511 non-null  object 
 15  Product Name         51511 non-null 

In [5]:
import numpy as np
import pandas as pd

def change_column_datatype(fd, column_dict):
    """
    Change the data type of columns in a Pandas DataFrame.

    Parameters:
    - df: Pandas DataFrame object.
    - column_dict: Dictionary specifying column names and their desired data types.
                   Key: column name (string).
                   Value: desired data type (string).
                   Example: {'column1': 'int', 'column2': 'float', 'column3': 'datetime'}
    Returns:
    - df: Updated Pandas DataFrame with changed data types.
    """
    try:
        for column, datatype in column_dict.items():
            if datatype == 'float':
                fd[column] = fd[column].str.replace('$', '').str.replace(',', '').str.replace('()', '')
                fd[column] = fd[column].apply(lambda x: float(x.replace('(', '').replace(')', '')) if isinstance(x, str) else x)
                fd[column] = fd[column].replace([np.inf, -np.inf], np.nan).astype(float)
            elif datatype == 'int':
                fd[column] = fd[column].replace(['NA', 'inf', '-inf'], np.nan).astype(float).astype(pd.Int64Dtype())
            elif datatype == 'datetime':
                fd[column] = pd.to_datetime(fd[column], errors='coerce')
            else:
                fd[column] = fd[column].astype(datatype)
        return fd
    except KeyError as e:
        print(f"Error: Column '{e.args[0]}' does not exist in the DataFrame.")
    except Exception as e:
        print(f"Error: {str(e)}")

In [6]:
def replace_nan(fd):
    # Find categorical and numerical columns
    categorical_cols = fd.select_dtypes(include='object').columns
    numerical_cols = fd.select_dtypes(include=np.number).columns
    date_cols = ['Order Time', 'Order Date', 'Delivery Date_x']
# Replace NaT values with NaN
    fd = fd.replace(pd.NaT, np.nan)

# Impute missing values with mode for categorical variables
    categorical_imputer = SimpleImputer(strategy='most_frequent')
    fd[categorical_cols] = categorical_imputer.fit_transform(fd[categorical_cols])
# Replace missing values in date columns with most frequent date
    for col in date_cols:
        most_frequent_date = fd[col].mode().values[0]
        fd[col].fillna(most_frequent_date, inplace=True)
        # Impute missing values with median for numerical variables
        numerical_imputer = SimpleImputer(strategy='most_frequent')
    fd[numerical_cols] = numerical_imputer.fit_transform(fd[numerical_cols])

    return fd

In [7]:
#Runing the function to transform the data
transformed_data =replace_nan(fd)

In [8]:
#Function to extract only sales 
import pandas as pd

def remove_adjustment_refund_rows(data):
    # Filter out rows with 'Adjustment' and 'Refund' transaction types
    filtered_data = fd[fd['Transaction Type'].isin(['Sale'])]
    
    return filtered_data

# Example usage:
filtered_data = remove_adjustment_refund_rows(transformed_data)
fd = filtered_data

## **KNN Nieghbor model for predicting product sales**

### **Validation Split**

In [9]:
#Split X and y
X = fd.drop(columns='Product Name')
y = fd['Product Name']

In [10]:
#Split training and test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [14]:
#Imputers
freq_imputer = SimpleImputer(strategy='most_frequent')
mean_imputer = SimpleImputer(strategy= 'mean')

#Scaler
scaler = StandardScaler()

#OneHotEncoder
ohe =OneHotEncoder(handle_unknown='ignore', sparse=True)

#Create Selectors
cat_selector = make_column_selector(dtype_include= 'object')
num_selector = make_column_selector(dtype_include='number')

# Numeric pipeline
numeric_pipe = make_pipeline(mean_imputer, scaler)
numeric_pipe

#cat pipe
categorical_pipe = make_pipeline(freq_imputer, ohe)
categorical_pipe

#Make tuples for preprocessing the categorical and numeric columns
num_tuple = (numeric_pipe, num_selector)
cat_tuple = (categorical_pipe, cat_selector)          

In [15]:
#Create Column Transformer
preprocessor= make_column_transformer(num_tuple, cat_tuple, remainder = 'passthrough')

In [16]:
# Preprocess the data
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

In [17]:
preprocessor

In [18]:
# Create a decision tree classifier
clf = DecisionTreeClassifier()

In [19]:
# Train the model using the preprocessed training sets
clf.fit(X_train_preprocessed, y_train)

In [20]:
# Make predictions on the testing set
y_pred = clf.predict(X_test)

# Convert the predictions back to the original target labels
y_pred_labels = pd.get_dummies(y_pred).idxmax(axis=1)

# Evaluate the model
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

ValueError: could not convert string to float: 'Sale'