# 3. Pre-processing and Training Data Development

* [3 Training Data](#2_Data_training_introduction)
  * [3.1 Dummy Variables/One Hot Encoding for Categorical](#2.1_one_hot_encoding)
  * [3.2 Standardize Numerical Data](#3.3_standardize)
  * [3.3 Testing Training](#3.4_testing_training)
 * [3.2 Summary](#3.7_Summary)


III. Split your data into testing and training datasets

## Training Data <a href="#2_Data_training_introduction">

In [13]:
# data manipulation and math
#
import numpy as np
# import scipy as sp
import pandas as pd
#
# plotting and visualization
#
# import matplotlib as mpl
# import matplotlib.cm as cm
# from matplotlib.colors import ListedColormap
# import matplotlib.pyplot as plt
#import seaborn as sns
#
# modeling
#
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from sklearn.preprocessing import OneHotEncoder as OHE
# import sklearn.model_selection
from sklearn.model_selection import train_test_split
# from sklearn.model_selection import KFold
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score, f1_score
# from sklearn.metrics import classification_report
# from sklearn.metrics import confusion_matrix
# from sklearn.metrics import ConfusionMatrixDisplay

print("Loaded Libraries")

Loaded Libraries


### I. Training Data <a href="#2_Data_training_introduction">
Create dummy or indicator features for categorical variables (categories -- one hot encoding)

In [14]:
# Load the data
products = pd.read_csv("../data/processed/products.csv")

# New category list
new_categories = pd.read_csv("../data/processed/Sunlight-Categories.csv")["Category"].tolist()  # Get the list of new categories

# Convert the list to a numpy array
new_categories_array = np.array(new_categories)

# Reshape the array to have a single column
new_categories_array = new_categories_array.reshape(-1, 1)

# One-hot encoding for new categories
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoded_features = pd.DataFrame(encoder.fit_transform(new_categories_array))

# Encoded features now contain columns for each new category
print(encoded_features)

     0    1    2    3    4    5    6    7    8    9    ...  137  138  139  \
0    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
1    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
2    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
3    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
4    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
..   ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
142  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
143  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
144  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
145  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
146  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   

     140  141  142  143  144  145  146  
0    0.0  0.0  0.0  0.0  0.0  0.0 



Standardize the magnitude of numeric features using a scaler (MSRP, Cost)

In [31]:
# Select relevant features (optional)
relevant_features = ['Description', 'Size', 'MSRP', 'Category'] + list(encoded_features.columns)
features = pd.concat([pd.DataFrame(features).reset_index(drop=True), encoded_features.reset_index(drop=True)], axis=1)

# Define placeholder target column or based on some criteria
# For example, let's say you want to predict whether a product is popular or not based on its MSRP
# You can define the target column like this:
products['NEWCategory'] = products['Category']

# Define the target variable
target = products['NEWCategory']

# Standardize numeric features (if any)
scaler = StandardScaler()

# Extract numerical columns
numerical_cols = features[['MSRP', 'Size']]  # Only select 'MSRP' and 'Size' columns

# Scale numerical features
scaled_numerical_cols = pd.DataFrame(scaler.fit_transform(numerical_cols), columns=numerical_cols.columns)

# Combine scaled numerical columns with non-scaled features
scaled_features = pd.concat([scaled_numerical_cols, features.drop(['MSRP', 'Size'], axis=1)], axis=1)

# Split data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(scaled_features, target, test_size=0.2, random_state=42)

# Now you have:
# - X_train: Training features (scaled)
# - X_test: Testing features (scaled)
# - y_train: Training target variable
# - y_test: Testing target variable

# You can use these for model training and evaluation


KeyError: "None of [Index(['MSRP', 'Size'], dtype='object')] are in the [columns]"

## Categorical Variables

In general, categorical features need to be transformed or encoded to be used in some machine learning models, as is the case for Logistic Regression. A common transformation is so-called dummy encoding, where each possible value of a feature becomes a new column, and a 1 is placed in that column if the data instance (a row of the data) contained that value, and a 0 is placed in that column otherwise.

For example, suppose we had a column in a hypothetical data set called species, and it contained one of two values, "cat" or "dog". The column might look like this:

In [None]:
dflog = pd.read_csv("../data/processed/products.csv")

In [None]:
categorical_features = ['Brand',
                        'Size',
                        'Color', 
                        'Description']
'ParentCategory'
'Category'
dflog = pd.get_dummies(dflog, columns = categorical_features)
print('The data have ', dflog.shape[0], ' rows and ', dflog.shape[1], ' columns\n')
print('column names: \n')
print('\n'.join(list(dflog.columns)))

#### Proportion of classes
When building classification models, it is always a good idea to know right away the number of samples per class, proportionally to the total number of samples. First we get the counts of each class.

In [None]:
class_counts = dflog['Category'].value_counts()
class_counts

In [None]:
class_percentages = pd.Series([(x / dflog.shape[0]) * 100.00 for x in class_counts])
class_percentages

In [None]:
# fig, ax = plt.subplots()
# ax.bar(class_counts.index, class_counts)
# ax.set_xticks([0, 1])
# ax.set_xticklabels(class_percentages.index.astype(str) + '\n' + ' ' +
#                    class_percentages.round(0).astype(str) + '%')
# ax.set_ylabel('Count')
# ax.set_xlabel('Category')
# ax.set_title('Heart Disease class distribution\nwhere 1 means presence of heart disease',
#               fontsize = 10)
# plt.show()

Are there Imbalanced Multi-Class Classification Problems--IMCP going on here?? 

In [None]:
def points_plot(ax, Xtr, Xte, ytr, yte, 
                clf, 
                mesh = True, colorscale = cmap_light, cdiscrete = cmap_bold, 
                alpha = 0.1, psize = 10, 
                zfunc = False, predicted = False):
#
# note: this function only works for X.shape = (:, 2)
# it is intended to illustrate the classifier boundary
#
# get the column names if they exist to apply
# to the meshed data generated below
#
    try:
        feature_names = Xtr.columns
    except:
        feature_names = None
#        
    Xtrain = np.array(Xtr)
    Xtest = np.array(Xte)
#
    h = 0.02
#
# create a uniform grid spanning the range of the X values
# note that y here is NOT the target, it is the 2nd
# dimension of the desired plot
#
    X = np.concatenate((Xtrain, Xtest))
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                         np.linspace(y_min, y_max, 100))
#
# predict the target value at each point in the grid
#
# this method uses the probabilities from the classifier
# and applies a function to determine the class
#
    if zfunc:
        mesh_data = np.c_[xx.ravel(), yy.ravel()]
        if feature_names is not None:
            mesh_data = pd.DataFrame(mesh_data, 
                         columns = feature_names)
        p0 = clf.predict_proba(mesh_data)[:, 0]
        p1 = clf.predict_proba(mesh_data)[:, 1]
        Z = zfunc(p0, p1)
#
# this method uses the classifier to predict the classes directly
#
    else:
        mesh_data = np.c_[xx.ravel(), yy.ravel()]
        if feature_names is not None:
            mesh_data = pd.DataFrame(mesh_data, 
                                     columns = feature_names)
        Z = clf.predict(mesh_data)
    ZZ = Z.reshape(xx.shape)
#
# plt.pcolormesh() creates a shaded result over the grid
#
    if mesh:
        plt.pcolormesh(xx, yy, ZZ, 
                       cmap = cmap_light, 
                       alpha = alpha, 
                       axes = ax, 
                       shading = 'auto')
#
# add the points to the plot
# these can be the original target values
# or the predicted values
#
    if predicted:
        showtr = clf.predict(Xtr)
        showte = clf.predict(Xte)
    else:
        showtr = ytr
        showte = yte
#
# plot training points
#
    ax.scatter(Xtrain[:, 0], Xtrain[:, 1], 
               c = showtr - 1, 
               cmap = cmap_bold, 
               s = psize, 
               alpha = alpha, 
               edgecolor = "k")
#    
# plot testing points
#
    ax.scatter(Xtest[:, 0], Xtest[:, 1],
               c = showte - 1, 
               cmap = cmap_bold, 
               s = psize + 10,
               alpha = alpha, 
               marker = "s")
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
#
    return ax, xx, yy

In [None]:
def points_plot_prob(ax, Xtr, Xte, ytr, yte, 
                     clf, colorscale = cmap_light, cdiscrete = cmap_bold, 
                     ccolor = cm, 
                     alpha = 0.1, psize = 10):
    try:
        feature_names = Xtr.columns
    except:
        feature_names = None
#        
    Xtrain = np.array(Xtr)
    Xtest = np.array(Xte)
#    
    ax, xx, yy = points_plot(ax, Xtr, Xte, ytr, yte,
                         clf,
                         mesh = False, 
                         colorscale = colorscale, cdiscrete = cdiscrete, 
                         psize = psize, alpha = alpha,
                         predicted = True) 
    mesh_data = np.c_[xx.ravel(), yy.ravel()]
    if feature_names is not None:
        mesh_data = pd.DataFrame(mesh_data, 
                     columns = feature_names)    
    Z = clf.predict_proba(mesh_data)[:, 1]
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap = ccolor, alpha = 0.2)
    cs2 = plt.contour(xx, yy, Z, cmap = ccolor, alpha = 0.6)
    plt.clabel(cs2, fmt = '%2.1f', colors = 'k', fontsize = 12)
#
    plt.show()