In [94]:
import numpy
import pandas as pd
from scipy.io import arff
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, OneHotEncoder

# Infrared Thermography Temperature

In [95]:
infrared_df = pd.read_csv("./infrared/FLIR_groups1and2.csv", header=None)
# Get actual header
infrared_headers = infrared_df.iloc[2]
# Remove Round header and measurements generic term
infrared_df = infrared_df[3:]
infrared_df.columns = infrared_headers

# Drop NaN columns and NaN rows
infrared_df = infrared_df.dropna(axis=1, how='all')
infrared_df = infrared_df.dropna()
# Remove all but one Round and insignificant columns
filtered_columns = [col for col in infrared_df.columns if
                    not col.endswith(('2', '3', '4')) and col not in ["Cosmetics", "Time", "Date", "SubjectID"]]
infrared_df = infrared_df[filtered_columns]

infrared_df = infrared_df.reset_index(drop=True)
infrared_df

2,T_offset1,Max1R13_1,Max1L13_1,aveAllR13_1,aveAllL13_1,T_RC1,T_RC_Dry1,T_RC_Wet1,T_RC_Max1,T_LC1,...,T_OR1,T_OR_Max1,aveOralF,aveOralM,Gender,Age,Ethnicity,T_atm,Humidity,Distance
0,0.17,34.92,35.09,34.53,34.67,34.93,34.92,34.89,34.96,35.08,...,34.13,34.16,36.85,36.64,Female,18-20,Asian,22.5,34.5,0.8
1,0.92,36.12,35.87,35.50,35.27,36.08,36.08,35.68,36.12,35.79,...,34.95,34.98,36.95,37.14,Female,18-20,Hispanic/Latino,23.4,27.3,0.8
2,0.82,35.05,35.05,33.72,34.29,35.14,35.06,35.14,35.15,35.06,...,34.40,34.41,36.65,36.59,Female,18-20,Asian,23.2,27.2,0.8
3,0.88,35.90,35.71,34.99,35.27,35.88,35.88,35.68,35.90,35.65,...,35.90,35.94,37.40,37.39,Male,21-25,White,23.1,27,0.8
4,0.63,34.99,34.43,34.61,33.72,34.97,34.97,34.87,34.99,34.57,...,35.25,35.28,36.65,36.49,Male,>60,Asian,22.8,33,0.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
928,1.15,36.05,36.08,35.17,35.28,36.05,36.04,36.04,36.05,36.11,...,36.12,36.13,36.95,36.99,Female,21-25,Hispanic/Latino,25.7,50.8,0.6
929,1.27,35.51,35.64,34.74,35.08,35.68,35.51,35.68,35.70,35.81,...,35.02,35.04,36.95,36.99,Female,21-25,Asian,25.7,50.8,0.6
930,1.45,35.98,35.84,35.35,35.16,35.97,35.97,35.88,36.01,35.84,...,36.58,36.60,37.25,37.19,Female,21-25,White,25.7,50.8,0.6
931,-0.07,36.36,36.25,35.67,35.76,36.34,36.34,36.31,36.36,36.18,...,35.53,35.57,37.35,37.59,Female,18-20,Black or African-American,28,24.3,0.6


In [96]:
print(infrared_df['Age'].unique())
print(infrared_df['Ethnicity'].unique())
print(infrared_df['Gender'].unique())

['18-20' '21-25' '>60' '26-30' '31-40' '51-60' '41-50' '21-30']
['Asian' 'Hispanic/Latino' 'White' 'Black or African-American'
 'Multiracial' 'American Indian or Alaskan Native']
['Female' 'Male']


In [97]:
infrared_df['Age'] = infrared_df['Age'].replace(['21-25', '26-30'], '21-30')
print(infrared_df['Age'].unique())

['18-20' '21-30' '>60' '31-40' '51-60' '41-50']


In [98]:
feature_mappings = {
    'Age': ['18-20', '21-30', '31-40', '41-50', '51-60', '>60']
}
ordinal_cols = ['Age']
one_hot_cols = ['Ethnicity', 'Gender']
ordinal_categories = [feature_mappings[col] for col in ordinal_cols]

numerical_cols = infrared_df.select_dtypes(include="number").columns
# convert numerical columns to list
numerical_cols = numerical_cols.tolist()

preprocessor = ColumnTransformer([
    ('ordinal', OrdinalEncoder(categories=ordinal_categories), ordinal_cols),
    ('categorical', OneHotEncoder(handle_unknown="ignore"), one_hot_cols),
    ('scaler', MinMaxScaler(), numerical_cols)
],
    remainder='passthrough'  # passthrough columns not listed in any pipeline
)
ordinal_categories

[['18-20', '21-30', '31-40', '41-50', '51-60', '>60']]

In [99]:
# X,y split

# transform dataframe
infrared_df_transformed = preprocessor.fit_transform(infrared_df)
infrared_df_transformed

array([[0.0, 0.0, 1.0, ..., '22.5', '34.5', '0.8'],
       [0.0, 0.0, 0.0, ..., '23.4', '27.3', '0.8'],
       [0.0, 0.0, 1.0, ..., '23.2', '27.2', '0.8'],
       ...,
       [1.0, 0.0, 0.0, ..., '25.7', '50.8', '0.6'],
       [0.0, 0.0, 0.0, ..., '28', '24.3', '0.6'],
       [0.0, 0.0, 0.0, ..., '23.8', '45.6', '0.6']], dtype=object)

# Black Friday

In [100]:
data, meta = arff.loadarff("./black_friday.arff")
black_df = pd.DataFrame(data)

for column in black_df.columns:
    # replace missing values with pd.NA
    black_df[column] = black_df[column].replace(b'?', pd.NA)
    black_df[column] = black_df[column].replace(b'', pd.NA)
    black_df[column] = black_df[column].replace(b'NONE', pd.NA)
    # decode bytes to string for each column
    black_df[column] = black_df[column].apply(lambda x: x.decode() if isinstance(x, bytes) else x)
black_df

Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,F,0-17,10.0,A,2,0.0,1.0,6.0,14.0,15200.0
1,M,46-50,7.0,B,2,1.0,1.0,8.0,17.0,19215.0
2,M,26-35,20.0,A,1,1.0,1.0,2.0,5.0,15665.0
3,F,51-55,9.0,A,1,0.0,5.0,8.0,14.0,5378.0
4,F,51-55,9.0,A,1,0.0,2.0,3.0,4.0,13055.0
...,...,...,...,...,...,...,...,...,...,...
166816,F,46-50,0.0,B,4+,1.0,3.0,4.0,12.0,8047.0
166817,M,26-35,6.0,B,2,0.0,6.0,8.0,14.0,16493.0
166818,M,26-35,6.0,B,2,0.0,2.0,3.0,10.0,3425.0
166819,M,26-35,6.0,B,2,0.0,1.0,2.0,15.0,15694.0


In [104]:
print(black_df['Age'].unique())

['0-17' '46-50' '26-35' '51-55' '36-45' '18-25' '55+']


In [106]:
feature_mappings = {
    'Age': ['0-17','18-25','26-35', '36-45', '46-50', '51-55', '55+']
}
ordinal_cols = ['Age']
one_hot_cols = ['City_Category', 'Gender', 'Stay_In_Current_City_Years']
ordinal_categories = [feature_mappings[col] for col in ordinal_cols]

numerical_cols = [col for col in infrared_df.select_dtypes(include="number").columns if col != 'Purchase']

preprocessor = ColumnTransformer([
    ('ordinal', OrdinalEncoder(categories=ordinal_categories), ordinal_cols),
    ('categorical', OneHotEncoder(handle_unknown="ignore"), one_hot_cols),
    ('scaler', MinMaxScaler(), numerical_cols)
],
    remainder='passthrough'  # passthrough columns not listed in any pipeline
)
ordinal_categories

[['0-17', '18-25', '26-35', '36-45', '46-50', '51-55', '55+']]

In [108]:
# X,y split

# transform dataframe
black_df_transformed = preprocessor.fit_transform(black_df)
black_df_transformed

array([[0.0000e+00, 1.0000e+00, 0.0000e+00, ..., 6.0000e+00, 1.4000e+01,
        1.5200e+04],
       [4.0000e+00, 0.0000e+00, 1.0000e+00, ..., 8.0000e+00, 1.7000e+01,
        1.9215e+04],
       [2.0000e+00, 1.0000e+00, 0.0000e+00, ..., 2.0000e+00, 5.0000e+00,
        1.5665e+04],
       ...,
       [2.0000e+00, 0.0000e+00, 1.0000e+00, ..., 3.0000e+00, 1.0000e+01,
        3.4250e+03],
       [2.0000e+00, 0.0000e+00, 1.0000e+00, ..., 2.0000e+00, 1.5000e+01,
        1.5694e+04],
       [2.0000e+00, 0.0000e+00, 1.0000e+00, ..., 2.0000e+00, 1.1000e+01,
        1.1640e+04]])

# Power plant

In [101]:
ccpp_df = pd.read_excel("./CCPP/Folds5x2_pp.xlsx")
ccpp_df

Unnamed: 0,AT,V,AP,RH,PE
0,14.96,41.76,1024.07,73.17,463.26
1,25.18,62.96,1020.04,59.08,444.37
2,5.11,39.40,1012.16,92.14,488.56
3,20.86,57.32,1010.24,76.64,446.48
4,10.82,37.50,1009.23,96.62,473.90
...,...,...,...,...,...
9563,16.65,49.69,1014.01,91.00,460.03
9564,13.19,39.18,1023.67,66.78,469.62
9565,31.32,74.33,1012.92,36.48,429.57
9566,24.48,69.45,1013.86,62.39,435.74


In [102]:
X = ccpp_df.drop('PE')
y = ccpp_df['PE']
# Split for training
ccpp_X_train, ccpp_X_test, ccpp_y_train, ccpp_y_test = train_test_split(X, y, test_size=0.2, random_state=42)

KeyError: "['PE'] not found in axis"

In [103]:
cols_to_scale = remaining_columns = [col for col in ccpp_df.columns if col != "PE"]
preprocessor = ColumnTransformer(
    transformers=[('scaler', MinMaxScaler(), cols_to_scale)],
    remainder='passthrough')

ccpp_df_transformed = preprocessor.fit_transform(ccpp_df)
ccpp_df_transformed

array([[3.72521246e-01, 2.91814947e-01, 7.71591190e-01, 6.38203753e-01,
        4.63260000e+02],
       [6.62039660e-01, 6.69039146e-01, 6.71863400e-01, 4.49329759e-01,
        4.44370000e+02],
       [9.34844193e-02, 2.49822064e-01, 4.76862163e-01, 8.92493298e-01,
        4.88560000e+02],
       ...,
       [8.35977337e-01, 8.71352313e-01, 4.95669389e-01, 1.46380697e-01,
        4.29570000e+02],
       [6.42209632e-01, 7.84519573e-01, 5.18930958e-01, 4.93699732e-01,
        4.35740000e+02],
       [5.60623229e-01, 6.61209964e-01, 6.02326157e-01, 5.67158177e-01,
        4.53280000e+02]])