In [1]:
# hides all warnings
# The bare 'ignore' action carries no category or module restriction, so every
# warning raised after this cell runs is suppressed for the rest of the
# session, including deprecation notices from the libraries used below.
import warnings
warnings.filterwarnings('ignore')


In [3]:
# imports
import pandas as pd
import utils

In [4]:
# read dataset
# Loads Iris.csv from the current working directory into the working frame
# `df`; the path is relative, so the notebook must be run next to the file.
df = pd.read_csv('./Iris.csv')

In [5]:
# Show the column labels of the loaded frame beneath a banner line.
print("\n*** Columns ***", df.columns, sep="\n")


*** Columns ***
Index(['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species'],
      dtype='object')


In [6]:
 #info
print("\n*** Structure ***")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line after
# the report.
df.info()


*** Structure ***
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB
None


In [7]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the
# numeric columns, printed beneath a banner line.
print("\n*** Summary ***", df.describe(), sep="\n")



*** Summary ***
               Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
count  150.000000     150.000000    150.000000     150.000000    150.000000
mean    75.500000       5.843333      3.054000       3.758667      1.198667
std     43.445368       0.828066      0.433594       1.764420      0.763161
min      1.000000       4.300000      2.000000       1.000000      0.100000
25%     38.250000       5.100000      2.800000       1.600000      0.300000
50%     75.500000       5.800000      3.000000       4.350000      1.300000
75%    112.750000       6.400000      3.300000       5.100000      1.800000
max    150.000000       7.900000      4.400000       6.900000      2.500000


In [8]:
# Preview the first rows of the frame beneath a banner line.
print("\n*** Head ***", df.head(), sep="\n")


*** Head ***
   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0   1            5.1           3.5            1.4           0.2  Iris-setosa
1   2            4.9           3.0            1.4           0.2  Iris-setosa
2   3            4.7           3.2            1.3           0.2  Iris-setosa
3   4            4.6           3.1            1.5           0.2  Iris-setosa
4   5            5.0           3.6            1.4           0.2  Iris-setosa


In [10]:
 #drop Id
# Dataset-specific clean-up; change as required.
# 'Id' is removed so that only the four measurements and the class column remain.
print("\n*** Transformation ***")
df = df.drop(columns=['Id'])


*** Transformation ***


In [11]:
# store class variable
# change as required
# Name of the class (target) column; later cells use it to group, encode and
# to tell the scaling helpers which column is the class label.
clsVars = "Species"
print("Done ...")

Done ...


In [12]:
# Number of rows per class label, printed beneath a banner line.
print("\n*** Counts ***", df.groupby(clsVars).size(), sep="\n")


*** Counts ***
Species
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
dtype: int64


In [13]:
# Collect the distinct class labels (still strings at this point) and show them.
lnLabels = df[clsVars].unique()
print("\n*** Unique Species - Categoric Alpha***", lnLabels, sep="\n")


*** Unique Species - Categoric Alpha***
['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']


In [14]:
# Replace the string class labels with their integer category codes
# (pandas assigns codes in sorted category order), then show the distinct codes.
df[clsVars] = pd.Categorical(df[clsVars]).codes
lnCCodes = df[clsVars].unique()
print("\n*** Unique Species - Categoric Numeric ***", lnCCodes, sep="\n")


*** Unique Species - Categoric Numeric ***
[0 1 2]


In [15]:
# master df
# Snapshot of the cleaned, label-encoded frame. Each scaling experiment below
# starts again from a fresh copy of it, so the experiments do not affect one
# another.
dfm = df.copy()


In [20]:
# Data Normalization
# Normalization refers to rescaling real-valued numeric attributes into the range 0 to 1.
# It is useful to scale the input attributes for a model that relies on the
# magnitude of values, such as distance measures used in k-nearest neighbors
# and in the preparation of coefficients in regression.
#
# The MinMaxScaler transforms features by scaling each feature to a given range.
# This range can be set by specifying the feature_range parameter (default (0, 1)).
#
# manually
# normalized = (x-min(x))/(max(x)-min(x))

In [19]:
# preparing for normalization / standardization
# Start from a fresh copy of the master frame so the scaling below is applied
# to unscaled data.
df = dfm.copy()

In [21]:
# Per-column variance of the unscaled frame (baseline before normalization).
print('\n*** Variance In Columns ***', df.var(), sep='\n')



*** Variance In Columns ***
SepalLengthCm    0.685694
SepalWidthCm     0.188004
PetalLengthCm    3.113179
PetalWidthCm     0.582414
Species          0.671141
dtype: float64


In [22]:
# Per-column standard deviation of the unscaled frame (baseline before normalization).
print('\n*** StdDev In Columns ***', df.std(), sep='\n')



*** StdDev In Columns ***
SepalLengthCm    0.828066
SepalWidthCm     0.433594
PetalLengthCm    1.764420
PetalWidthCm     0.763161
Species          0.819232
dtype: float64


In [23]:
# normalize data
# NOTE(review): utils.NormalizeData is a project helper not visible here.
# Presumably it min-max scales every column except the one named by clsVars
# into [0, 1] and returns the scaled frame; confirm against utils.
print('\n*** Normalize Data ***')
df = utils.NormalizeData(df, clsVars)
print('Done ...')



*** Normalize Data ***
Done ...


In [24]:
# Per-column variance after normalization, for comparison with the baseline.
print('*** Variance In Columns ***', df.var(), sep='\n')

*** Variance In Columns ***
SepalLengthCm    0.052908
SepalWidthCm     0.032640
PetalLengthCm    0.089433
PetalWidthCm     0.101114
Species          0.671141
dtype: float64


In [25]:
# Per-column standard deviation after normalization, for comparison with the baseline.
print('\n*** StdDev In Columns ***', df.std(), sep='\n')



*** StdDev In Columns ***
SepalLengthCm    0.230018
SepalWidthCm     0.180664
PetalLengthCm    0.299054
PetalWidthCm     0.317984
Species          0.819232
dtype: float64


In [26]:
# Data Standardization
# Standardization refers to shifting the distribution of each attribute to have
# a mean of zero and a standard deviation of one (unit variance).
#
# It is useful to standardize attributes for a model that relies on the
# distribution of attributes such as Gaussian processes.
#
# Sklearn's main scaler, the StandardScaler, uses a strict definition of
# standardization to standardize data. It centers and scales the data using
# the formula below.
#
# manually
# standardized = (x-mean(x))/stdev(x)

# preparing for normalization / standardization
# Start from a fresh copy of the master frame so the scaling below is applied
# to unscaled data.
df = dfm.copy()

In [27]:
# Per-column variance of the unscaled frame (baseline before standardization).
print('\n*** Variance In Columns ***', df.var(), sep='\n')



*** Variance In Columns ***
SepalLengthCm    0.685694
SepalWidthCm     0.188004
PetalLengthCm    3.113179
PetalWidthCm     0.582414
Species          0.671141
dtype: float64


In [28]:
# Per-column standard deviation of the unscaled frame (baseline before standardization).
print('\n*** StdDev In Columns ***', df.std(), sep='\n')


*** StdDev In Columns ***
SepalLengthCm    0.828066
SepalWidthCm     0.433594
PetalLengthCm    1.764420
PetalWidthCm     0.763161
Species          0.819232
dtype: float64


In [29]:
# standardize data
# NOTE(review): utils.StandardizeData is a project helper not visible here.
# Presumably it rescales every column except the one named by clsVars to zero
# mean and unit variance and returns the scaled frame; confirm against utils.
print('\n*** Standardize Data ***')
df = utils.StandardizeData(df, clsVars)
print('Done ...')


*** Standardize Data ***
Done ...


In [30]:
# Per-column variance after standardization.
# NOTE(review): the recorded values are 1.006711 (= 150/149) rather than exactly 1.
# pandas' var() defaults to the sample estimator (ddof=1); presumably
# utils.StandardizeData divides by the population std (ddof=0). Confirm against utils.
print('*** Variance In Columns ***', df.var(), sep='\n')


*** Variance In Columns ***
SepalLengthCm    1.006711
SepalWidthCm     1.006711
PetalLengthCm    1.006711
PetalWidthCm     1.006711
Species          0.671141
dtype: float64


In [31]:
# Per-column standard deviation after standardization.
# pandas' std() uses ddof=1 by default, so values slightly above 1 are expected
# if the helper scaled with the population std. Confirm against utils.
print('\n*** StdDev In Columns ***', df.std(), sep='\n')


*** StdDev In Columns ***
SepalLengthCm    1.003350
SepalWidthCm     1.003350
PetalLengthCm    1.003350
PetalWidthCm     1.003350
Species          0.819232
dtype: float64


In [32]:
# Absolute Scaling (MaxAbs)
# The MaxAbsScaler works very similarly to the MinMaxScaler but automatically
# scales the data to a [-1,1] range based on the absolute maximum. This scaler
# is meant for data that is already centered at zero or sparse data. It does not
# shift/center the data, and thus does not destroy any sparsity.

# manually
# scaled = x / max(abs(x))

# preparing for normalization / standardization
# Start from a fresh copy of the master frame so the scaling below is applied
# to unscaled data.
df = dfm.copy()

In [33]:
# Per-column variance of the unscaled frame (baseline before max-abs scaling).
print('\n*** Variance In Columns ***', df.var(), sep='\n')


*** Variance In Columns ***
SepalLengthCm    0.685694
SepalWidthCm     0.188004
PetalLengthCm    3.113179
PetalWidthCm     0.582414
Species          0.671141
dtype: float64


In [34]:
# Per-column standard deviation of the unscaled frame (baseline before max-abs scaling).
print('\n*** StdDev In Columns ***', df.std(), sep='\n')


*** StdDev In Columns ***
SepalLengthCm    0.828066
SepalWidthCm     0.433594
PetalLengthCm    1.764420
PetalWidthCm     0.763161
Species          0.819232
dtype: float64


In [35]:
# MaxAbsScaledData
# NOTE(review): utils.MaxAbsScaledData is a project helper not visible here.
# Presumably it divides every column except the one named by clsVars by that
# column's maximum absolute value and returns the scaled frame; confirm
# against utils.
print('\n*** MaxAbsScaledData Data ***')
df = utils.MaxAbsScaledData(df, clsVars)
print('Done ...')


*** MaxAbsScaledData Data ***
Done ...


In [36]:
# Per-column variance after max-abs scaling, for comparison with the baseline.
print('*** Variance In Columns ***', df.var(), sep='\n')

*** Variance In Columns ***
SepalLengthCm    0.010987
SepalWidthCm     0.009711
PetalLengthCm    0.065389
PetalWidthCm     0.093186
Species          0.671141
dtype: float64


In [37]:
# Per-column standard deviation after max-abs scaling, for comparison with the baseline.
print('\n*** StdDev In Columns ***', df.std(), sep='\n')



*** StdDev In Columns ***
SepalLengthCm    0.104818
SepalWidthCm     0.098544
PetalLengthCm    0.255713
PetalWidthCm     0.305264
Species          0.819232
dtype: float64
