In [47]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

In [6]:
original_train = data = pd.read_csv("../Data/train.csv")

In [24]:
original_train.sample(5)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
31044,31044,0.31,Premium,G,VS2,62.4,59.0,4.38,4.34,2.72,6.548
9435,9435,0.23,Very Good,F,VVS2,62.9,57.0,3.9,3.95,2.47,6.225
8189,8189,1.2,Ideal,J,SI1,61.6,57.0,6.79,6.87,4.21,8.483
32193,32193,0.51,Very Good,E,VS2,61.4,56.0,5.15,5.18,3.17,7.385
32773,32773,0.43,Ideal,G,IF,61.2,57.0,4.81,4.86,2.96,7.268


- id: only for test & sample submission files, id for prediction sample identification
- price: price in USD
- carat: weight of the diamond
- cut: quality of the cut (Fair, Good, Very Good, Premium, Ideal)
- color: diamond colour
- clarity: a measurement of how clear the diamond is
- x: length in mm
- y: width in mm
- z: depth in mm
- depth: total depth percentage = z / mean(x, y) = 2 * z / (x + y) (43--79)
- table: width of top of diamond relative to widest point (43--95)

#### First look at the dataset

We can see we have at least a few categorical variables (color and cut),  clarity looks also categorical, let´s take a look at their values

In [10]:
original_train["clarity"].value_counts()

SI1     9867
VS2     9243
SI2     6806
VS1     6108
VVS2    3848
VVS1    2727
IF      1305
I1       551
Name: clarity, dtype: int64

In [11]:
original_train["cut"].value_counts()

Ideal        16154
Premium      10369
Very Good     9054
Good          3663
Fair          1215
Name: cut, dtype: int64

In [12]:
original_train["color"].value_counts()

G    8477
E    7370
F    7143
H    6196
D    5113
I    4102
J    2054
Name: color, dtype: int64

We have 3 categorical values. Let´s deal with them using the categorical encodings we´ve learnt in class. 

We are going to do a one hot encoding for the color variable, becuase its the only categorical with nominal variables. For cut and clarity we will use some ordinal encoding techniques since they are ordinal variables

#### One hot encoding 

In [16]:
#pd.get_dummies(original_train, columns=["color"], drop_first = True)

In [18]:
#original_train2.sample(1)

Unnamed: 0,id,carat,cut,clarity,depth,table,x,y,z,price,color_E,color_F,color_G,color_H,color_I,color_J
11978,11978,0.32,Ideal,VVS2,61.9,57.0,4.41,4.38,2.72,6.842,0,0,1,0,0,0


#### Ordinal encoding

For this encoding we could use the OrdinalEncoder from sklearn but I think its more appropiate to use pandas

After conducting some research on the matter, looks like color is also a ordinal variable. This means ordinal encoding is the correct method instead of the One hot encoding I thought at first. https://www.gia.edu/diamond-quality-factor#:~:text=Flawless%20is%20the%20top%20grade,price%20per%20carat%20also%20increases.

In [27]:
color_map = {
    "D": 7,
    "E": 6,
    "F": 5,
    "G": 4,
    "H": 3,
    "I": 2,
    "J": 1
}

In [28]:
original_train["COLOR"] = original_train.color.map(color_map)

Cut is obviously a ordinal variable

In [29]:
cut_map = {
    "Ideal": 5,
    "Premium": 4,
    "Very Good": 3, 
    "Good": 2,          
    "Fair": 1
}      

In [None]:
original_train["CUT"] = original_train.cut.map(cut_map)

Finally, clarity seems to be a ordinal variable as well https://www.americangemsociety.org/page/clarityscale#:~:text=Internally%20Flawless%20diamonds%20have%20no,confined%20to%20the%20surface%20only).&text=A%20diamond%20with%20a%20clarity,to%20see%20under%2010x%20magnification.

In [30]:
clarity_map = {
    "IF": 8,
    "VVS1": 7,
    "VVS2": 6,
    "VS1": 5,
    "VS2": 4,
    "SI1": 3,
    "SI2": 2,
    "I1": 1}

In [31]:
original_train["CLARITY"] = original_train.clarity.map(clarity_map)

In [32]:
original_train

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price,CUT,COLOR,CLARITY
0,0,0.53,Very Good,G,SI1,63.4,54.0,5.09,5.13,3.24,7.057,3,4,3
1,1,0.41,Ideal,D,SI1,63.0,56.0,4.80,4.75,3.01,6.824,5,7,3
2,2,0.32,Ideal,I,VS2,61.6,56.0,4.37,4.39,2.70,6.107,5,2,4
3,3,0.31,Ideal,H,VVS2,61.2,56.0,4.34,4.37,2.66,6.390,5,3,6
4,4,1.35,Premium,J,VS2,60.5,56.0,7.19,7.12,4.33,8.741,4,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,40450,0.52,Premium,D,VS2,61.2,58.0,5.16,5.20,3.17,7.508,4,7,4
40451,40451,0.52,Ideal,F,SI1,62.0,55.0,5.14,5.17,3.19,7.232,5,5,3
40452,40452,0.73,Very Good,D,VS2,63.5,58.0,5.68,5.72,3.62,8.065,3,7,4
40453,40453,0.31,Fair,F,VVS2,56.9,59.0,4.45,4.48,2.54,6.629,1,5,6


Now that we have the categorical values transformed into numerical ordinal values, let´s drop the old columns and the id value that we don´t need

In [39]:
original_train.drop("cut", axis = 1, inplace = True)
original_train.drop("color", axis = 1, inplace = True)
original_train.drop("clarity", axis = 1, inplace = True)
original_train.drop("id", axis = 1, inplace = True)

In [54]:
original_train

Unnamed: 0,carat,depth,table,x,y,z,price,CUT,COLOR,CLARITY
0,0.53,63.4,54.0,5.09,5.13,3.24,7.057,3,4,3
1,0.41,63.0,56.0,4.80,4.75,3.01,6.824,5,7,3
2,0.32,61.6,56.0,4.37,4.39,2.70,6.107,5,2,4
3,0.31,61.2,56.0,4.34,4.37,2.66,6.390,5,3,6
4,1.35,60.5,56.0,7.19,7.12,4.33,8.741,4,1,4
...,...,...,...,...,...,...,...,...,...,...
40450,0.52,61.2,58.0,5.16,5.20,3.17,7.508,4,7,4
40451,0.52,62.0,55.0,5.14,5.17,3.19,7.232,5,5,3
40452,0.73,63.5,58.0,5.68,5.72,3.62,8.065,3,7,4
40453,0.31,56.9,59.0,4.45,4.48,2.54,6.629,1,5,6


#### Missing values

Firstly, let´s look for null values on our data frame.

In [44]:
original_train.columns.isnull()

array([False, False, False, False, False, False, False, False, False,
       False, False])

Perfect! We don´t have Nulls!

### Function 

Let´s introduce all the cleaning process in a function 

In [63]:
def clean_diamonds(data):
    color_map = {
    "D": 7,
    "E": 6,
    "F": 5,
    "G": 4,
    "H": 3,
    "I": 2,
    "J": 1}
    data["COLOR"] = data.color.map(color_map)
    cut_map = {
    "Ideal": 5,
    "Premium": 4,
    "Very Good": 3, 
    "Good": 2,          
    "Fair": 1}
    data["CUT"] = data.cut.map(cut_map)
    clarity_map = {
    "IF": 8,
    "VVS1": 7,
    "VVS2": 6,
    "VS1": 5,
    "VS2": 4,
    "SI1": 3,
    "SI2": 2,
    "I1": 1}
    data["CLARITY"] = data.clarity.map(clarity_map)
    data.drop("cut", axis = 1, inplace = True)
    data.drop("color", axis = 1, inplace = True)
    data.drop("clarity", axis = 1, inplace = True)
    data.drop("id", axis = 1, inplace = True)
    return data

Other way of making our algorithms more precise is using standarization. Some models such as decision trees do not care about data maginitude but some such as KNN neighbors or linear regression with regularization do care. Just in case we use one of this algorithms let´s standarize.

In [55]:
scaler = StandardScaler()

In [56]:
scaler.fit(original_train)

StandardScaler()

In [57]:
original_train

Unnamed: 0,carat,depth,table,x,y,z,price,CUT,COLOR,CLARITY
0,0.53,63.4,54.0,5.09,5.13,3.24,7.057,3,4,3
1,0.41,63.0,56.0,4.80,4.75,3.01,6.824,5,7,3
2,0.32,61.6,56.0,4.37,4.39,2.70,6.107,5,2,4
3,0.31,61.2,56.0,4.34,4.37,2.66,6.390,5,3,6
4,1.35,60.5,56.0,7.19,7.12,4.33,8.741,4,1,4
...,...,...,...,...,...,...,...,...,...,...
40450,0.52,61.2,58.0,5.16,5.20,3.17,7.508,4,7,4
40451,0.52,62.0,55.0,5.14,5.17,3.19,7.232,5,5,3
40452,0.73,63.5,58.0,5.68,5.72,3.62,8.065,3,7,4
40453,0.31,56.9,59.0,4.45,4.48,2.54,6.629,1,5,6


In [58]:
original_train_standard = pd.DataFrame(scaler.transform(original_train), columns = original_train.columns)

In [59]:
original_train_standard

Unnamed: 0,carat,depth,table,x,y,z,price,CUT,COLOR,CLARITY
0,-0.561813,1.148125,-1.547175,-0.566863,-0.523303,-0.416948,-0.714240,-0.809998,-0.242292,-0.640828
1,-0.816215,0.869501,-0.650557,-0.825883,-0.856043,-0.742014,-0.944014,0.981412,1.522760,-0.640828
2,-1.007017,-0.105683,-0.650557,-1.209948,-1.171270,-1.180146,-1.651088,0.981412,-1.418993,-0.031237
3,-1.028217,-0.384307,-0.650557,-1.236743,-1.188783,-1.236679,-1.372006,0.981412,-0.830642,1.187945
4,1.176603,-0.871899,-0.650557,1.308803,1.219204,1.123581,0.946447,0.085707,-2.007344,-0.031237
...,...,...,...,...,...,...,...,...,...,...
40450,-0.583013,-0.384307,0.246062,-0.504340,-0.462009,-0.515881,-0.269483,0.085707,1.522760,-0.031237
40451,-0.583013,0.172941,-1.098866,-0.522204,-0.488278,-0.487614,-0.541662,0.981412,0.346059,-0.640828
40452,-0.137809,1.217781,0.246062,-0.039890,-0.006680,0.120117,0.279806,-0.809998,1.522760,-0.031237
40453,-1.028217,-3.379515,0.694371,-1.138494,-1.092463,-1.406278,-1.136314,-2.601407,0.346059,1.187945


### Function 

Let´s pack this process in a function as well.

In [60]:
def standard(data):
    scaler = StandardScaler()
    scaler.fit(data)
    data = pd.DataFrame(scaler.transform(data), columns = data.columns)
    return data