### _Imports_

In [1]:
import pandas as pd
import numpy as np

### Create DataFrame from csv

In [2]:
# Read the training split into memory, then preview its first five rows.
diamonds = pd.read_csv(filepath_or_buffer='../data/diamonds_train.csv')
diamonds.head(n=5)

Unnamed: 0,price,carat,city,depth,table,x,y,z,cut,color,clarity
0,4268,1.21,Dubai,62.4,58.0,6.83,6.79,4.25,Premium,J,VS2
1,505,0.32,Kimberly,63.0,57.0,4.35,4.38,2.75,Very Good,H,VS2
2,2686,0.71,Las Vegas,65.5,55.0,5.62,5.53,3.65,Fair,G,VS1
3,738,0.41,Kimberly,63.8,56.0,4.68,4.72,3.0,Good,D,SI1
4,4882,1.02,Dubai,60.5,59.0,6.55,6.51,3.95,Ideal,G,SI1


In [3]:
# Print each column's dtype and non-null count: used to spot missing values
# and to see which columns are stored as strings (object dtype).
diamonds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   price    40455 non-null  int64  
 1   carat    40455 non-null  float64
 2   city     40455 non-null  object 
 3   depth    40455 non-null  float64
 4   table    40455 non-null  float64
 5   x        40455 non-null  float64
 6   y        40455 non-null  float64
 7   z        40455 non-null  float64
 8   cut      40455 non-null  object 
 9   color    40455 non-null  object 
 10  clarity  40455 non-null  object 
dtypes: float64(6), int64(1), object(4)
memory usage: 3.4+ MB


There are no nulls, so there is no need to handle missing values and we can move straight on to encoding.

### Intro to encoding: categorical variables overview

In [4]:
# Defining a function for visualization:

def cat_var(df, cols):
    '''
    Summarise the unique values taken by a set of categorical variables.

    Input parameters:
        - df -> Pandas dataframe object: a dataframe with categorical variables.
        - cols -> list object: a list with the name (string) of every categorical variable to analyse.
          An empty list is accepted and yields an empty summary.

    Return: a new Pandas dataframe object with one row per entry of cols, sorted by
    "number_of_possible_values" in descending order and re-indexed from 0, with the following columns:
        - "categorical_variable" => the name of the categorical variable (string).
        - "number_of_possible_values" => the number of unique values the variable takes in df (integer).
        - "values" => the unique values themselves, exactly as returned by Series.unique().

    Raises:
        - KeyError: if a name in cols is not a column of df.
    '''
    summary_columns = ["categorical_variable", "number_of_possible_values", "values"]

    records = []
    for col in cols:
        unique_values = df[col].unique()
        records.append({"categorical_variable": col,
                        "number_of_possible_values": len(unique_values),
                        "values": unique_values})

    # Naming the columns explicitly keeps the schema (and therefore the sort key)
    # present even when cols is empty; without it sort_values raises KeyError.
    summary = pd.DataFrame(records, columns=summary_columns)
    summary = summary.sort_values(by="number_of_possible_values", ascending=False)
    return summary.reset_index(drop=True)

In [5]:
# Names of the string-typed (object) columns that need to be encoded.
cat_var_lst = [
    "city",
    "cut",
    "color",
    "clarity",
]

# One summary row per categorical column: how many distinct values, and which ones.
df_cat_var = cat_var(df=diamonds, cols=cat_var_lst)
df_cat_var

Unnamed: 0,categorical_variable,number_of_possible_values,values
0,city,13,"[Dubai, Kimberly, Las Vegas, Tel Aviv, Amsterd..."
1,clarity,8,"[VS2, VS1, SI1, SI2, IF, VVS1, VVS2, I1]"
2,color,7,"[J, H, G, D, F, E, I]"
3,cut,5,"[Premium, Very Good, Fair, Good, Ideal]"


## Pre-processing I: baseline

### One hot encoding

In [12]:
# Split the frame column-wise: the categorical columns to encode on one side,
# every remaining column (kept as-is) on the other.
cat_vars = diamonds.loc[:, cat_var_lst]
non_cat_vars = diamonds.drop(columns=cat_var_lst)

In [13]:
# One-hot encode every categorical column as 0/1 integers. drop_first=True drops
# the first level of each variable, so k levels become k-1 indicator columns.
cat_vars_encoded = pd.get_dummies(
    data=cat_vars,
    drop_first=True,
    dtype=int,
)

In [14]:
# Put the untouched columns and the indicator columns back side by side;
# both pieces come from the same frame, so their row indexes line up.
baseline_train = pd.concat(
    objs=[non_cat_vars, cat_vars_encoded],
    axis="columns",
)

In [15]:
# Preview the first rows of the encoded frame to eyeball the new indicator columns.
baseline_train.head()

Unnamed: 0,price,carat,depth,table,x,y,z,city_Antwerp,city_Dubai,city_Kimberly,...,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,4268,1.21,62.4,58.0,6.83,6.79,4.25,0,1,0,...,0,0,1,0,0,0,0,1,0,0
1,505,0.32,63.0,57.0,4.35,4.38,2.75,0,0,1,...,1,0,0,0,0,0,0,1,0,0
2,2686,0.71,65.5,55.0,5.62,5.53,3.65,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,738,0.41,63.8,56.0,4.68,4.72,3.0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
4,4882,1.02,60.5,59.0,6.55,6.51,3.95,0,1,0,...,0,0,0,0,1,0,0,0,0,0


Checking that all features are numeric

In [16]:
# Print dtypes and non-null counts: after encoding, no object-dtype column should remain.
baseline_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 36 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   price               40455 non-null  int64  
 1   carat               40455 non-null  float64
 2   depth               40455 non-null  float64
 3   table               40455 non-null  float64
 4   x                   40455 non-null  float64
 5   y                   40455 non-null  float64
 6   z                   40455 non-null  float64
 7   city_Antwerp        40455 non-null  int64  
 8   city_Dubai          40455 non-null  int64  
 9   city_Kimberly       40455 non-null  int64  
 10  city_Las Vegas      40455 non-null  int64  
 11  city_London         40455 non-null  int64  
 12  city_Luxembourg     40455 non-null  int64  
 13  city_Madrid         40455 non-null  int64  
 14  city_New York City  40455 non-null  int64  
 15  city_Paris          40455 non-null  int64  
 16  city

### Save pre-processing

In [17]:
# Persist the encoded training frame; index=False keeps the row index out of the file.
baseline_train.to_csv(
    path_or_buf='../data/baseline_train.csv',
    index=False,
)