In [1]:
# imports

import numpy as np
import pandas as pd

import pickle

# matplotlib
import matplotlib.pyplot as plt   # conda install matplotlib

# seaborn
import seaborn as sns  

pd.options.mode.copy_on_write = True 
# https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

from sklearn.impute import SimpleImputer

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

from collections import Counter
from sklearn.datasets import make_classification

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

In [2]:
# loading a csv.
# Cargo el csv.

diamonds= pd.read_csv('./data/diamonds_train.csv')
diamonds

Unnamed: 0.1,Unnamed: 0,price,carat,city,color,cut,clarity,depth,table,x,y,z
0,0,4268,1.21,Dubai,J,Premium,VS2,62.4,58.0,6.83,6.79,4.25
1,1,505,0.32,Kimberly,H,Very Good,VS2,63.0,57.0,4.35,4.38,2.75
2,2,2686,0.71,Las Vegas,G,Fair,VS1,65.5,55.0,5.62,5.53,3.65
3,3,738,0.41,Kimberly,D,Good,SI1,63.8,56.0,4.68,4.72,3.00
4,4,4882,1.02,Dubai,G,Ideal,SI1,60.5,59.0,6.55,6.51,3.95
...,...,...,...,...,...,...,...,...,...,...,...,...
40450,40450,10070,1.34,Antwerp,G,Ideal,VS1,62.7,57.0,7.10,7.04,4.43
40451,40451,12615,2.02,Madrid,F,Good,SI2,57.1,60.0,8.31,8.25,4.73
40452,40452,5457,1.01,Kimberly,H,Ideal,SI1,62.7,56.0,6.37,6.42,4.01
40453,40453,456,0.33,Kimberly,J,Ideal,VS1,61.9,54.3,4.45,4.47,2.76


In [3]:
# I review the information in all columns.
# Reviso la información de todas las columnas.

diamonds.describe()

Unnamed: 0.1,Unnamed: 0,price,carat,depth,table,x,y,z
count,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0
mean,20227.0,3928.444469,0.797706,61.752841,57.446133,5.729392,5.732819,3.537154
std,11678.496907,3992.416147,0.475544,1.431725,2.233535,1.124453,1.14665,0.697062
min,0.0,326.0,0.2,43.0,43.0,0.0,0.0,0.0
25%,10113.5,945.0,0.4,61.0,56.0,4.71,4.72,2.91
50%,20227.0,2397.0,0.7,61.8,57.0,5.69,5.71,3.52
75%,30340.5,5331.0,1.04,62.5,59.0,6.54,6.54,4.035
max,40454.0,18823.0,4.5,79.0,95.0,10.23,58.9,8.06


In [4]:
# I check for rows with nulls that exist in each column.
# Verifico las filas con nulos que existen en cada columna. 

diamonds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  40455 non-null  int64  
 1   price       40455 non-null  int64  
 2   carat       40455 non-null  float64
 3   city        40455 non-null  object 
 4   color       40455 non-null  object 
 5   cut         40455 non-null  object 
 6   clarity     40455 non-null  object 
 7   depth       40455 non-null  float64
 8   table       40455 non-null  float64
 9   x           40455 non-null  float64
 10  y           40455 non-null  float64
 11  z           40455 non-null  float64
dtypes: float64(6), int64(2), object(4)
memory usage: 3.7+ MB


In [5]:
# I start cleaning the data. Since I don't have any null data, I start processing the categorical columns of the dataset. I do label encoding with the data.
# Empiezo a limpiar los datos. Como no tengo ningún dato nulo, empiezo a tratas las columnas categóricas del dataset. Hago un label encoding con los datos.

cols = ['city', 'color', 'cut', 'clarity']

diamonds_encoded = diamonds[cols]


cat_list = []
for col in cols:
    cat = diamonds[col].unique()
    cat_num = len(cat)
    cat_dict = {"categorical_variable":col,
                "number_of_possible_values":cat_num,
                "values":cat}
    cat_list.append(cat_dict)
    
categories = pd.DataFrame(cat_list).sort_values(by="number_of_possible_values",
                                                ascending=False).reset_index(drop=True)
categories

Unnamed: 0,categorical_variable,number_of_possible_values,values
0,city,13,"[Dubai, Kimberly, Las Vegas, Tel Aviv, Amsterd..."
1,clarity,8,"[VS2, VS1, SI1, SI2, IF, VVS1, VVS2, I1]"
2,color,7,"[J, H, G, D, F, E, I]"
3,cut,5,"[Premium, Very Good, Fair, Good, Ideal]"


In [6]:
categories['values']

0    [Dubai, Kimberly, Las Vegas, Tel Aviv, Amsterd...
1             [VS2, VS1, SI1, SI2, IF, VVS1, VVS2, I1]
2                                [J, H, G, D, F, E, I]
3              [Premium, Very Good, Fair, Good, Ideal]
Name: values, dtype: object

In [7]:
encoding = {'Premium':1, 'Very Good':2, 'Fair':3, 'Good':4, 'Ideal':5}
def ordinal_encoding(x):
    for key in encoding:
        if x == key:
            return encoding[key]

diamonds_encoded['cut_num'] = diamonds_encoded['cut'].apply(ordinal_encoding)
diamonds['cut'] = diamonds_encoded['cut'].apply(ordinal_encoding)

#diamonds_encoded
diamonds

Unnamed: 0.1,Unnamed: 0,price,carat,city,color,cut,clarity,depth,table,x,y,z
0,0,4268,1.21,Dubai,J,1,VS2,62.4,58.0,6.83,6.79,4.25
1,1,505,0.32,Kimberly,H,2,VS2,63.0,57.0,4.35,4.38,2.75
2,2,2686,0.71,Las Vegas,G,3,VS1,65.5,55.0,5.62,5.53,3.65
3,3,738,0.41,Kimberly,D,4,SI1,63.8,56.0,4.68,4.72,3.00
4,4,4882,1.02,Dubai,G,5,SI1,60.5,59.0,6.55,6.51,3.95
...,...,...,...,...,...,...,...,...,...,...,...,...
40450,40450,10070,1.34,Antwerp,G,5,VS1,62.7,57.0,7.10,7.04,4.43
40451,40451,12615,2.02,Madrid,F,4,SI2,57.1,60.0,8.31,8.25,4.73
40452,40452,5457,1.01,Kimberly,H,5,SI1,62.7,56.0,6.37,6.42,4.01
40453,40453,456,0.33,Kimberly,J,5,VS1,61.9,54.3,4.45,4.47,2.76


In [8]:

encoding = {'VS2':1, 'VS1':2, 'SI1':3, 'SI2':4, 'IF':5, 'VVS1':6, 'VVS2':7, 'I1':8}
def ordinal_encoding(x):
    for key in encoding:
        if x == key:
            return encoding[key]

diamonds_encoded['clarity_num'] = diamonds_encoded['clarity'].apply(ordinal_encoding)
diamonds['clarity'] = diamonds_encoded['clarity'].apply(ordinal_encoding)

#diamonds_encoded
diamonds

Unnamed: 0.1,Unnamed: 0,price,carat,city,color,cut,clarity,depth,table,x,y,z
0,0,4268,1.21,Dubai,J,1,1,62.4,58.0,6.83,6.79,4.25
1,1,505,0.32,Kimberly,H,2,1,63.0,57.0,4.35,4.38,2.75
2,2,2686,0.71,Las Vegas,G,3,2,65.5,55.0,5.62,5.53,3.65
3,3,738,0.41,Kimberly,D,4,3,63.8,56.0,4.68,4.72,3.00
4,4,4882,1.02,Dubai,G,5,3,60.5,59.0,6.55,6.51,3.95
...,...,...,...,...,...,...,...,...,...,...,...,...
40450,40450,10070,1.34,Antwerp,G,5,2,62.7,57.0,7.10,7.04,4.43
40451,40451,12615,2.02,Madrid,F,4,4,57.1,60.0,8.31,8.25,4.73
40452,40452,5457,1.01,Kimberly,H,5,3,62.7,56.0,6.37,6.42,4.01
40453,40453,456,0.33,Kimberly,J,5,2,61.9,54.3,4.45,4.47,2.76


In [9]:

encoding = {'J':1, 'H':2, 'G':3, 'D':4, 'F':5, 'E':6, 'I':7}
def ordinal_encoding(x):
    for key in encoding:
        if x == key:
            return encoding[key]

diamonds_encoded['color_num'] = diamonds_encoded['color'].apply(ordinal_encoding)
diamonds['color'] = diamonds_encoded['color'].apply(ordinal_encoding)

#diamonds_encoded
diamonds


Unnamed: 0.1,Unnamed: 0,price,carat,city,color,cut,clarity,depth,table,x,y,z
0,0,4268,1.21,Dubai,1,1,1,62.4,58.0,6.83,6.79,4.25
1,1,505,0.32,Kimberly,2,2,1,63.0,57.0,4.35,4.38,2.75
2,2,2686,0.71,Las Vegas,3,3,2,65.5,55.0,5.62,5.53,3.65
3,3,738,0.41,Kimberly,4,4,3,63.8,56.0,4.68,4.72,3.00
4,4,4882,1.02,Dubai,3,5,3,60.5,59.0,6.55,6.51,3.95
...,...,...,...,...,...,...,...,...,...,...,...,...
40450,40450,10070,1.34,Antwerp,3,5,2,62.7,57.0,7.10,7.04,4.43
40451,40451,12615,2.02,Madrid,5,4,4,57.1,60.0,8.31,8.25,4.73
40452,40452,5457,1.01,Kimberly,2,5,3,62.7,56.0,6.37,6.42,4.01
40453,40453,456,0.33,Kimberly,1,5,2,61.9,54.3,4.45,4.47,2.76


In [10]:
# I want to know the name of the cities of the colum "city".
# Quiero saber las ciudades existentes en la columna "city".

diamonds['city'].unique()

array(['Dubai', 'Kimberly', 'Las Vegas', 'Tel Aviv', 'Amsterdam',
       'Zurich', 'Antwerp', 'Madrid', 'Paris', 'Surat', 'Luxembourg',
       'London', 'New York City'], dtype=object)

In [11]:

encoding = {'Dubai':1, 'Kimberly':1, 'Las Vegas':1, 'Tel Aviv':1, 'Amsterdam':1,
       'Zurich':1, 'Antwerp':1, 'Madrid':1, 'Paris':1, 'Surat':1, 'Luxembourg':1,
       'London':1, 'New York City':1}
def ordinal_encoding(x):
    for key in encoding:
        if x == key:
            return encoding[key]

diamonds_encoded['city_num'] = diamonds_encoded['city'].apply(ordinal_encoding)
diamonds['city'] = diamonds_encoded['city'].apply(ordinal_encoding)

#diamonds_encoded
diamonds


Unnamed: 0.1,Unnamed: 0,price,carat,city,color,cut,clarity,depth,table,x,y,z
0,0,4268,1.21,1,1,1,1,62.4,58.0,6.83,6.79,4.25
1,1,505,0.32,1,2,2,1,63.0,57.0,4.35,4.38,2.75
2,2,2686,0.71,1,3,3,2,65.5,55.0,5.62,5.53,3.65
3,3,738,0.41,1,4,4,3,63.8,56.0,4.68,4.72,3.00
4,4,4882,1.02,1,3,5,3,60.5,59.0,6.55,6.51,3.95
...,...,...,...,...,...,...,...,...,...,...,...,...
40450,40450,10070,1.34,1,3,5,2,62.7,57.0,7.10,7.04,4.43
40451,40451,12615,2.02,1,5,4,4,57.1,60.0,8.31,8.25,4.73
40452,40452,5457,1.01,1,2,5,3,62.7,56.0,6.37,6.42,4.01
40453,40453,456,0.33,1,1,5,2,61.9,54.3,4.45,4.47,2.76


In [12]:
diamonds_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   city         40455 non-null  object
 1   color        40455 non-null  object
 2   cut          40455 non-null  object
 3   clarity      40455 non-null  object
 4   cut_num      40455 non-null  int64 
 5   clarity_num  40455 non-null  int64 
 6   color_num    40455 non-null  int64 
 7   city_num     40455 non-null  int64 
dtypes: int64(4), object(4)
memory usage: 2.5+ MB


In [13]:
diamonds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  40455 non-null  int64  
 1   price       40455 non-null  int64  
 2   carat       40455 non-null  float64
 3   city        40455 non-null  int64  
 4   color       40455 non-null  int64  
 5   cut         40455 non-null  int64  
 6   clarity     40455 non-null  int64  
 7   depth       40455 non-null  float64
 8   table       40455 non-null  float64
 9   x           40455 non-null  float64
 10  y           40455 non-null  float64
 11  z           40455 non-null  float64
dtypes: float64(6), int64(6)
memory usage: 3.7 MB


In [14]:
#diamonds.drop(['city', 'color', 'clarity', 'cut'], axis='columns', inplace=True)

In [15]:
# I verify that the dataset has the new data saved with the label encoding.
# Verifico que el dataset este con los nuevos datos guardados con el label encoding.

diamonds

Unnamed: 0.1,Unnamed: 0,price,carat,city,color,cut,clarity,depth,table,x,y,z
0,0,4268,1.21,1,1,1,1,62.4,58.0,6.83,6.79,4.25
1,1,505,0.32,1,2,2,1,63.0,57.0,4.35,4.38,2.75
2,2,2686,0.71,1,3,3,2,65.5,55.0,5.62,5.53,3.65
3,3,738,0.41,1,4,4,3,63.8,56.0,4.68,4.72,3.00
4,4,4882,1.02,1,3,5,3,60.5,59.0,6.55,6.51,3.95
...,...,...,...,...,...,...,...,...,...,...,...,...
40450,40450,10070,1.34,1,3,5,2,62.7,57.0,7.10,7.04,4.43
40451,40451,12615,2.02,1,5,4,4,57.1,60.0,8.31,8.25,4.73
40452,40452,5457,1.01,1,2,5,3,62.7,56.0,6.37,6.42,4.01
40453,40453,456,0.33,1,1,5,2,61.9,54.3,4.45,4.47,2.76


In [16]:
diamonds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  40455 non-null  int64  
 1   price       40455 non-null  int64  
 2   carat       40455 non-null  float64
 3   city        40455 non-null  int64  
 4   color       40455 non-null  int64  
 5   cut         40455 non-null  int64  
 6   clarity     40455 non-null  int64  
 7   depth       40455 non-null  float64
 8   table       40455 non-null  float64
 9   x           40455 non-null  float64
 10  y           40455 non-null  float64
 11  z           40455 non-null  float64
dtypes: float64(6), int64(6)
memory usage: 3.7 MB


In [17]:


# Features + target. Target is "price"
# Funciones + target. Target es "price".

X = diamonds[['carat',
              'city',
              'color',
              'cut',
              'clarity',
              'depth',
              'table',
              'x',
              'y',
              'z']]
y =diamonds['price']
print(X.shape,y.shape)


(40455, 10) (40455,)


In [18]:
# I'm about to train the data. 80% for training and 20% for testing. Train + test
# Me dipongo a entrenar los datos. 80% para entrenar y 20% para test. Entrenamiento + test.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}")
print(f"X_train: {type(X_train)}, X_test: {type(X_test)}, y_train: {type(y_train)}, y_test: {type(y_test)}")



X_train: (32364, 10), X_test: (8091, 10), y_train: (32364,), y_test: (8091,)
X_train: <class 'pandas.core.frame.DataFrame'>, X_test: <class 'pandas.core.frame.DataFrame'>, y_train: <class 'pandas.core.series.Series'>, y_test: <class 'pandas.core.series.Series'>


In [19]:
X_train

Unnamed: 0,carat,city,color,cut,clarity,depth,table,x,y,z
32121,0.52,1,3,2,1,63.2,58.0,5.12,5.10,3.23
9831,1.59,1,4,2,4,59.9,59.0,7.60,7.52,4.53
33128,0.66,1,3,5,3,61.7,55.0,5.64,5.60,3.47
6199,0.38,1,2,5,3,61.2,55.1,4.69,4.73,2.88
19661,0.70,1,3,1,7,61.8,58.0,5.67,5.63,3.49
...,...,...,...,...,...,...,...,...,...,...
6265,0.71,1,2,5,6,62.0,57.0,5.71,5.75,3.55
11284,0.35,1,6,1,1,59.5,58.0,4.62,4.59,2.74
38158,0.23,1,6,2,7,59.4,59.0,4.03,4.08,2.41
860,1.00,1,7,4,3,58.0,58.0,6.56,6.62,3.82


In [20]:
# Model definition.
# Defino el modelo.

model = RandomForestRegressor()
print(type(model))

<class 'sklearn.ensemble._forest.RandomForestRegressor'>


In [21]:
# Model training.
# Entreno el modelo.

weights = model.fit(X_train, y_train)
print(type(weights))

<class 'sklearn.ensemble._forest.RandomForestRegressor'>


In [22]:
# Save model using pickle.
# Guardo el modelo usando pickle.

filename = 'trained_model3.sav'
pickle.dump(model, open(filename, 'wb'))
print('Your model has been saved with  pickle!!!')

Your model has been saved with  pickle!!!
