In [1]:
# imports

import numpy as np
import pandas as pd

# matplotlib
import matplotlib.pyplot as plt   # conda install matplotlib

# seaborn
import seaborn as sns  

pd.options.mode.copy_on_write = True 
# https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

from sklearn.impute import SimpleImputer

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

from collections import Counter
from sklearn.datasets import make_classification

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

import pickle

In [2]:
# Load the diamonds test CSV from the local data folder into a DataFrame
# and display it.

diamonds= pd.read_csv('./data/diamonds_test.csv')
diamonds


Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,city
0,0,0.79,Very Good,F,SI1,62.7,60.0,5.82,5.89,3.67,Amsterdam
1,1,1.20,Ideal,J,VS1,61.0,57.0,6.81,6.89,4.18,Surat
2,2,1.57,Premium,H,SI1,62.2,61.0,7.38,7.32,4.57,Kimberly
3,3,0.90,Very Good,F,SI1,63.8,54.0,6.09,6.13,3.90,Kimberly
4,4,0.50,Very Good,F,VS1,62.9,58.0,5.05,5.09,3.19,Amsterdam
...,...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.57,Ideal,E,SI1,61.9,56.0,5.35,5.32,3.30,Amsterdam
13481,13481,0.71,Ideal,I,VS2,62.2,55.0,5.71,5.73,3.56,New York City
13482,13482,0.70,Ideal,F,VS1,61.6,55.0,5.75,5.71,3.53,Tel Aviv
13483,13483,0.70,Very Good,F,SI2,58.8,57.0,5.85,5.89,3.45,Surat


In [3]:
# Summary statistics of the dataset. As the output shows, describe() covers the
# numeric columns only (cut, color, clarity and city are not included).
# NOTE(review): the output reports a minimum of 0.0 for x, y and z — presumably
# invalid measurements; confirm whether they need handling.

diamonds.describe()

Unnamed: 0,id,carat,depth,table,x,y,z
count,13485.0,13485.0,13485.0,13485.0,13485.0,13485.0,13485.0
mean,6742.0,0.798642,61.739095,57.490337,5.736454,5.739648,3.543474
std,3892.928525,0.469399,1.43531,2.237109,1.113671,1.128507,0.731005
min,0.0,0.2,50.8,51.0,0.0,0.0,0.0
25%,3371.0,0.4,61.0,56.0,4.73,4.73,2.92
50%,6742.0,0.7,61.9,57.0,5.7,5.72,3.53
75%,10113.0,1.04,62.5,59.0,6.53,6.53,4.04
max,13484.0,5.01,79.0,73.0,10.74,31.8,31.8


In [4]:
# Column dtypes and non-null counts of the raw frame.
diamonds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13485 entries, 0 to 13484
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       13485 non-null  int64  
 1   carat    13485 non-null  float64
 2   cut      13485 non-null  object 
 3   color    13485 non-null  object 
 4   clarity  13485 non-null  object 
 5   depth    13485 non-null  float64
 6   table    13485 non-null  float64
 7   x        13485 non-null  float64
 8   y        13485 non-null  float64
 9   z        13485 non-null  float64
 10  city     13485 non-null  object 
dtypes: float64(6), int64(1), object(4)
memory usage: 1.1+ MB


In [5]:
# Start of data cleaning. The dataset has no null values, so the first step is
# the categorical columns, which are label-encoded in the cells that follow.
cols = ['city', 'color', 'cut', 'clarity']

# Working frame holding the original string labels. Copy-on-write is enabled in
# the imports cell, so columns added here later do not write back to `diamonds`.
diamonds_encoded = diamonds[cols]

# Distinct labels per categorical column, in order of first appearance.
unique_by_col = {name: diamonds[name].unique() for name in cols}

# One summary record per categorical column.
cat_list = [
    {"categorical_variable": name,
     "number_of_possible_values": len(labels),
     "values": labels}
    for name, labels in unique_by_col.items()
]

# Summary table ordered from the column with most distinct labels to the least.
categories = (
    pd.DataFrame(cat_list)
    .sort_values(by="number_of_possible_values", ascending=False)
    .reset_index(drop=True)
)
categories

Unnamed: 0,categorical_variable,number_of_possible_values,values
0,city,13,"[Amsterdam, Surat, Kimberly, Paris, Tel Aviv, ..."
1,clarity,8,"[SI1, VS1, VS2, VVS1, SI2, VVS2, IF, I1]"
2,color,7,"[F, J, H, D, I, G, E]"
3,cut,5,"[Very Good, Ideal, Premium, Good, Fair]"


In [6]:
# Show the distinct labels found for each categorical column.
categories['values']

0    [Amsterdam, Surat, Kimberly, Paris, Tel Aviv, ...
1             [SI1, VS1, VS2, VVS1, SI2, VVS2, IF, I1]
2                                [F, J, H, D, I, G, E]
3              [Very Good, Ideal, Premium, Good, Fair]
Name: values, dtype: object

In [7]:
# Integer code assigned to each cut label.
# NOTE(review): the codes do not look like a quality ranking; presumably they
# mirror the mapping used when the saved model was trained — confirm before
# changing any value.
encoding = {'Premium':1, 'Very Good':2, 'Fair':3, 'Good':4, 'Ideal':5}


def ordinal_encoding(x):
    """Return the integer code for label ``x``, or ``None`` if ``x`` is not in ``encoding``."""
    # Direct dictionary lookup; replaces the manual scan over every key.
    return encoding.get(x)


# Encode once from the untouched string labels kept in diamonds_encoded, then
# store the result in both frames (the original recomputed it for each frame).
cut_codes = diamonds_encoded['cut'].apply(ordinal_encoding)
diamonds_encoded['cut_num'] = cut_codes
diamonds['cut'] = cut_codes

diamonds

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,city
0,0,0.79,2,F,SI1,62.7,60.0,5.82,5.89,3.67,Amsterdam
1,1,1.20,5,J,VS1,61.0,57.0,6.81,6.89,4.18,Surat
2,2,1.57,1,H,SI1,62.2,61.0,7.38,7.32,4.57,Kimberly
3,3,0.90,2,F,SI1,63.8,54.0,6.09,6.13,3.90,Kimberly
4,4,0.50,2,F,VS1,62.9,58.0,5.05,5.09,3.19,Amsterdam
...,...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.57,5,E,SI1,61.9,56.0,5.35,5.32,3.30,Amsterdam
13481,13481,0.71,5,I,VS2,62.2,55.0,5.71,5.73,3.56,New York City
13482,13482,0.70,5,F,VS1,61.6,55.0,5.75,5.71,3.53,Tel Aviv
13483,13483,0.70,2,F,SI2,58.8,57.0,5.85,5.89,3.45,Surat


In [8]:

# Integer code assigned to each clarity label.
# NOTE(review): the codes do not look like a quality ranking; presumably they
# mirror the mapping used when the saved model was trained — confirm before
# changing any value.
encoding = {'VS2':1, 'VS1':2, 'SI1':3, 'SI2':4, 'IF':5, 'VVS1':6, 'VVS2':7, 'I1':8}


def ordinal_encoding(x):
    """Return the integer code for label ``x``, or ``None`` if ``x`` is not in ``encoding``."""
    # Direct dictionary lookup; replaces the manual scan over every key.
    return encoding.get(x)


# Encode once from the untouched string labels kept in diamonds_encoded, then
# store the result in both frames (the original recomputed it for each frame).
clarity_codes = diamonds_encoded['clarity'].apply(ordinal_encoding)
diamonds_encoded['clarity_num'] = clarity_codes
diamonds['clarity'] = clarity_codes

diamonds

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,city
0,0,0.79,2,F,3,62.7,60.0,5.82,5.89,3.67,Amsterdam
1,1,1.20,5,J,2,61.0,57.0,6.81,6.89,4.18,Surat
2,2,1.57,1,H,3,62.2,61.0,7.38,7.32,4.57,Kimberly
3,3,0.90,2,F,3,63.8,54.0,6.09,6.13,3.90,Kimberly
4,4,0.50,2,F,2,62.9,58.0,5.05,5.09,3.19,Amsterdam
...,...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.57,5,E,3,61.9,56.0,5.35,5.32,3.30,Amsterdam
13481,13481,0.71,5,I,1,62.2,55.0,5.71,5.73,3.56,New York City
13482,13482,0.70,5,F,2,61.6,55.0,5.75,5.71,3.53,Tel Aviv
13483,13483,0.70,2,F,4,58.8,57.0,5.85,5.89,3.45,Surat


In [9]:

# Integer code assigned to each color label.
# NOTE(review): the codes do not follow alphabetical order of the labels;
# presumably they mirror the mapping used when the saved model was trained —
# confirm before changing any value.
encoding = {'J':1, 'H':2, 'G':3, 'D':4, 'F':5, 'E':6, 'I':7}


def ordinal_encoding(x):
    """Return the integer code for label ``x``, or ``None`` if ``x`` is not in ``encoding``."""
    # Direct dictionary lookup; replaces the manual scan over every key.
    return encoding.get(x)


# Encode once from the untouched string labels kept in diamonds_encoded, then
# store the result in both frames (the original recomputed it for each frame).
color_codes = diamonds_encoded['color'].apply(ordinal_encoding)
diamonds_encoded['color_num'] = color_codes
diamonds['color'] = color_codes

diamonds


Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,city
0,0,0.79,2,5,3,62.7,60.0,5.82,5.89,3.67,Amsterdam
1,1,1.20,5,1,2,61.0,57.0,6.81,6.89,4.18,Surat
2,2,1.57,1,2,3,62.2,61.0,7.38,7.32,4.57,Kimberly
3,3,0.90,2,5,3,63.8,54.0,6.09,6.13,3.90,Kimberly
4,4,0.50,2,5,2,62.9,58.0,5.05,5.09,3.19,Amsterdam
...,...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.57,5,6,3,61.9,56.0,5.35,5.32,3.30,Amsterdam
13481,13481,0.71,5,7,1,62.2,55.0,5.71,5.73,3.56,New York City
13482,13482,0.70,5,5,2,61.6,55.0,5.75,5.71,3.53,Tel Aviv
13483,13483,0.70,2,5,4,58.8,57.0,5.85,5.89,3.45,Surat


In [10]:
# List the distinct city names present in the "city" column.

diamonds['city'].unique()

array(['Amsterdam', 'Surat', 'Kimberly', 'Paris', 'Tel Aviv', 'Antwerp',
       'Madrid', 'Dubai', 'New York City', 'Las Vegas', 'London',
       'Luxembourg', 'Zurich'], dtype=object)

In [11]:

# Integer code assigned to each city label.
# NOTE(review): every city maps to 1, so the encoded "city" column is a constant
# and carries no information. Presumably this mirrors how the saved model was
# trained — confirm; changing it here alone would make the features disagree
# with the model.
encoding = {'Dubai':1, 'Kimberly':1, 'Las Vegas':1, 'Tel Aviv':1, 'Amsterdam':1,
       'Zurich':1, 'Antwerp':1, 'Madrid':1, 'Paris':1, 'Surat':1, 'Luxembourg':1,
       'London':1, 'New York City':1}


def ordinal_encoding(x):
    """Return the integer code for label ``x``, or ``None`` if ``x`` is not in ``encoding``."""
    # Direct dictionary lookup; replaces the manual scan over every key.
    return encoding.get(x)


# Encode once from the untouched string labels kept in diamonds_encoded, then
# store the result in both frames (the original recomputed it for each frame).
city_codes = diamonds_encoded['city'].apply(ordinal_encoding)
diamonds_encoded['city_num'] = city_codes
diamonds['city'] = city_codes

diamonds


Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,city
0,0,0.79,2,5,3,62.7,60.0,5.82,5.89,3.67,1
1,1,1.20,5,1,2,61.0,57.0,6.81,6.89,4.18,1
2,2,1.57,1,2,3,62.2,61.0,7.38,7.32,4.57,1
3,3,0.90,2,5,3,63.8,54.0,6.09,6.13,3.90,1
4,4,0.50,2,5,2,62.9,58.0,5.05,5.09,3.19,1
...,...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.57,5,6,3,61.9,56.0,5.35,5.32,3.30,1
13481,13481,0.71,5,7,1,62.2,55.0,5.71,5.73,3.56,1
13482,13482,0.70,5,5,2,61.6,55.0,5.75,5.71,3.53,1
13483,13483,0.70,2,5,4,58.8,57.0,5.85,5.89,3.45,1


In [12]:
# Check the working frame: original string columns plus the four *_num code columns.
diamonds_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13485 entries, 0 to 13484
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   city         13485 non-null  object
 1   color        13485 non-null  object
 2   cut          13485 non-null  object
 3   clarity      13485 non-null  object
 4   cut_num      13485 non-null  int64 
 5   clarity_num  13485 non-null  int64 
 6   color_num    13485 non-null  int64 
 7   city_num     13485 non-null  int64 
dtypes: int64(4), object(4)
memory usage: 842.9+ KB


In [13]:
# Check that cut, color, clarity and city are now integer columns with no nulls.
diamonds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13485 entries, 0 to 13484
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       13485 non-null  int64  
 1   carat    13485 non-null  float64
 2   cut      13485 non-null  int64  
 3   color    13485 non-null  int64  
 4   clarity  13485 non-null  int64  
 5   depth    13485 non-null  float64
 6   table    13485 non-null  float64
 7   x        13485 non-null  float64
 8   y        13485 non-null  float64
 9   z        13485 non-null  float64
 10  city     13485 non-null  int64  
dtypes: float64(6), int64(5)
memory usage: 1.1 MB


In [14]:
# Verify that `diamonds` now holds the integer codes written by the encoding cells.

diamonds

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,city
0,0,0.79,2,5,3,62.7,60.0,5.82,5.89,3.67,1
1,1,1.20,5,1,2,61.0,57.0,6.81,6.89,4.18,1
2,2,1.57,1,2,3,62.2,61.0,7.38,7.32,4.57,1
3,3,0.90,2,5,3,63.8,54.0,6.09,6.13,3.90,1
4,4,0.50,2,5,2,62.9,58.0,5.05,5.09,3.19,1
...,...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.57,5,6,3,61.9,56.0,5.35,5.32,3.30,1
13481,13481,0.71,5,7,1,62.2,55.0,5.71,5.73,3.56,1
13482,13482,0.70,5,5,2,61.6,55.0,5.75,5.71,3.53,1
13483,13483,0.70,2,5,4,58.8,57.0,5.85,5.89,3.45,1


In [15]:
# Dtypes and non-null counts of `diamonds` before the prediction steps that follow.
diamonds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13485 entries, 0 to 13484
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       13485 non-null  int64  
 1   carat    13485 non-null  float64
 2   cut      13485 non-null  int64  
 3   color    13485 non-null  int64  
 4   clarity  13485 non-null  int64  
 5   depth    13485 non-null  float64
 6   table    13485 non-null  float64
 7   x        13485 non-null  float64
 8   y        13485 non-null  float64
 9   z        13485 non-null  float64
 10  city     13485 non-null  int64  
dtypes: float64(6), int64(5)
memory usage: 1.1 MB


In [16]:
# Features + target (disabled).
# The code below is wrapped in a string literal, so none of it runs; the cell
# only evaluates to that string.
# NOTE(review): if re-enabled as written, print(X.shape,y.shape) would fail,
# because the assignment to y is commented out inside the block.
'''
X = diamonds[['carat',
              'city',
              'color',
              'cut',
              'clarity',
              'depth',
              'table',
              'x',
              'y',
              'z']]
#y =diamonds['price']
print(X.shape,y.shape)
'''

"\nX = diamonds[['carat',\n              'city',\n              'color',\n              'cut',\n              'clarity',\n              'depth',\n              'table',\n              'x',\n              'y',\n              'z']]\n#y =diamonds['price']\nprint(X.shape,y.shape)\n"

In [17]:
# Load the previously trained model from disk.
# NOTE(security): unpickling can execute arbitrary code contained in the file;
# only load model files that you created yourself or otherwise trust.

filename = 'trained_model3.sav'

# The with-block closes the file handle as soon as the model is loaded
# (the original left the handle returned by open() unclosed).
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)



In [18]:
# List the available column names before selecting the model inputs.
diamonds.columns

Index(['id', 'carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y',
       'z', 'city'],
      dtype='object')

In [19]:
# Input frame for the model: every column except "id", in the order listed here.
# NOTE(review): presumably this order must match the one used at training time — confirm.
diamonds_df = diamonds.loc[:, [
    'carat',
    'city',
    'color',
    'cut',
    'clarity',
    'depth',
    'table',
    'x',
    'y',
    'z',
]]

In [20]:
# Run the loaded model on the selected feature columns.
# The result comes back as a NumPy array, as the printed type shows.

predictions = loaded_model.predict(diamonds_df)
print(type(predictions))


<class 'numpy.ndarray'>


In [21]:
# Preview the raw predictions as a table (the column is unnamed at this point).
pd.DataFrame(predictions)

Unnamed: 0,0
0,3020.39
1,5371.75
2,8965.40
3,4150.03
4,1709.73
...,...
13480,1892.02
13481,2591.79
13482,2924.94
13483,2224.25


In [22]:
# Wrap the prediction array in a DataFrame whose single column is named "price",
# the same name as the target.

submission_3 = pd.DataFrame(predictions,columns=['price'])

In [23]:
# Display the single-column "price" frame.
submission_3

Unnamed: 0,price
0,3020.39
1,5371.75
2,8965.40
3,4150.03
4,1709.73
...,...
13480,1892.02
13481,2591.79
13482,2924.94
13483,2224.25


In [24]:
# Add the "id" column required to compare against "sample_submision.csv".
# The id is the row position of each prediction.
# NOTE(review): this presumes the rows of the test file are in id order
# (id == row index, as the displayed frame suggests) — confirm.

column_nm=['id', 'price']

# Select only "price" before resetting the index so the cell can be re-run
# safely. Previously a second run produced three columns and the two-name
# column assignment below failed with a length mismatch.
submission_3 = submission_3[['price']].reset_index()
submission_3.columns = column_nm
submission_3

Unnamed: 0,id,price
0,0,3020.39
1,1,5371.75
2,2,8965.40
3,3,4150.03
4,4,1709.73
...,...,...
13480,13480,1892.02
13481,13481,2591.79
13482,13482,2924.94
13483,13483,2224.25


In [25]:
# Write the (id, price) predictions to a CSV file; index=False keeps the
# DataFrame's row index out of the file.

submission_3.to_csv('./data/submission_3.csv', index=False)