In [34]:
import pandas as pd 

In [35]:
df = pd.read_csv("/home/marwane/mlops-projects/first-end-to-end-mlops-project/data/playground-series-s3e8/train.csv")


### Dataset Description

The dataset for this competition (both train and test) was generated from a deep learning model trained on the Gemstone Price Prediction dataset. Feature distributions are close to, but not exactly the same, as the original. Feel free to use the original dataset as part of this competition, both to explore differences as well as to see whether incorporating the original in training improves model performance.

### Files

    train.csv - the training dataset; price is the target
    test.csv - the test dataset; your objective is to predict price
    sample_submission.csv - a sample submission file in the correct format

### Columns:

There are 10 independant variables (including id)

* id: unique identifier of each diamond

* carat: Carat(ct) refers to the unique of weight  exclusively to weigh gemstones and Diamonds

* cut: Quality of Diamond Cut

* Color: Color od Diamond

* clarity: Diamond clarity is a mesure .. clarity of the stone, graded by the visibility of these chrateristics under 10-power magnification.

* depth: The depth of Diamond is its heights (in millimeter) from the culet(bottom tip) to the table (flat, top surface)

* table: A Diamond's table is the facet which can be seen when the stone is viewed face up.

* x: Diamond X dimension

* y: Diamond Y dimension

* z: Diamond Z dimension

### Target:
price: Price of the given Diamond

In [36]:
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [37]:
df["cut"].value_counts()

cut
Ideal        92454
Premium      49910
Very Good    37566
Good         11622
Fair          2021
Name: count, dtype: int64

In [38]:
df["color"].value_counts()

color
G    44391
E    35869
F    34258
H    30799
D    24286
I    17514
J     6456
Name: count, dtype: int64

In [39]:
df.sample(10)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
155127,155127,0.9,Very Good,E,SI1,62.8,58.0,6.12,6.15,3.85,4119
43626,43626,0.71,Ideal,E,SI2,61.2,57.0,5.75,5.79,3.53,2298
101195,101195,0.4,Ideal,E,VS1,61.7,57.0,4.77,4.79,2.95,1125
33518,33518,2.02,Very Good,D,SI2,62.6,56.0,8.05,8.0,5.04,17953
32212,32212,0.42,Good,F,SI1,64.0,57.0,4.75,4.72,3.03,926
27884,27884,0.32,Ideal,F,IF,61.6,55.0,4.4,4.45,2.73,944
179381,179381,0.38,Premium,G,VS1,62.2,56.0,4.64,4.62,2.88,833
110601,110601,0.91,Very Good,I,VVS2,63.4,58.0,6.15,6.13,3.9,4256
52076,52076,0.77,Ideal,E,VS2,61.3,56.0,5.94,5.9,3.63,3544
158808,158808,1.0,Good,F,VS2,63.3,58.0,6.31,6.38,4.02,6250


In [40]:
df.isnull().sum()

id         0
carat      0
cut        0
color      0
clarity    0
depth      0
table      0
x          0
y          0
z          0
price      0
dtype: int64

In [41]:
df.shape

(193573, 11)

In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193573 entries, 0 to 193572
Data columns (total 11 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   id       193573 non-null  int64  
 1   carat    193573 non-null  float64
 2   cut      193573 non-null  object 
 3   color    193573 non-null  object 
 4   clarity  193573 non-null  object 
 5   depth    193573 non-null  float64
 6   table    193573 non-null  float64
 7   x        193573 non-null  float64
 8   y        193573 non-null  float64
 9   z        193573 non-null  float64
 10  price    193573 non-null  int64  
dtypes: float64(6), int64(2), object(3)
memory usage: 16.2+ MB


In [43]:
df.columns

Index(['id', 'carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y',
       'z', 'price'],
      dtype='object')

In [44]:
df.drop(["id"], axis=1, inplace=True)

In [45]:
df.duplicated().sum()

0

In [47]:
# categorical features
cat_columns = df.columns[df.dtypes=="object"]

In [48]:
num_columns = df.columns[df.dtypes!="object"]

In [50]:
df[num_columns].info

<bound method DataFrame.info of         carat  depth  table     x     y     z  price
0        1.52   62.2   58.0  7.27  7.33  4.55  13619
1        2.03   62.0   58.0  8.06  8.12  5.05  13387
2        0.70   61.2   57.0  5.69  5.73  3.50   2772
3        0.32   61.6   56.0  4.38  4.41  2.71    666
4        1.70   62.6   59.0  7.65  7.61  4.77  14453
...       ...    ...    ...   ...   ...   ...    ...
193568   0.31   61.1   56.0  4.35  4.39  2.67   1130
193569   0.70   60.3   58.0  5.75  5.77  3.47   2874
193570   0.73   63.1   57.0  5.72  5.75  3.62   3036
193571   0.34   62.9   55.0  4.45  4.49  2.81    681
193572   0.71   60.8   64.0  5.73  5.71  3.48   2258

[193573 rows x 7 columns]>

In [51]:
df[cat_columns]#.info

Unnamed: 0,cut,color,clarity
0,Premium,F,VS2
1,Very Good,J,SI2
2,Ideal,G,VS1
3,Ideal,G,VS1
4,Premium,G,VS2
...,...,...,...
193568,Ideal,D,VVS2
193569,Premium,G,VVS2
193570,Very Good,F,SI1
193571,Very Good,D,SI1
