In [1]:
# libraries
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Load the diamonds dataset from "diamonds.csv" (path is relative to the
# notebook's working directory) into the `jems` DataFrame used by the cells below.
jems = pd.read_csv("diamonds.csv")

In [3]:
# Preview the first rows of the DataFrame to see the columns and typical values.
jems.head()


Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [4]:
# How many rows and columns? `shape` is a (rows, columns) tuple.
jems.shape

(53940, 10)

In [5]:
# What are the column names? Returned as a pandas Index of labels.
jems.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')

In [6]:
# Show quick summary statistics (count, mean, std, min, quartiles, max) for the
# numerical columns.
# NOTE(review): in the output below, x, y and z have a minimum of 0.0 and y/z have
# maxima far above their 75th percentiles - possibly invalid measurements; verify
# before using these columns.
jems.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [7]:
# What is the dtype of each column? `info()` also prints the non-null count per
# column and the DataFrame's memory usage.
jems.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  int64  
 7   x        53940 non-null  float64
 8   y        53940 non-null  float64
 9   z        53940 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


In [8]:
# Show the last 10 rows of the dataset.
jems.tail(10)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
53930,0.71,Premium,E,SI1,60.5,55.0,2756,5.79,5.74,3.49
53931,0.71,Premium,F,SI1,59.8,62.0,2756,5.74,5.73,3.43
53932,0.7,Very Good,E,VS2,60.5,59.0,2757,5.71,5.76,3.47
53933,0.7,Very Good,E,VS2,61.2,59.0,2757,5.69,5.72,3.49
53934,0.72,Premium,D,SI1,62.7,59.0,2757,5.69,5.73,3.58
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.5
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.7,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74
53939,0.75,Ideal,D,SI2,62.2,55.0,2757,5.83,5.87,3.64


In [9]:
# Show 5 randomly chosen observations from the dataset.
# The seed is fixed so that "Restart Kernel -> Run All" draws the same rows on
# every run; change or drop `random_state` to get a different draw.
jems.sample(5, random_state=42)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1702,0.78,Ideal,D,SI1,62.4,57.0,3035,5.87,5.89,3.67
42425,0.51,Very Good,F,SI1,63.1,54.0,1312,5.12,5.06,3.21
20524,1.52,Premium,I,SI1,58.1,62.0,8852,7.61,7.56,4.41
48565,0.5,Good,G,VVS2,64.4,57.0,1991,4.97,5.05,3.22
44842,0.56,Ideal,H,VS2,61.3,56.0,1625,5.32,5.27,3.25


In [10]:
# How many diamonds with a clarity of category "IF" are present in the dataset?
# Count matches with a boolean mask rather than `value_counts().IF`: attribute
# access raises AttributeError when the label is absent and can be shadowed by
# Series attributes, whereas the mask simply yields 0 for a missing category.
(jems['clarity'] == 'IF').sum()

1790

In [11]:
# What fraction of the diamonds have a clarity of "IF"?
# Numerator: rows whose clarity equals "IF" (boolean mask, so a missing
# category gives 0 instead of raising AttributeError).
# Denominator: number of non-null clarity values (`count()` skips NaN).
(jems['clarity'] == 'IF').sum() / jems['clarity'].count()

0.03318502039302929

In [12]:
# What is the cheapest diamond price overall? (Minimum of the `price` column.)
jems['price'].min()

326

In [13]:
# What is the range of diamond prices, as a (lowest, highest) tuple?
# Use the vectorized Series methods instead of the Python builtins: they avoid
# a Python-level loop over every row and skip NaN values by default.
(jems['price'].min(), jems['price'].max())

(326, 18823)

In [14]:
# Average diamond price for every (cut, color) combination.
# The result is a DataFrame indexed by cut and color with a single "mean" column.
(
    jems
    .groupby(by=['cut', 'color'])['price']
    .agg(['mean'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,mean
cut,color,Unnamed: 2_level_1
Fair,D,4291.06135
Fair,E,3682.3125
Fair,F,3827.003205
Fair,G,4239.254777
Fair,H,5135.683168
Fair,I,4685.445714
Fair,J,4975.655462
Good,D,3405.382175
Good,E,3423.644159
Good,F,3495.750275
