In [None]:
! pip install matplotlib  
! pip install scikit-learn

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
data = pd.read_csv('datasets/diamonds.csv', index_col=0)
data.head()

In [None]:
data.shape

In [None]:
# this is a large dataset, so we will take a sample of 20% of the data
data = data.sample(frac=0.2)
data.shape

In [None]:
# reset panda index
data.reset_index(drop=True, inplace=True)
data.head()

In [None]:
# keep only the numeric columns for the correlation matrix
# Selecting by 'number' (instead of excluding 'object') also leaves out text columns
# that are stored with a string or category dtype rather than as generic objects.
data_numeric = data.select_dtypes(include='number')

# Compute the pairwise correlation matrix of the numeric columns
correlation = data_numeric.corr()
correlation

In [None]:
# Draw the correlation matrix as an annotated heatmap on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation, annot=True, cmap='coolwarm', ax=ax)
plt.show()

In [None]:
data['cut'].unique()


In [None]:
data['color'].unique()

In [None]:
data['clarity'].unique()

In [None]:
data.boxplot(column='price', by='cut', figsize=(10,8))

In [None]:
# use boxplot to compare one categorical variable and one numerical variable
data.boxplot(column='price', by='color', figsize=(10,8))

In [None]:
# A scatter plot shows how two numerical variables relate: carat on x, price on y.
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(data['carat'], data['price'])
ax.set_xlabel('Carat')
ax.set_ylabel('Price')
plt.show()

In [None]:
# use a dictionary to map the categorical variables to numerical variables if order matters
# (codes follow the order in which the dictionary lists the values: 'I1' -> 0 ... 'IF' -> 7)
clarity_dict = {'I1': 0, 'SI2': 1, 'SI1': 2, 'VS2': 3, 'VS1': 4, 'VVS2': 5, 'VVS1': 6, 'IF': 7}
clarity_encoded = data['clarity'].map(clarity_dict)
# .map() yields NaN for every value that is not a key of the dictionary -- including the
# already-encoded integers when this cell is run a second time -- so stop here rather
# than silently writing NaN into the column.
assert clarity_encoded.notna().all(), 'clarity holds values missing from clarity_dict (cell already run?)'
data['clarity'] = clarity_encoded
data.head()

In [None]:
# When the order of the categories does not matter, encode them with get_dummies:
# one 0/1 indicator column per distinct value (dtype=int requests integers directly).
one_hot_encoded_cut = pd.get_dummies(data['cut'], dtype=int)
one_hot_encoded_color = pd.get_dummies(data['color'], dtype=int)
# Replace the two original categorical columns with their indicator columns.
data = pd.concat(
    [data.drop(columns=['cut', 'color']), one_hot_encoded_cut, one_hot_encoded_color],
    axis=1,
)
data.head()

In [None]:
NUMERIC_FEATURES = ['carat', 'depth', 'table', 'x', 'y', 'z']
numeric_df = data[NUMERIC_FEATURES]
numeric_df.describe()

In [None]:
# scale the numerical features
from sklearn.preprocessing import scale
numeric_array = scale(numeric_df)
numeric_array[:5]

In [None]:
numeric_df = pd.DataFrame(numeric_array, columns=numeric_df.columns)
numeric_df.describe()

In [None]:
data.drop(NUMERIC_FEATURES, axis=1, inplace=True)
data = pd.concat([data, numeric_df], axis=1)
data.head()

In [None]:
data.to_csv('datasets/diamonds_processed.csv', index=False)

In [None]:
! dir datasets

In [None]:
from sklearn.model_selection import train_test_split

# Target is the price column; every remaining column is a predictor.
Y = data['price']
X = data.drop(columns='price')
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

In [None]:
from sklearn.linear_model import LinearRegression
linear_regression = LinearRegression()
linear_regression.fit(x_train, y_train)

In [None]:
y_prediction = linear_regression.predict(x_test)
y_prediction[:5]

In [None]:
train_score = linear_regression.score(x_train, y_train)
print('train score:', train_score)

In [None]:
from sklearn.metrics import r2_score
score = r2_score(y_test, y_prediction)
print('test score:', score)