In [150]:
import pandas as pd

from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [106]:
# Read the diamonds CSV from the notebook's working directory, then show the
# first five rows as a quick sanity check.
df = pd.read_csv(filepath_or_buffer="./diamonds.csv")
df.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [107]:
# (rows, columns) of the freshly loaded frame.
df.shape

(53940, 10)

In [108]:
# Count missing values per column (`isna` is the canonical spelling of
# `isnull`; both return the same boolean mask).
df.isna().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

In [109]:
# Number of distinct clarity grades (missing values excluded, as by default).
df["clarity"].nunique(dropna=True)

8

In [110]:
# Number of distinct color grades (missing values excluded, as by default).
df["color"].nunique(dropna=True)

7

In [111]:
# One-hot encode both categorical feature columns in one call. The untouched
# columns stay first, followed by the color_* and then the clarity_* indicator
# columns; each original column name is used as the prefix.
df = pd.get_dummies(df, columns=["color", "clarity"])

In [112]:
# Preview the frame after one-hot encoding.
df.head(n=5)

Unnamed: 0,carat,cut,depth,table,price,x,y,z,color_D,color_E,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0.23,Ideal,61.5,55.0,326,3.95,3.98,2.43,False,True,...,False,False,False,False,False,True,False,False,False,False
1,0.21,Premium,59.8,61.0,326,3.89,3.84,2.31,False,True,...,False,False,False,False,True,False,False,False,False,False
2,0.23,Good,56.9,65.0,327,4.05,4.07,2.31,False,True,...,False,False,False,False,False,False,True,False,False,False
3,0.29,Premium,62.4,58.0,334,4.2,4.23,2.63,False,False,...,True,False,False,False,False,False,False,True,False,False
4,0.31,Good,63.3,58.0,335,4.34,4.35,2.75,False,False,...,False,True,False,False,False,True,False,False,False,False


In [113]:
# The cut grade is the classification target; every other column is a feature.
y = df["cut"]
X = df.drop(columns="cut")

In [114]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [115]:
# Baseline: a random forest on the one-hot encoded features.
# random_state is pinned (same seed as the train/test split) so the fitted
# forest, and therefore the reported score, is identical on every
# Restart & Run All instead of drifting between runs.
pipe = Pipeline([
    ("forest", RandomForestClassifier(random_state=42))
])

In [116]:
# Train the baseline pipeline on the training portion only.
pipe.fit(X=X_train, y=y_train)

In [117]:
# Score the baseline pipeline on the held-out rows.
pipe.score(X=X_test, y=y_test)

0.767797552836485

In [118]:
# Second experiment: reload the CSV into a separate frame so the one-hot
# encoded `df` above is left untouched, then preview it.
df2 = pd.read_csv(filepath_or_buffer="./diamonds.csv")
df2.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [119]:
# Single encoder instance; it is re-fitted for each column it is applied to,
# so it only remembers the classes of the column fitted last.
lb = LabelEncoder()

In [120]:
# Replace each categorical column with integer codes. The shared encoder is
# re-fitted per column; "cut" goes last, so afterwards `lb` holds the mapping
# for the target labels.
for categorical_col in ("color", "clarity", "cut"):
    df2[categorical_col] = lb.fit_transform(df2[categorical_col])

In [121]:
# Preview the frame after label encoding.
df2.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,2,1,3,61.5,55.0,326,3.95,3.98,2.43
1,0.21,3,1,2,59.8,61.0,326,3.89,3.84,2.31
2,0.23,1,1,4,56.9,65.0,327,4.05,4.07,2.31
3,0.29,3,5,5,62.4,58.0,334,4.2,4.23,2.63
4,0.31,1,6,3,63.3,58.0,335,4.34,4.35,2.75


In [122]:
# Scaler used for the continuous columns of df2.
mms = MinMaxScaler()

In [123]:
# Min-max scale the continuous columns listed below (carat is left as-is).
#
# The scaler is fitted on the training rows only and then applied to every
# row, so the minima/maxima of the held-out rows never influence the scaling.
# Fitting on the whole frame before splitting would leak test-set statistics
# into preprocessing. Held-out values may therefore fall slightly outside
# [0, 1], which is expected.
#
# train_test_split shuffles purely by row count and seed, so using the same
# test_size / random_state as the modelling split further down selects
# exactly the same training rows. Keep the two in sync.
scaled_cols = ["depth", "table", "price", "x", "y", "z"]
train_rows, holdout_rows = train_test_split(
    df2.index.to_numpy(), test_size=0.2, random_state=42
)
mms.fit(df2.loc[train_rows, scaled_cols])
df2[scaled_cols] = mms.transform(df2[scaled_cols])

In [124]:
# Preview the frame after scaling.
df2.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,2,1,3,0.513889,0.230769,0.0,0.367784,0.067572,0.076415
1,0.21,3,1,2,0.466667,0.346154,0.0,0.362197,0.065195,0.072642
2,0.23,1,1,4,0.386111,0.423077,5.4e-05,0.377095,0.0691,0.072642
3,0.29,3,5,5,0.538889,0.288462,0.000433,0.391061,0.071817,0.082704
4,0.31,1,6,3,0.563889,0.288462,0.000487,0.404097,0.073854,0.086478


In [131]:
# The encoded cut is the target; every other column of df2 is a feature.
y2 = df2["cut"]
X2 = df2.drop(columns="cut")

In [132]:
# Same 80/20 split and seed as in the first experiment.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X2,
    y2,
    random_state=42,
    test_size=0.2,
)

In [146]:
# Random forest on the label-encoded / scaled features.
# random_state is pinned (same seed as the train/test split) so the score is
# reproducible and directly comparable between runs and against the other
# seeded models in this notebook.
pipe2 = Pipeline([
    ("forest", RandomForestClassifier(random_state=42))
])

In [147]:
# Distinct integer codes present in the encoded target column.
df2["cut"].unique()

array([2, 3, 1, 4, 0])

In [148]:
# Train the second pipeline on its training portion.
pipe2.fit(X=X_train2, y=y_train2)

In [149]:
# Score the second pipeline on its held-out rows.
pipe2.score(X=X_test2, y=y_test2)

0.7886540600667408

In [151]:
from sklearn.neural_network import MLPClassifier

# Neural-network alternative: impute, standardise, then a three-hidden-layer
# MLP (150 -> 100 -> 50 units) trained with Adam and a fixed seed.
mlp = Pipeline(
    steps=[
        ("impute", SimpleImputer()),
        ("scaler", StandardScaler()),
        (
            "MLP",
            MLPClassifier(
                hidden_layer_sizes=(150, 100, 50),
                activation="relu",
                solver="adam",
                max_iter=300,
                random_state=42,
            ),
        ),
    ]
)

In [152]:
# Train the MLP pipeline on the same training rows as the second forest.
mlp.fit(X=X_train2, y=y_train2)



In [153]:
# Score the MLP pipeline on the same held-out rows as the second forest.
mlp.score(X=X_test2, y=y_test2)

0.7800333704115684