In [35]:
import pandas as pd

df = pd.read_csv("diamonds.csv", index_col=0)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [45]:
df['cut'].unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

In [46]:
cut_class_dict = {"Fair": 1, "Good": 2, "Very Good": 3, "Premium": 4, "Ideal": 5}

In [47]:
df['clarity'].unique()

array(['SI2', 'SI1', 'VS1', 'VS2', 'VVS2', 'VVS1', 'I1', 'IF'],
      dtype=object)

FL,IF, VVS1, VVS2, VS1, VS2, SI1, SI2, I1, I2, I3 - Taken from the dataset page, this is ordered best to worst, so now we need this in a dict too.

We also have color. D is the best, J is the worst.

In [5]:
clarity_dict = {"I3": 1, "I2": 2, "I1": 3, "SI2": 4, "SI1": 5, "VS2": 6, "VS1": 7, "VVS2": 8, "VVS1": 9, "IF": 10, "FL": 11}
color_dict = {"J": 1,"I": 2,"H": 3,"G": 4,"F": 5,"E": 6,"D": 7}

In [48]:
df['cut'] = df['cut'].map(cut_class_dict)
df['clarity'] = df['clarity'].map(clarity_dict)
df['color'] = df['color'].map(color_dict)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,5,6,4,61.5,55.0,326,3.95,3.98,2.43
2,0.21,4,6,5,59.8,61.0,326,3.89,3.84,2.31
3,0.23,2,6,7,56.9,65.0,327,4.05,4.07,2.31
4,0.29,4,2,6,62.4,58.0,334,4.2,4.23,2.63
5,0.31,2,1,4,63.3,58.0,335,4.34,4.35,2.75


In [7]:
import sklearn
from sklearn.linear_model import SGDRegressor

df = sklearn.utils.shuffle(df) # always shuffle your data to avoid any biases that may emerge b/c of some order.

X = df.drop("price", axis=1).values
y = df["price"].values

In [8]:
test_size = 200

X_train = X[:-test_size]
y_train = y[:-test_size]

X_test = X[-test_size:]
y_test = y[-test_size:]

In [9]:
clf = SGDRegressor(max_iter=1000)
clf.fit(X_train, y_train)

print(clf.score(X_test, y_test))

-169716966.18255037


In [10]:
for X,y in list(zip(X_test, y_test))[:10]:
    print(clf.predict([X])[0], y)

22444137.136467457 3365
-43443171.014743805 842
27170870.383862972 5600
30423105.71565962 6468
-45202250.546970844 3360
-57564682.934447765 876
-78040747.2394104 4633
-45878871.67086029 3873
1726116.0675020218 1755
-62319117.010454655 456


In [11]:
from sklearn import svm

clf = svm.SVR()

clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

-0.09673774039928018


In [12]:
for X,y in list(zip(X_test, y_test))[:10]:
    print(clf.predict([X])[0], y)

2391.8871714277357 3365
2287.721993172211 842
2465.0369637934473 5600
2455.5107598729282 6468
2482.5199710167467 3360
2364.1066446235336 876
2449.7215837677586 4633
2440.2195393614793 3873
2403.696173119835 1755
2334.406910211517 456


In [13]:
clf = SGDRegressor(max_iter=10000)

clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

for X,y in list(zip(X_test, y_test))[:10]:
    print(clf.predict([X])[0], y)

-11266095.012137182
-28100216.207829475 3365
-6752871.735469699 842
-29022496.428314924 5600
-30273679.579220533 6468
-8136124.630202532 3360
-2888225.139208436 876
1858105.8247030973 4633
-6508417.491261959 3873
-20259675.351282597 1755
-753576.3754941225 456


improving models and scale data

In [14]:
import sklearn
from sklearn import svm, preprocessing

df = sklearn.utils.shuffle(df) 

X = df.drop("price", axis=1).values
X = preprocessing.scale(X)
y = df["price"].values

test_size = 200

X_train = X[:-test_size]
y_train = y[:-test_size]

X_test = X[-test_size:]
y_test = y[-test_size:]

clf = svm.SVR()

clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

for X,y in list(zip(X_test, y_test))[:10]:
    print(f"model predicts {clf.predict([X])[0]}, real value: {y}")

0.5689918450445994
model predicts 1092.5823243952013, real value: 612
model predicts 6704.744710626697, real value: 18656
model predicts 2940.98116841229, real value: 2398
model predicts 5271.756220864841, real value: 7339
model predicts 6356.498143421156, real value: 15600
model predicts 715.4851809094562, real value: 872
model predicts 708.4884214473241, real value: 945
model predicts 6432.3572834037395, real value: 13622
model predicts 4453.600641313859, real value: 4315
model predicts 4154.437850063783, real value: 14208


In [15]:
def prediction_function_wrapper(X_test):
  return clf.predict(X_test)

In [18]:
from giskard import GiskardClient

url = "http://34.83.122.220:19000" #if Giskard is installed locally (for installation, see: https://docs.giskard.ai/start/guides/installation) 
token = "eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJhZG1pbiIsInRva2VuX3R5cGUiOiJBUEkiLCJhdXRoIjoiUk9MRV9BRE1JTiIsImV4cCI6MTY4ODQ3NzU0Mn0.io9Wdxf5JTyGUUOIhXt69_w0wf6OXYiR8BIGgu2DZuA" #you can generate your API token in the Admin tab of the Giskard application (for installation, see: https://docs.giskard.ai/start/guides/installation) 
client = GiskardClient(url, token)

project = client.create_project("diamonds", "REGRESSION", "DESCRIPTION") #Choose the arguments you want. But "project_key" should be unique and in lower case
#If your project is already created use project = client.get_project("existing_project_key")

In [49]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,5,6,4,61.5,55.0,326,3.95,3.98,2.43
2,0.21,4,6,5,59.8,61.0,326,3.89,3.84,2.31
3,0.23,2,6,7,56.9,65.0,327,4.05,4.07,2.31
4,0.29,4,2,6,62.4,58.0,334,4.2,4.23,2.63
5,0.31,2,1,4,63.3,58.0,335,4.34,4.35,2.75


In [50]:
column_types={'carat': 'numeric', 'cut':'numeric', 'color':'numeric', 'clarity':'numeric', 'depth':'numeric',
              'table': 'numeric', 'x':'numeric','price':'numeric', 'y':'numeric', 'z':'numeric'}

In [54]:
# feature_types is used to declare the features the model is trained on
feature_types = {i:column_types[i] for i in column_types if i!='price'}

In [53]:
print(isinstance(df, pd.DataFrame))

True


In [55]:
df.head(1)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,5,6,4,61.5,55.0,326,3.95,3.98,2.43


In [57]:
project.upload_model_and_df(
    prediction_function=prediction_function_wrapper, # Python function which takes pandas dataframe as input and returns probabilities for classification model OR returns predictions for regression model
    model_type='regression', # "classification" for classification model OR "regression" for regression model
    df=df, # The dataset you want to use to inspect your model
    column_types=column_types, # A dictionary with columns names of df as key and types(category, numeric, text) of columns as values
    target='price', # The column name in df corresponding to the actual target variable (ground truth).
    feature_names=list(feature_types.keys()), # List of the feature names of prediction_function
    model_name='clf', # Name of the model
    dataset_name='diamonds.csv' # Name of the dataset
)



Dataset successfully uploaded to project key 'diamonds' with ID = 33. It is available at http://34.83.122.220:19000 
Model successfully uploaded to project key 'diamonds' with ID = 34. It is available at http://34.83.122.220:19000 


(34, 33)