In [1]:
# imports

from collections import Counter

import numpy as np
import pandas as pd

from imblearn.over_sampling import SMOTE
from sklearn import linear_model
from sklearn.datasets import make_classification
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler

In [12]:
# Read the three CSV inputs (training data, test data, sample submission)
# from the db/ folder, in that order.
diamonds_train, diamonds_test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        'db/diamonds.csv',
        'db/diamonds_test.csv',
        'db/sample_submission.csv',
    )
)

In [29]:
# Display the training frame (the rendered output elides the middle rows).
diamonds_train

Unnamed: 0.1,Unnamed: 0,depth,table,x,y,z,price,carat,cut,color,clarity,city
0,0,62.4,58.0,6.83,6.79,4.25,4268,1.21,Premium,J,VS2,Dubai
1,1,61.6,58.0,6.40,6.35,3.93,3513,1.02,Premium,J,VS2,Dubai
2,2,62.3,58.0,5.86,5.80,3.63,1792,0.77,Premium,J,VS2,Dubai
3,3,59.6,60.0,7.58,7.48,4.49,7553,1.51,Premium,J,VS2,Dubai
4,4,60.2,62.0,5.40,5.33,3.23,1176,0.57,Premium,J,VS2,Dubai
...,...,...,...,...,...,...,...,...,...,...,...,...
40450,40450,62.2,54.0,5.24,5.27,3.27,2729,0.54,Ideal,F,IF,Surat
40451,40451,61.9,54.0,5.22,5.25,3.24,2802,0.53,Ideal,F,IF,Surat
40452,40452,62.3,55.0,4.30,4.34,2.69,886,0.30,Ideal,F,IF,Surat
40453,40453,60.9,55.0,4.15,4.23,2.55,768,0.26,Ideal,F,IF,Surat


In [5]:
# Print column dtypes and non-null counts to see which features are
# categorical (object dtype) and which are numerical.
diamonds_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  40455 non-null  int64  
 1   depth       40455 non-null  float64
 2   table       40455 non-null  float64
 3   x           40455 non-null  float64
 4   y           40455 non-null  float64
 5   z           40455 non-null  float64
 6   price       40455 non-null  int64  
 7   carat       40455 non-null  float64
 8   cut         40455 non-null  object 
 9   color       40455 non-null  object 
 10  clarity     40455 non-null  object 
 11  city        40455 non-null  object 
dtypes: float64(6), int64(2), object(4)
memory usage: 3.7+ MB


In [8]:
# categorical features
cols = ['cut', 'color', 'clarity', 'city']

# Distinct values observed in the training data, keyed by column name
# (dict insertion order follows `cols`).
distinct_values = {name: diamonds_train[name].unique() for name in cols}

# One record per categorical column: its name, how many distinct values it
# takes, and the values themselves.
cat_list = [
    {
        "categorical_variable": name,
        "number_of_possible_values": len(levels),
        "values": levels,
    }
    for name, levels in distinct_values.items()
]

# Summary table, highest-cardinality column first, with a fresh 0..k-1 index.
categories = pd.DataFrame(cat_list)
categories = categories.sort_values(by="number_of_possible_values", ascending=False)
categories = categories.reset_index(drop=True)
categories

Unnamed: 0,categorical_variable,number_of_possible_values,values
0,city,13,"[Dubai, Luxembourg, New York City, Antwerp, Ma..."
1,clarity,8,"[VS2, VVS2, SI1, VS1, SI2, I1, VVS1, IF]"
2,color,7,"[J, E, I, G, D, H, F]"
3,cut,5,"[Premium, Very Good, Fair, Good, Ideal]"


In [16]:
# Column roles used by the rest of the notebook.
target = "price"
cat_features = ['cut','color','clarity','city']
num_features = ['carat','depth','table','x','y','z']

# Store the categorical columns with pandas' category dtype. One whole-frame
# conversion per dataset is sufficient; no per-column loop is needed.
diamonds_train[cat_features] = diamonds_train[cat_features].astype('category')
diamonds_test[cat_features] = diamonds_test[cat_features].astype('category')


def _build_feature_frame(frame):
    """Return one-hot encoded categorical columns followed by the numeric columns.

    Parameters
    ----------
    frame : pd.DataFrame
        Must contain every column named in ``cat_features`` and
        ``num_features``.

    Returns
    -------
    pd.DataFrame
        The ``pd.get_dummies`` columns for ``cat_features`` first, then the
        ``num_features`` columns, sharing ``frame``'s index.
    """
    dummies = pd.get_dummies(frame[cat_features])
    numeric = frame.loc[:, num_features]
    return pd.concat([dummies, numeric], axis=1)


train_df = _build_feature_frame(diamonds_train)

# The training frame defines the feature set: these are the columns later
# selected from train_df, so they must be derived from train_df itself.
features = list(train_df.columns)

# Encode the test data separately, then force it into the training layout:
# a dummy column missing from the test data is added as all zeros, a column
# that only exists in the test data is dropped, and column order follows
# `features`.
test_df = _build_feature_frame(diamonds_test).reindex(columns=features, fill_value=0)

In [19]:
# Display the encoded training features (dummy columns followed by numeric columns).
train_df

Unnamed: 0,cut_Fair,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_D,color_E,color_F,color_G,color_H,...,city_Paris,city_Surat,city_Tel Aviv,city_Zurich,carat,depth,table,x,y,z
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1.21,62.4,58.0,6.83,6.79,4.25
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1.02,61.6,58.0,6.40,6.35,3.93
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0.77,62.3,58.0,5.86,5.80,3.63
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1.51,59.6,60.0,7.58,7.48,4.49
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0.57,60.2,62.0,5.40,5.33,3.23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,0,0,1,0,0,0,0,1,0,0,...,0,1,0,0,0.54,62.2,54.0,5.24,5.27,3.27
40451,0,0,1,0,0,0,0,1,0,0,...,0,1,0,0,0.53,61.9,54.0,5.22,5.25,3.24
40452,0,0,1,0,0,0,0,1,0,0,...,0,1,0,0,0.30,62.3,55.0,4.30,4.34,2.69
40453,0,0,1,0,0,0,0,1,0,0,...,0,1,0,0,0.26,60.9,55.0,4.15,4.23,2.55


In [24]:
# Number of model input columns (entries in `features`).
n = len(features)
n

39

In [33]:
# Standardise every feature column with StandardScaler and wrap the resulting
# array in a DataFrame that carries default integer row and column labels.
# NOTE(review): the scaler is fitted on all training rows before the later
# train/validation split, so the validation rows influence the scaling
# statistics — consider fitting on the training split only.
scaler = StandardScaler()
feature_matrix = train_df[features].values
scaled_data = scaler.fit(feature_matrix).transform(feature_matrix)
scaled_data_df = pd.DataFrame(scaled_data)
scaled_data_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,29,30,31,32,33,34,35,36,37,38
0,-0.176113,-0.315531,-0.818095,1.715512,-0.538534,-0.377628,-0.470211,-0.464401,-0.514022,-0.428551,...,-0.258284,-0.378653,-0.25692,-0.256482,0.867006,0.452019,0.247981,0.978807,0.921985,1.022657
1,-0.176113,-0.315531,-0.818095,1.715512,-0.538534,-0.377628,-0.470211,-0.464401,-0.514022,-0.428551,...,-0.258284,-0.378653,-0.25692,-0.256482,0.467458,-0.106755,0.247981,0.596394,0.538254,0.563582
2,-0.176113,-0.315531,-0.818095,1.715512,-0.538534,-0.377628,-0.470211,-0.464401,-0.514022,-0.428551,...,-0.258284,-0.378653,-0.25692,-0.256482,-0.058262,0.382172,0.247981,0.116154,0.058590,0.133198
3,-0.176113,-0.315531,-0.818095,1.715512,-0.538534,-0.377628,-0.470211,-0.464401,-0.514022,-0.428551,...,-0.258284,-0.378653,-0.25692,-0.256482,1.497870,-1.503688,1.143433,1.645806,1.523746,1.366964
4,-0.176113,-0.315531,-0.818095,1.715512,-0.538534,-0.377628,-0.470211,-0.464401,-0.514022,-0.428551,...,-0.258284,-0.378653,-0.25692,-0.256482,-0.478838,-1.084608,2.038886,-0.292939,-0.351305,-0.440646
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,-0.176113,-0.315531,1.222351,-0.582916,-0.538534,-0.377628,-0.470211,2.153313,-0.514022,-0.428551,...,-0.258284,2.640941,-0.25692,-0.256482,-0.541924,0.312325,-1.542924,-0.435232,-0.403632,-0.383262
40451,-0.176113,-0.315531,1.222351,-0.582916,-0.538534,-0.377628,-0.470211,2.153313,-0.514022,-0.428551,...,-0.258284,2.640941,-0.25692,-0.256482,-0.562953,0.102785,-1.542924,-0.453019,-0.421074,-0.426300
40452,-0.176113,-0.315531,1.222351,-0.582916,-0.538534,-0.377628,-0.470211,2.153313,-0.514022,-0.428551,...,-0.258284,2.640941,-0.25692,-0.256482,-1.046615,0.382172,-1.095198,-1.271205,-1.214700,-1.215336
40453,-0.176113,-0.315531,1.222351,-0.582916,-0.538534,-0.377628,-0.470211,2.153313,-0.514022,-0.428551,...,-0.258284,2.640941,-0.25692,-0.256482,-1.130730,-0.595681,-1.095198,-1.404604,-1.310633,-1.416182


In [34]:
# Hold out 20% of the scaled rows (with their prices) for validation; the
# fixed random_state keeps the split reproducible across runs.
# train_test_split is imported in the imports cell at the top of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_data_df,
    diamonds_train['price'],
    test_size=0.2,
    random_state=42,
)

In [43]:
# Display the training split; the row labels are the original positions in scaled_data_df.
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,29,30,31,32,33,34,35,36,37,38
32121,-0.176113,-0.315531,-0.818095,1.715512,-0.538534,-0.377628,2.126703,-0.464401,-0.514022,-0.428551,...,-0.258284,-0.378653,-0.256920,-0.256482,-0.794270,-1.154455,0.247981,-0.728712,-0.752478,-0.856683
9831,-0.176113,-0.315531,-0.818095,1.715512,-0.538534,-0.377628,-0.470211,-0.464401,1.945442,-0.428551,...,-0.258284,-0.378653,-0.256920,-0.256482,2.149762,0.661559,0.247981,1.796993,1.698169,1.869078
33128,-0.176113,-0.315531,-0.818095,1.715512,-0.538534,2.648108,-0.470211,-0.464401,-0.514022,-0.428551,...,-0.258284,-0.378653,-0.256920,-0.256482,-0.626039,0.801252,1.591160,-0.621992,-0.673988,-0.569761
6199,-0.176113,-0.315531,-0.818095,1.715512,-0.538534,-0.377628,-0.470211,-0.464401,1.945442,-0.428551,...,-0.258284,-0.378653,-0.256920,-0.256482,-0.184434,0.172632,1.143433,-0.043926,-0.080949,-0.038955
19661,-0.176113,3.169263,-0.818095,-0.582916,-0.538534,-0.377628,2.126703,-0.464401,-0.514022,-0.428551,...,-0.258284,-0.378653,-0.256920,-0.256482,0.446430,-1.713228,2.934338,0.658647,0.660350,0.434467
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,-0.176113,-0.315531,1.222351,-0.582916,-0.538534,-0.377628,-0.470211,-0.464401,1.945442,-0.428551,...,-0.258284,-0.378653,-0.256920,-0.256482,1.855359,-0.106755,-1.095198,1.743633,1.663284,1.696925
11284,-0.176113,-0.315531,-0.818095,-0.582916,1.856892,-0.377628,-0.470211,2.153313,-0.514022,-0.428551,...,-0.258284,-0.378653,-0.256920,-0.256482,0.425401,-1.363994,0.247981,0.676434,0.695235,0.506197
38158,-0.176113,-0.315531,1.222351,-0.582916,-0.538534,-0.377628,-0.470211,2.153313,-0.514022,-0.428551,...,-0.258284,2.640941,-0.256920,-0.256482,-0.773241,-0.316295,-0.647472,-0.773178,-0.726315,-0.784953
860,-0.176113,-0.315531,-0.818095,-0.582916,1.856892,-0.377628,2.126703,-0.464401,-0.514022,-0.428551,...,-0.258284,-0.378653,-0.256920,-0.256482,0.046882,1.010792,-0.199745,0.169514,0.206850,0.319698


In [37]:
# Fit a Lasso regression with default hyperparameters on the training split.
# linear_model is imported in the imports cell at the top of the notebook.
model = linear_model.Lasso()
model.fit(X_train, y_train)

Lasso()

In [38]:
# Predict prices for X_test with the fitted model.
# NOTE(review): X_test is the hold-out part of the training data produced by
# train_test_split, not the separately loaded diamonds_test — confirm that
# this is the data the predictions are meant for.
predictions = model.predict(X_test)

Int64Index([17775, 13506,  4325, 37870, 21321, 18798, 22799, 17668, 31017,
            24809,
            ...
            14163, 14934, 25699, 36753,  6016,  3781, 26959, 15529, 36333,
            31606],
           dtype='int64', length=8091)

In [51]:
# Single-column frame holding the row labels of X_test, re-indexed from 0.
diamond_ID = pd.DataFrame(X_test.index)

In [52]:
# Wrap the prediction array in a DataFrame (default integer row and column labels).
prediction_df = pd.DataFrame(predictions) 

In [53]:
# Place the row identifiers and the predictions side by side. Both inputs
# carry a default 0..n-1 index, so the axis=1 concatenation pairs them by
# position.
submission = pd.concat([diamond_ID, prediction_df], axis=1)

# Both inputs label their only column 0, which would leave the result with
# two columns both named 0. Give them distinct names instead.
# NOTE(review): 'id' is an assumed header ('price' matches the target column)
# — confirm both against the columns of sample_submission.
# NOTE(review): the identifiers and predictions come from the hold-out split
# of the training data (X_test), not from diamonds_test — confirm intent.
submission.columns = ['id', 'price']