### Diamonds dataset exploration (cont'd)

In [1]:
#Importing the necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score, make_scorer 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn import linear_model

In [2]:
!kaggle competitions download -c diamonds-datamad1020-rev

diamonds-datamad1020-rev.zip: Skipping, found more recently modified local copy (use --force to force download)


In [3]:
!tar -xzvf diamonds-datamad1020-rev.zip

x sample_submission.csv
x test.csv
x train.csv


In [4]:
test_data = pd.read_csv('test.csv')
test_data.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,0.33,Ideal,H,IF,61.9,55.0,4.44,4.42,2.74
1,1,0.41,Ideal,E,VS2,61.8,54.0,4.79,4.76,2.95
2,2,0.91,Very Good,E,SI2,62.5,59.0,6.16,6.23,3.87
3,3,0.42,Very Good,G,VS2,62.6,57.0,4.76,4.8,2.99
4,4,0.54,Ideal,G,IF,61.5,56.0,5.28,5.25,3.24


In [5]:
train_data = pd.read_csv('train.csv')
train_data.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,0.53,Very Good,G,SI1,63.4,54.0,5.09,5.13,3.24,7.057
1,1,0.41,Ideal,D,SI1,63.0,56.0,4.8,4.75,3.01,6.824
2,2,0.32,Ideal,I,VS2,61.6,56.0,4.37,4.39,2.7,6.107
3,3,0.31,Ideal,H,VVS2,61.2,56.0,4.34,4.37,2.66,6.39
4,4,1.35,Premium,J,VS2,60.5,56.0,7.19,7.12,4.33,8.741


In [6]:
#Let's check for collinearity issues
#Created a code to evaluate the collinearity between columns
corr_matrix = train_data.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.90)]
to_drop

['x', 'y', 'z', 'price']

In [7]:
#Let's drop x, y, z columns. 
train_data.drop(['x','y','z'], axis = 1, inplace = True)
test_data.drop(['x','y','z'], axis = 1, inplace = True)

In [8]:
#Now, we will analyze numerical and categorical data - train data
numerical_cols = train_data.select_dtypes(include=np.number).columns.to_list()
categorical_cols = train_data.select_dtypes(exclude=np.number).columns.to_list()

In [9]:
#Now, we will analyze numerical and categorical data - test data
numerical_cols = test_data.select_dtypes(include=np.number).columns.to_list()
categorical_cols = test_data.select_dtypes(exclude=np.number).columns.to_list()

In [10]:
#Label encoding as performed in the 'Exploring Diamonds Dataset notebook'
from sklearn import preprocessing 
label_encoder = preprocessing.LabelEncoder() 

train_data['cut'] = label_encoder.fit_transform(train_data['cut'])
train_data['color'] = label_encoder.fit_transform(train_data['color'])
train_data['clarity'] = label_encoder.fit_transform(train_data['clarity'])
train_data.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,price
0,0,0.53,4,3,2,63.4,54.0,7.057
1,1,0.41,2,0,2,63.0,56.0,6.824
2,2,0.32,2,5,5,61.6,56.0,6.107
3,3,0.31,2,4,7,61.2,56.0,6.39
4,4,1.35,3,6,5,60.5,56.0,8.741


In [11]:
test_data['cut'] = label_encoder.fit_transform(test_data['cut'])
test_data['color'] = label_encoder.fit_transform(test_data['color'])
test_data['clarity'] = label_encoder.fit_transform(test_data['clarity'])
test_data.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table
0,0,0.33,2,4,1,61.9,55.0
1,1,0.41,2,1,5,61.8,54.0
2,2,0.91,4,1,3,62.5,59.0
3,3,0.42,4,3,5,62.6,57.0
4,4,0.54,2,3,1,61.5,56.0


In [12]:
X = train_data.drop(['price'],1)
y = train_data['price']

In [13]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=0)

### KNeighbors Regression

In [43]:
score = []
for k in range(1,20):   # running for different K values to know which yields the max accuracy. 
    model = KNeighborsRegressor(n_neighbors = k,  weights = 'distance', p=1)
    model.fit(X_train, y_train)
    score.append(model.score(X_test, y_test ))    

In [48]:
#Calculating max score 
k_max = score.index(max(score))+1
print( "At K = {}, Max Accuracy = {}".format(k_max, max(score)*100))

At K = 19, Max Accuracy = 1.6939594253171375


In [45]:
model_max = KNeighborsRegressor(n_neighbors = k_max,  weights = 'distance', p=1)
model_max.fit(X_train, y_train)
print(model_max.score(X_test, y_test ))   

0.016939594253171375


In [46]:
y_pred = model_max.predict(X_test)

In [47]:
mse = mean_squared_error(pred,y_test)
rmse = np.sqrt(mean_squared_error(pred,y_test))
mse,rmse

(1.125726366620252, 1.0610025290357474)

### ---> Tuning Parameters - KNeighbors Regression + Grid Search CV

In [17]:
ptmodel = KNeighborsRegressor()
ptrange = np.arange(1,11)
ptparams = ({'n_neighbors': ptrange})
gridsearch = GridSearchCV(ptmodel,ptparams,cv=10)

In [18]:
gridsearch.fit(X_train,y_train)
gridsearch.best_params_

{'n_neighbors': 10}

In [19]:
model = KNeighborsRegressor(n_neighbors=10)
model.fit(X_train,y_train)

KNeighborsRegressor(n_neighbors=10)

In [20]:
pred = model.predict(X_test)
mse = mean_squared_error(pred,y_test)
rmse = np.sqrt(mean_squared_error(pred,y_test))
mse

1.125726366620252

In [21]:
model.score(X_train,y_train)

0.11517672088606279

### Submitting Score to Kaggle Competition (Prediction 2 - KNN Neighbors)

In [22]:
prediction2 = model.predict(test_data)

In [23]:
len(prediction2)

13485

In [24]:
KNN_df = pd.DataFrame(prediction2, columns = ['price'])

In [25]:
KNN_df['id'] = KNN_df.index

In [26]:
KNN_df = KNN_df[['id', 'price']]

In [27]:
KNN_df.to_csv(r'KNN_df.csv', index=False)

### Lasso and Ridge Regression

In [28]:
lr=LinearRegression()
lasso = linear_model.Lasso(alpha=0.1)
ridge = linear_model.Lasso(alpha=0.1)

In [29]:
lr.fit(X_train,y_train)
lasso.fit(X_train,y_train)
ridge.fit(X_train,y_train)

Lasso(alpha=0.1)

In [30]:
#Printing scores
print("Linear Regression:",lr.score(X_test,y_test))
print("Lasso Regression:",lasso.score(X_test,y_test))
print("Ridge Regression:",ridge.score(X_test,y_test))

Linear Regression: 0.8668308254311459
Lasso Regression: 0.8042314709085665
Ridge Regression: 0.8042314709085665


In [31]:
lasso.score(X_train,y_train)

0.8048189694526161

In [32]:
ridge.score(X_train,y_train)

0.8048189694526161

### Submitting Score to Kaggle Competition (Prediction 3 - Lasso Regression)

In [33]:
prediction3 = lasso.predict(test_data)

In [34]:
Lasso_df = pd.DataFrame(prediction3, columns = ['price'])

In [35]:
Lasso_df['id'] = Lasso_df.index

In [36]:
Lasso_df = Lasso_df[['id', 'price']]

In [37]:
Lasso_df.to_csv(r'Lasso_df.csv', index=False)

### Improvement Attempts - Lasso / Ridge Regression Models

In [38]:
#Trying out other alphas in ridge regression & using RidgeCV
alphas = [.01,.1,1,10,100,1000,10000]
ridge = RidgeCV(alphas = alphas, cv = 5)
ridge_fit = ridge.fit(X_train, y_train)
ypred_ridge = ridge_fit.predict(X_test)
ridge.score(X_train,y_train)

0.8701040063806589

In [39]:
print("MSE: {}".format(mean_squared_error((y_test),(ypred_ridge))))

MSE: 0.13799095519959106
