In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the raw diamonds table, give the x/y/z dimension columns descriptive
# names, and fix the column order so that the target (price) comes last.
# Any CSV column not listed in the reindex is dropped.
data = (
    pd.read_csv('diamonds.csv')
    .rename(columns={
        'x': 'diamond_length_mm',
        'y': 'diamond_width_mm',
        'z': 'diamond_depth_mm',
    })
    .reindex(columns=[
        'carat', 'cut', 'color', 'clarity', 'depth', 'table',
        'diamond_length_mm', 'diamond_width_mm', 'diamond_depth_mm',
        'price',
    ])
)

In [None]:
data.head(2)

In [None]:
data.shape

In [None]:
data.drop_duplicates(keep=False,inplace=True)
dataset=data.copy()

In [None]:
sns.displot(dataset['price'],kind='kde')

In [None]:
from scipy import stats
z_test=np.abs(stats.zscore(dataset['price']))
outlier_mask=z_test>2
data_no_out=dataset[~outlier_mask]

In [None]:
data_no_out.shape

In [None]:
sns.displot(data_no_out['price'],kind='kde')

In [None]:
ScalData=data_no_out.copy()

In [None]:
sns.heatmap(ScalData.corr(),annot=True, cmap="YlGnBu")

In [None]:
def _report_correlation(test, label, data1, data2, title):
    """Run a correlation test and print a short, fixed-format report.

    Args:
        test: callable taking (data1, data2) and returning a result that
            unpacks into (coefficient, p-value), e.g. ``stats.pearsonr``.
        label: name of the coefficient used in the printed line.
        data1, data2: the two samples handed to ``test``.
        title: text shown in the banner line that opens the report.

    Returns:
        None; everything is written to stdout.
    """
    banner = "-" * 15
    print(banner, title, banner)
    print()

    coef, p = test(data1, data2)
    print(f"{label} correlation coefficient: {coef:.3f}")
    print()

    # Significance at the 5% level; H0 is "the samples are uncorrelated".
    alpha = 0.05
    verdict = (
        "Samples are uncorrelated (fail to reject H0)"
        if p > alpha
        else "Samples are correlated (reject H0)"
    )
    print(f"{verdict} p={p:.3f}")
    print()


def spearman_correlation(data1, data2, title):
    """Print Spearman's rank correlation between two samples and its verdict.

    Args:
        data1, data2: samples passed to ``scipy.stats.spearmanr``.
        title: banner text printed above the result.
    """
    _report_correlation(stats.spearmanr, "Spearmans", data1, data2, title)


def pearson_correlation(data1, data2, title):
    """Print Pearson's linear correlation between two samples and its verdict.

    Args:
        data1, data2: samples passed to ``scipy.stats.pearsonr``.
        title: banner text printed above the result.
    """
    _report_correlation(stats.pearsonr, "Pearson", data1, data2, title)

In [None]:
# Numeric features: report the Pearson correlation of each one with price.
for feature in (
    "carat",
    "depth",
    "diamond_length_mm",
    "diamond_width_mm",
    "diamond_depth_mm",
):
    pearson_correlation(
        ScalData[feature],
        ScalData["price"],
        f"pearson Correlation : {feature}",
    )

In [None]:
# Categorical (ordinal) features vs. price.
# Spearman's coefficient is computed on ranks. Raw grade strings can at best be
# ranked alphabetically, which says nothing about quality, so each label is
# first replaced by its position on the grading scale.
# NOTE(review): the orders below assume the standard diamonds grading labels,
# listed worst -> best — confirm against the values present in diamonds.csv.
grade_orders = {
    "cut": ["Fair", "Good", "Very Good", "Premium", "Ideal"],
    "color": ["J", "I", "H", "G", "F", "E", "D"],
    "clarity": ["I1", "SI2", "SI1", "VS2", "VS1", "VVS2", "VVS1", "IF"],
}
for feature, order in grade_orders.items():
    # Fail loudly rather than silently ranking a label we have no position for.
    unexpected = set(ScalData[feature].dropna().unique()) - set(order)
    if unexpected:
        raise ValueError(f"Unexpected {feature} labels: {sorted(unexpected)}")
    grade_rank = ScalData[feature].map({label: pos for pos, label in enumerate(order)})
    spearman_correlation(
        data1=grade_rank,
        data2=ScalData["price"],
        title=f"Spearman correlation : {feature}",
    )

In [None]:
new_data=ScalData.drop(['depth'],axis=1,inplace=False)

In [None]:
from sklearn.preprocessing import LabelEncoder
LabelEnc=LabelEncoder()
new_data['cut']=LabelEnc.fit_transform(new_data['cut'])
new_data['color']=LabelEnc.fit_transform(new_data['color'])
new_data['clarity']=LabelEnc.fit_transform(new_data['clarity'])

In [None]:
new_data.head(1)

In [None]:
X=new_data.iloc[:,:-1]
y=new_data.iloc[:,-1]

In [None]:
#from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
#scaler=StandardScaler()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
#X_train=scaler.fit_transform(X_train)
#X_test=scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
reg=RandomForestRegressor()

In [None]:
reg.fit(X_train,y_train)

In [None]:
print(reg.coef_)
print(reg.intercept_)

In [None]:
reg.get_params()

In [None]:
reg_pred=reg.predict(X_test)
plt.scatter(y_test,reg_pred)

In [None]:
sns.displot(y_test-reg_pred,kind='kde')

In [None]:
residual=y_test-reg_pred
plt.scatter(reg_pred,residual)

In [None]:
from sklearn.metrics import mean_squared_error,mean_absolute_error
print('MAE: ',mean_absolute_error(y_test,reg_pred))
print('MSE: ',mean_squared_error(y_test,reg_pred))
print('RMSE: ',np.sqrt(mean_squared_error(y_test,reg_pred)))

In [None]:
from sklearn.metrics import r2_score
score=r2_score(y_test,reg_pred)
print(score)

In [None]:
adjusted_r2=1-(1-score)*(len(y_test)-1)/(len(y_test)-X_test.shape[1]-1)
adjusted_r2

In [None]:
new_data.head(1)

In [None]:
input_data = pd.DataFrame({
    'carat': [0.23, 0.21],
    'cut': ['2', '1'],
    'color': ['1', '1'],
    'clarity': ['3', '1'],
    'table': [55.0, 61.0],
    'diamond_length_mm': [3.95, 3.89],
    'diamond_width_mm': [3.98, 3.84],
    'diamond_depth_mm': [2.43, 2.31]
})


#input_data['cut']=LabelEnc.transform(input_data['cut'])
#input_data['color']=LabelEnc.transform(input_data['color'])
#input_data['clarity']=LabelEnc.transform(input_data['clarity'])

#input_data=scaler.transform(input_data)
#input_data=input_data.reshape(1,-1).shape



In [None]:
reg.predict(input_data)

In [None]:
# Smallest target value (price) among the training rows.
y_train.min()

In [None]:
# Column names of `data`: the renamed/reordered, de-duplicated frame, taken
# before outlier filtering, the depth drop and label encoding (those steps
# work on copies).
data.columns

In [None]:
# Number of rows per cut label in `data`.
data['cut'].value_counts()