In [36]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns   
import os


In [37]:
# Location of the diamond data set. The default is the original author's
# local path; set the DIAMOND_CSV environment variable to point at the file
# on another machine instead of editing this cell.
# (The previous bare os.getcwd() call was dropped: its return value was
# discarded, so it had no effect on where the file is read from.)
diamond_csv_path = os.environ.get(
    'DIAMOND_CSV',
    r'c:\Users\Agent Breslin\Desktop\AI-Python\Data_Set\diamond.csv',
)
data = pd.read_csv(diamond_csv_path)
# Show the first 11 rows as a quick sanity check of the loaded columns.
print(data.head(11))

    Unnamed: 0  carat colour clarity certification  price
0            1   0.30      D     VS2           GIA   1302
1            2   0.30      E     VS1           GIA   1510
2            3   0.30      G    VVS1           GIA   1510
3            4   0.30      G     VS1           GIA   1260
4            5   0.31      D     VS1           GIA   1641
5            6   0.31      E     VS1           GIA   1555
6            7   0.31      F     VS1           GIA   1427
7            8   0.31      G    VVS2           GIA   1427
8            9   0.31      H     VS2           GIA   1126
9           10   0.31      I     VS1           GIA   1126
10          11   0.32      F     VS1           GIA   1468


In [38]:
# Feature matrix: keep only the carat and price columns of the loaded frame.
feature_columns = ['carat', 'price']
x_list = data[feature_columns]
x = x_list  # `x` is the name the later cells use for the features
x

Unnamed: 0,carat,price
0,0.30,1302
1,0.30,1510
2,0.30,1510
3,0.30,1260
4,0.31,1641
...,...,...
303,1.01,8175
304,1.02,10796
305,1.06,9890
306,1.02,8959


In [39]:
# Boolean mask: True where the certification column equals 'GIA', False otherwise.
y = data['certification'] == 'GIA'
print(y)

0       True
1       True
2       True
3       True
4       True
       ...  
303    False
304    False
305    False
306    False
307    False
Name: certification, Length: 308, dtype: bool


In [40]:
# Encode the target as integers: 1 where certification is 'GIA', 0 otherwise.
# Multiplying the boolean mask by 1 turns True/False into 1/0.
is_gia = data['certification'] == 'GIA'
y = is_gia * 1
y

0      1
1      1
2      1
3      1
4      1
      ..
303    0
304    0
305    0
306    0
307    0
Name: certification, Length: 308, dtype: int32

In [41]:
# Split features (x) and labels (y) into training and test sets.
# test_size=0.3 holds out 30% of the rows for testing and leaves 70% for training.
# random_state is fixed at 50 so the same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=50,
)

In [42]:
# Decision tree using the Gini criterion, with its size capped through
# max_depth, max_leaf_nodes and min_samples_leaf; random_state is fixed at 42.
tree_settings = {
    'criterion': 'gini',
    'max_depth': 7,
    'max_leaf_nodes': 10,
    'min_samples_leaf': 10,
    'random_state': 42,
}
gini_model = DecisionTreeClassifier(**tree_settings)
# Fit on the training split only; the fitted estimator is the cell's output.
gini_model.fit(x_train, y_train)

DecisionTreeClassifier(max_depth=7, max_leaf_nodes=10, min_samples_leaf=10,
                       random_state=42)

In [43]:
# Evaluate the fitted tree: predict a class label (0 or 1) for every row of
# the held-out test features, which the model did not see during fitting.
predictions = gini_model.predict(
    x_test,
)
predictions


array([0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0])

In [44]:
# Estimated class probabilities for each test row. The result has one row per
# observation in x_test and two columns: the probability of class 0 and the
# probability of class 1.
probability = gini_model.predict_proba(
    x_test,
)
probability

array([[0.5106383 , 0.4893617 ],
       [0.34210526, 0.65789474],
       [0.83333333, 0.16666667],
       [0.18181818, 0.81818182],
       [0.83333333, 0.16666667],
       [0.07142857, 0.92857143],
       [0.18181818, 0.81818182],
       [0.83333333, 0.16666667],
       [1.        , 0.        ],
       [0.34210526, 0.65789474],
       [0.34210526, 0.65789474],
       [0.07142857, 0.92857143],
       [0.76923077, 0.23076923],
       [0.34210526, 0.65789474],
       [0.07142857, 0.92857143],
       [0.5       , 0.5       ],
       [0.83333333, 0.16666667],
       [0.18181818, 0.81818182],
       [0.2       , 0.8       ],
       [1.        , 0.        ],
       [0.34210526, 0.65789474],
       [0.18181818, 0.81818182],
       [0.83333333, 0.16666667],
       [0.83333333, 0.16666667],
       [1.        , 0.        ],
       [0.53846154, 0.46153846],
       [0.53846154, 0.46153846],
       [0.53846154, 0.46153846],
       [0.5106383 , 0.4893617 ],
       [0.53846154, 0.46153846],
       [0.

In [45]:
# classification_report compares the true outcomes (y_test) with the outcomes
# the model predicted (predictions) and summarises precision, recall,
# f1-score and support for each class.
report = classification_report(y_true=y_test, y_pred=predictions)
print(report)

              precision    recall  f1-score   support

           0       0.54      0.77      0.64        44
           1       0.67      0.41      0.51        49

    accuracy                           0.58        93
   macro avg       0.60      0.59      0.57        93
weighted avg       0.61      0.58      0.57        93



In [46]:
# Accuracy: the fraction of test rows whose predicted label matches the true
# label, printed as a percentage.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
accuracy_pct = accuracy * 100
print(accuracy_pct, "%")

58.06451612903226 %


In [47]:
# Precision for the positive class (label 1): of the rows predicted as 1,
# the share that are truly 1. Printed as a percentage.
precision = precision_score(y_true=y_test, y_pred=predictions)
precision_pct = precision * 100
print(precision_pct, "%")

66.66666666666666 %


In [48]:
# Confusion matrix with rows as true labels and columns as predicted labels.
# labels=[1, 0] orders both axes with class 1 first, then class 0.
label_order = [1, 0]
matrix = confusion_matrix(y_true=y_test, y_pred=predictions, labels=label_order)
print(matrix)

[[20 29]
 [10 34]]


In [49]:
# Tabulate the fitted tree's feature importances, one row per feature column
# of x (the single value column keeps pandas' default name, 0).
importances = gini_model.feature_importances_
x_influences = pd.DataFrame(data=importances, index=x.columns)
x_influences

Unnamed: 0,0
carat,0.674456
price,0.325544
