In [1]:
%matplotlib inline

from pathlib import Path
from sklearn.metrics import auc
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree, export_text
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
import matplotlib.pylab as plt
from dmba import plotDecisionTree, classificationSummary, regressionSummary

no display found. Using non-interactive Agg backend


In [2]:
# Working directory:
#
# We assume that data are kept in the same directory as the notebook. If you keep your 
# data in a different folder, replace the argument of the `Path`
DATA = Path('/Users/mattpeltier/Downloads/dmba/')

The goal is to use these data to build a model that will classify auctions as competitive ro non-competitive.  A competitive auction is defined as an auction with at least two bids placed on the item auctioned.  The data include variables that describe the item(auction category), the seller, and the auction terms that the seller selected auction duration, opening price, currency, day of week fo auction close.  The task is to predict whether or not the auction will be competitive.  

In [3]:
ebay_df = pd.read_csv(DATA / 'eBayAuctions.csv')
new_df = ebay_df

In [63]:
ebay_df.dtypes

Category          object
currency          object
sellerRating       int64
Duration        category
endDay            object
ClosePrice       float64
OpenPrice        float64
Competitive?       int64
dtype: object

9.1 DATA PREPROCESSING convert Duration to categorical then split
data into training and validation datasets
Converted variable duration into a categorical variable

In [5]:
 #convert to Duration to categorical
new_df['Duration'] = ebay_df['Duration'].astype('category')
#ebay_df['Duration']

In [6]:
new_df.dtypes

Category          object
currency          object
sellerRating       int64
Duration        category
endDay            object
ClosePrice       float64
OpenPrice        float64
Competitive?       int64
dtype: object

In [7]:
#checking for missing value 
new_df['Duration'].unique()

[5, 7, 1, 3, 10]
Categories (5, int64): [1, 3, 5, 7, 10]

In [8]:
new_df['Competitive?'].unique()

array([0, 1])

In [9]:
new_df['OpenPrice'].value_counts()

1.23      200
0.01      157
0.99      118
9.99      108
2.45      105
         ... 
24.77       1
5.95        1
5.52        1
26.99       1
999.00      1
Name: OpenPrice, Length: 291, dtype: int64

In [10]:
new_df['endDay'].unique()

array(['Mon', 'Tue', 'Fri', 'Thu', 'Sat', 'Sun', 'Wed'], dtype=object)

In [11]:
new_df['ClosePrice'].value_counts()

1.23      58
2.45      44
9.99      39
6.50      35
4.99      30
          ..
67.66      1
8.20       1
18.61      1
40.91      1
999.00     1
Name: ClosePrice, Length: 827, dtype: int64

In [12]:
columns= list(new_df.columns)
columns

['Category',
 'currency',
 'sellerRating',
 'Duration',
 'endDay',
 'ClosePrice',
 'OpenPrice',
 'Competitive?']

In [13]:
#change object values to binary values with get dummies
#ONE HOT CODING
df = pd.DataFrame(new_df)
df = pd.get_dummies(df, columns=["Category", "currency", "endDay"])
columns = list(df.columns)
columns.remove('ClosePrice')
columns.remove('endDay_Mon')
columns.remove('endDay_Fri')
columns.remove('endDay_Sat')
columns.remove('endDay_Sun')
columns.remove('endDay_Thu')
columns.remove('endDay_Tue')
columns.remove('endDay_Wed')
df = df[columns]
df

Unnamed: 0,sellerRating,Duration,OpenPrice,Competitive?,Category_Antique/Art/Craft,Category_Automotive,Category_Books,Category_Business/Industrial,Category_Clothing/Accessories,Category_Coins/Stamps,...,Category_Home/Garden,Category_Jewelry,Category_Music/Movie/Game,Category_Photography,Category_Pottery/Glass,Category_SportingGoods,Category_Toys/Hobbies,currency_EUR,currency_GBP,currency_US
0,3249,5,0.01,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
1,3249,5,0.01,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
2,3249,5,0.01,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
3,3249,5,0.01,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
4,3249,5,0.01,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1967,2992,5,359.95,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1968,21,5,300.00,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1969,1400,5,549.00,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1970,57,7,650.00,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [14]:

predictors = ['sellerRating', 'Duration','Category_Antique/Art/Craft', 'Category_Automotive', 'Category_Books',
       'Category_Business/Industrial', 'Category_Clothing/Accessories',
       'Category_Coins/Stamps', 'Category_Collectibles', 'Category_Computer',
       'Category_Electronics', 'Category_EverythingElse',
       'Category_Health/Beauty', 'Category_Home/Garden', 'Category_Jewelry',
       'Category_Music/Movie/Game', 'Category_Photography',
       'Category_Pottery/Glass', 'Category_SportingGoods',
       'Category_Toys/Hobbies', 'currency_EUR', 'currency_GBP', 'currency_US']
outcome = ['Competitive?']

# partition data
X = pd.get_dummies(df[predictors], drop_first=False)
y = df[outcome]



9.1 PREPROCESSING SPLIT THE DATA INTO TRAINING %60 AND VALIDATION %40 SETS

In [15]:
trainData, validData = train_test_split(df, test_size = .4, random_state = 1)
# Print the shapes of the training and validation sets to verify the split
print("Training set :", trainData.shape)
print("Validation set :", validData.shape)
trainDf = trainData
validDf = validData


Training set : (1183, 25)
Validation set : (789, 25)


9.1A- FIT CLASSIFICATION TREE USING ALL PREDICTORS TO AVOID OVERFITTING, SET THE MINIMUM NUMBER OF RECORDS IN A TERMINAL NODE TO 50 AND MAX TREE DEPTH TO 7.  WRITE DOWN RESUKTS IN TERMS OF RULES.

In [16]:
classTree = DecisionTreeClassifier(criterion="entropy", max_depth=7, min_samples_split=50)
classTree.fit(trainDf.drop(columns=['Competitive?']), trainDf['Competitive?'])
#print("Classes: {}".format(', '.join(classTree.classes_)))

DecisionTreeClassifier(criterion='entropy', max_depth=7, min_samples_split=50)

9.1A fit classification tree using all predictors to avoid overfitting set minimum # of records to 50
and maximum tree depth to 7, the results in terms of rules were to change all 'Duration' variables >2=1 and less <2=0o dummy variables 
represented by 0, 1 one hot coding was used to change category, endDay, and currency columns to dummy variables.
CLASSIFICATION TREE SHOWN

In [17]:
#Plot the decision tree
plt.figure(figsize=(150, 100))
tree.plot_tree(classTree)
plt.show()


  plt.show()


In [18]:

#SPLITTING THE DATA INTO DEPENDENT AND INDEPENDENT VARIABLES
#DURATION IS THE NUMBER OF BIDS, IF BIDS ATLEAST 2 
#THEN AUCTION IS DEFINED AS COMPETITIVE
X = trainDf.drop(columns= 'Competitive?')
y = trainDf['Competitive?']


In [19]:
X

Unnamed: 0,sellerRating,Duration,OpenPrice,Category_Antique/Art/Craft,Category_Automotive,Category_Books,Category_Business/Industrial,Category_Clothing/Accessories,Category_Coins/Stamps,Category_Collectibles,...,Category_Home/Garden,Category_Jewelry,Category_Music/Movie/Game,Category_Photography,Category_Pottery/Glass,Category_SportingGoods,Category_Toys/Hobbies,currency_EUR,currency_GBP,currency_US
503,578,10,2.45,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
733,2349,7,3.60,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
383,884,10,2.45,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
725,2349,7,3.60,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
310,104,7,1.23,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1791,2427,3,33.95,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1096,2046,5,7.50,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
1932,534,7,79.99,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
235,1853,10,1.23,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [20]:
y

503     1
733     1
383     1
725     1
310     1
       ..
1791    0
1096    0
1932    1
235     0
1061    0
Name: Competitive?, Length: 1183, dtype: int64

In [21]:
# Change values of 'Duration' column
y2 = y>1

y[y2]= 1
y.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y[y2]= 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


1    630
0    553
Name: Competitive?, dtype: int64

SPLITTING THE DATA INTO TRAINING AND VALIDATION SETS

In [22]:
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.4, random_state=1)

In [23]:
fullClassTree = DecisionTreeClassifier(random_state = 1)
fullClassTree.fit(train_X, train_y)
#plotDecisionTree(fullClassTree, feature_names=train_X.columns)
tree.plot_tree(fullClassTree)
plt.show()


  plt.show()


fit classification tree using all predictors

In [24]:
print(export_text(fullClassTree,
                  #feature_names=train_X.columns,
                  show_weights=True))

|--- feature_2 <= 3.72
|   |--- feature_2 <= 1.04
|   |   |--- feature_5 <= 0.50
|   |   |   |--- feature_0 <= 4363.50
|   |   |   |   |--- feature_2 <= 0.94
|   |   |   |   |   |--- feature_4 <= 0.50
|   |   |   |   |   |   |--- feature_0 <= 3120.00
|   |   |   |   |   |   |   |--- weights: [0.00, 13.00] class: 1
|   |   |   |   |   |   |--- feature_0 >  3120.00
|   |   |   |   |   |   |   |--- weights: [5.00, 40.00] class: 1
|   |   |   |   |   |--- feature_4 >  0.50
|   |   |   |   |   |   |--- weights: [1.00, 0.00] class: 0
|   |   |   |   |--- feature_2 >  0.94
|   |   |   |   |   |--- feature_0 <= 2457.50
|   |   |   |   |   |   |--- feature_16 <= 0.50
|   |   |   |   |   |   |   |--- weights: [0.00, 9.00] class: 1
|   |   |   |   |   |   |--- feature_16 >  0.50
|   |   |   |   |   |   |   |--- weights: [2.00, 3.00] class: 1
|   |   |   |   |   |--- feature_0 >  2457.50
|   |   |   |   |   |   |--- feature_19 <= 0.50
|   |   |   |   |   |   |   |--- feature_1 <= 5.00
|   |   |   

9.2B-  the accuracy of the training dataset is more accurate than the validation data set by %24.11
the training data set has %99.44 accuracy and the validation set has %75.32 accuracy the training set is practical for making a prediction on a new auction to be competitive.

9.1C the rules set up a correlation between duration and competitive? the bin created form competitive greater than 2 is successful made a more accurate decision tree node, columns the category, endDay, and currency columns only made the tree larger the data is not useful for prediction.

In [25]:
print("TRAINING SET")
classificationSummary(train_y, fullClassTree.predict(train_X))
print('')
print('VALIDATION SET')
classificationSummary(valid_y, fullClassTree.predict(valid_X))
.9944-.7532

TRAINING SET
Confusion Matrix (Accuracy 0.9281)

       Prediction
Actual   0   1
     0 309  17
     1  34 349

VALIDATION SET
Confusion Matrix (Accuracy 0.7004)

       Prediction
Actual   0   1
     0 148  79
     1  63 184


0.24119999999999997

9.1D- fit a new classification tree this time with only predictors than can be used for modeling a new auction.  

In [26]:
#create new dataframe fit a new classification tree
newdf = df[['sellerRating', 'Duration', 'Competitive?', 'currency_EUR', 'currency_GBP', 'currency_US', 'OpenPrice']]
trainData, validData = train_test_split(newdf, test_size = .4, random_state = 1)
# Print the shapes of the training and validation sets to verify the split
print("Training set :", trainData.shape)
print("Validation set :", validData.shape)
#Plot the decision tree


Training set : (1183, 7)
Validation set : (789, 7)


In [27]:

predictors = ['sellerRating', 'Duration', 'currency_EUR', 'currency_GBP', 'currency_US', 'OpenPrice']
outcome = ['Competitive?']
X1 = newdf[predictors]
y1 = newdf[outcome]
train_X1, valid_X1, train_y1, valid_y1 = train_test_split(X1, y1, test_size=0.4, random_state=1)


In [28]:
classTree = DecisionTreeClassifier(criterion="entropy", max_depth=7, min_samples_split=50)
classTree.fit(newdf.drop(columns=['Competitive?']), newdf['Competitive?'])
plt.figure(figsize=(150, 50))
tree.plot_tree(classTree)
plt.show()

  plt.show()


In [29]:
print("TRAINING SET")
classificationSummary(train_y1, classTree.predict(train_X1))
print('')
print('')
classificationSummary(valid_y1, classTree.predict(valid_X1))
75.99-75.54

TRAINING SET
Confusion Matrix (Accuracy 0.7599)

       Prediction
Actual   0   1
     0 452 101
     1 183 447


Confusion Matrix (Accuracy 0.7554)

       Prediction
Actual   0   1
     0 293  60
     1 133 303


0.44999999999998863

In [30]:
scat_X1= newdf.iloc[:,0:10]
#scat_X1['sellerRating'].value_counts() 
scat_X1.head()

Unnamed: 0,sellerRating,Duration,Competitive?,currency_EUR,currency_GBP,currency_US,OpenPrice
0,3249,5,0,0,0,1,0.01
1,3249,5,0,0,0,1,0.01
2,3249,5,0,0,0,1,0.01
3,3249,5,0,0,0,1,0.01
4,3249,5,0,0,0,1,0.01


the prediction accuracy drops by %1.56 from the training set to the validation set not a dependable prediction
if had to choose training set has better accuracy.


In [31]:
import matplotlib.pyplot as plt

# plot seller rating vs. Openprice
fig, ax = plt.subplots()

# Plotting subset where 'Competitive?' is 0 with marker 'o'
subset = new_df.loc[new_df['Competitive?'] == 0]
ax.scatter(subset.sellerRating, subset.OpenPrice, marker='o', label='Not Competitive')

# Plotting subset where 'Competitive?' is 1 with marker 'D'
subset1 = new_df.loc[new_df['Competitive?'] == 1]
ax.scatter(subset1.sellerRating, subset1.OpenPrice, marker='D', label='Competitive')

plt.xlabel('sellerRating') 
plt.ylabel('OpenPrice')

# Ensure legend is correctly displayed
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, labels)

plt.show()

  plt.show()


In [32]:
rf = RandomForestClassifier(n_estimators=500, random_state=1)
rf.fit(train_X1, train_y1)
importances = rf.feature_importances_
std = np.std([classTree.feature_importances_ for classTree in rf.estimators_], axis = 0)
DF = pd.DataFrame({'FEATURE': train_X1.columns, 'IMPORTANCE': importances, 'STD': std})
DF = DF.sort_values('IMPORTANCE', ascending= False)
print(DF)
classificationSummary(valid_y1, rf.predict(valid_X1))

  rf.fit(train_X1, train_y1)


        FEATURE  IMPORTANCE       STD
0  sellerRating    0.465014  0.043217
5     OpenPrice    0.441628  0.040639
1      Duration    0.062624  0.017985
2  currency_EUR    0.011723  0.009289
4   currency_US    0.009852  0.008045
3  currency_GBP    0.009159  0.006332
Confusion Matrix (Accuracy 0.7440)

       Prediction
Actual   0   1
     0 271  82
     1 120 316


In [33]:
# partition data
X = newdf.drop(columns=['Competitive?'])
y = newdf['Competitive?']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.40, random_state=1)
type(y)
type(X)

pandas.core.frame.DataFrame

In [34]:
classTree = DecisionTreeClassifier(criterion="entropy", random_state=1)
classTree.fit(X_train, y_train)

classes = classTree.classes_
classificationSummary(y_valid, classTree.predict(X_valid), class_names=classTree.classes_)


Confusion Matrix (Accuracy 0.7199)

       Prediction
Actual   0   1
     0 266  87
     1 134 302


In [35]:
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from dmba import liftChart
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from dmba import liftChart
# Make predictions on the validation set
predicted_proba = classTree.predict_proba(X_valid)
predicted_labels = classTree.predict(X_valid)

# Create a DataFrame with actual labels, predicted probabilities, and predicted labels
result = pd.DataFrame({'actual': y_valid,
                       'predicted': predicted_labels,
                       'p(0)': predicted_proba[:, 0],
                       'p(1)': predicted_proba[:, 1]})
liftChart(result.actual, labelBars=False)
plt.show()

  plt.show()


The scatter plot tells us that if the closing price of an item is over 200 
then the auction has a greater probability of competitiveness.

In [36]:

from sklearn.linear_model import LinearRegression
#train linear regression model
reg = LinearRegression()
reg.fit(train_X,train_y)
#evaluating performance
#training
regressionSummary(train_y, reg.predict(train_X))
#validation
regressionSummary(valid_y, reg.predict(valid_X))



Regression statistics

               Mean Error (ME) : -0.0000
Root Mean Squared Error (RMSE) : 0.4652
     Mean Absolute Error (MAE) : 0.4332

Regression statistics

               Mean Error (ME) : -0.0157
Root Mean Squared Error (RMSE) : 0.4827
     Mean Absolute Error (MAE) : 0.4500


In [37]:
df_sorted = trainDf.sort_values(by='Duration', ascending=False)

# Calculating cumulative number of positive outcomes
cumulative_positives = df_sorted['OpenPrice'].cumsum()

# Calculating cumulative lift
cumulative_lift = cumulative_positives / np.arange(1, len(trainDf)+1)

# Plotting the gains chart
plt.figure(figsize=(10, 6))
plt.plot(np.arange(1, len(trainDf)+1), cumulative_positives, label='Cumulative Positive Outcomes')
plt.plot(np.arange(1, len(trainDf)+1), np.arange(1, len(trainDf)+1), linestyle='--', color='gray', label='Random Model')
plt.xlabel('Percentile')
plt.ylabel('Cumulative Positive Outcomes')
plt.title('Gains Chart')
plt.legend()
plt.grid(True)
plt.show()

  plt.show()


In [38]:
plt.figure(figsize=(10, 6))
plt.plot(np.arange(1, len(trainDf)+1), cumulative_lift, label='Cumulative Lift')
plt.xlabel('Percentile')
plt.ylabel('Cumulative Lift')
plt.title('Lift Chart')
plt.legend()
plt.grid(True)
plt.show()

  plt.show()


9.2-The variable we are trying to predict is whether or not a flight is delayed.  A delay is defined as an arrival that is atleast 15 minutes later than scheduled.

DATA PREPROCESSING TRANSFORM VARIABLE COLUMN(DAY_WEEK) INTO A CATEGORICAL VARIABLE, 
BIN THE SCHEDULED DEPARTURE TIME INTO EIGHT BINS. USE THESE AND ALL OTHER COLUMNS AS PREDICTORS EXCLUDING (DAY_OF_MONTH)

In [39]:
flights_df = pd.read_csv(DATA / 'flightDelays.csv')

In [40]:
# Drop the 'DAY_OF_MONTH' column AND 'DEP_TIME'
flights_df = flights_df.drop(columns=['DAY_OF_MONTH', 'DEP_TIME'])

In [41]:
flights_df.dtypes

CRS_DEP_TIME      int64
CARRIER          object
DEST             object
DISTANCE          int64
FL_DATE          object
FL_NUM            int64
ORIGIN           object
Weather           int64
DAY_WEEK          int64
TAIL_NUM         object
Flight Status    object
dtype: object

In [42]:
#CONVERT 'DAY_WEEK' INTO CATEGORICAL VARIABLE
flights_df['DAY_WEEK'] = flights_df['DAY_WEEK'].astype('category')

In [43]:
#CREATE BINS FOR 'CRS_DEP_TIME' CHANGING MILITARY TIMES INTO BINNED VALUES
flights_df['binned_CRS_DEP_TIME'] = pd.cut(flights_df.CRS_DEP_TIME, 8, labels=False)
flights_df['binned_CRS_DEP_TIME'].value_counts()

5    419
4    390
3    299
1    270
0    261
6    204
7    190
2    168
Name: binned_CRS_DEP_TIME, dtype: int64

In [44]:
flights_df['DEST'].value_counts()

LGA    1150
EWR     665
JFK     386
Name: DEST, dtype: int64

In [45]:
flights_df['ORIGIN'].value_counts()

DCA    1370
IAD     686
BWI     145
Name: ORIGIN, dtype: int64

In [46]:
df = pd.get_dummies(flights_df, columns=['DAY_WEEK', 'CARRIER', 'DEST', 'FL_DATE', 'ORIGIN', 'TAIL_NUM', 'Flight Status'])

# Now, df_encoded contains binary columns for each category in the 'CARRIER' column
# You can use df_encoded for training your decision tree model
#df[['DAY_WEEK_1']]
#df.iloc[:,47:58]

9.2Afit classification tree with flight delay variable using all relevanet predictors with rules of max depth 8 and minimum impurity .01 from the training data changed all predictor variable to binary coding, crs_dep_time was also change to a bin
according to the hour.

In [47]:
#partition the data into training(%60) and validation(%40) sets
trainData, validData = train_test_split(df, test_size=0.4, random_state=1)
trainDF=trainData
print(trainData.shape, validData.shape)
X = df.drop(columns=['Flight Status_ontime', 'Flight Status_delayed'])  # Features
y = df['Flight Status_delayed']  # Target variable, assuming 'Flight Status' is the original column name
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.4, random_state=1)
fullClassTree = DecisionTreeClassifier(criterion="entropy", max_depth=8,  min_impurity_decrease=0.01,random_state=1)
fullClassTree.fit(X_train, y_train)
#Plot the decision tree
plt.figure(figsize=(10, 5))
tree.plot_tree(fullClassTree)
plt.show()

(1320, 608) (881, 608)


  plt.show()


In [48]:
fly_df= df[['binned_CRS_DEP_TIME', 'DAY_WEEK_1', 'DEST_EWR', 'ORIGIN_DCA', 'Flight Status_ontime', 'Flight Status_delayed']]
fly_df

Unnamed: 0,binned_CRS_DEP_TIME,DAY_WEEK_1,DEST_EWR,ORIGIN_DCA,Flight Status_ontime,Flight Status_delayed
0,4,0,0,0,1,0
1,5,0,0,1,1,0
2,3,0,0,0,1,0
3,5,0,0,0,1,0
4,2,0,0,0,1,0
...,...,...,...,...,...,...
2196,0,0,1,1,1,0
2197,5,0,1,0,1,0
2198,5,0,1,1,1,0
2199,3,0,1,1,1,0


In [49]:
trainData, validData = train_test_split(fly_df, test_size=0.4, random_state=1)

print(trainData.shape, validData.shape)
X = fly_df.drop(columns=['Flight Status_ontime', 'Flight Status_delayed'])  # Features
y = fly_df['Flight Status_delayed']  # Target variable,
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.4, random_state=1)
fullClassTree = DecisionTreeClassifier(criterion="entropy", max_depth=8,  min_impurity_decrease=0.01,random_state=1)
fullClassTree.fit(X_train, y_train)
#Plot the decision tree
plt.figure(figsize=(10, 5))
tree.plot_tree(fullClassTree)
plt.show()

(1320, 6) (881, 6)


  plt.show()


In [50]:
print(export_text(fullClassTree,
                  #feature_names=train_X.columns,
                  show_weights=True))

|--- feature_0 <= 3.50
|   |--- weights: [537.00, 84.00] class: 0
|--- feature_0 >  3.50
|   |--- weights: [522.00, 177.00] class: 0



In [51]:
fullClassTree = DecisionTreeClassifier(criterion="entropy", max_depth=8,  min_impurity_decrease=0.01,random_state=1)
fullClassTree.fit(X_valid, y_valid)
#Plot the decision tree
plt.figure(figsize=(10, 5))
tree.plot_tree(fullClassTree)
plt.show()
print(export_text(fullClassTree,
                  #feature_names=train_X.columns,
                  show_weights=True))

|--- weights: [714.00, 167.00] class: 0



  plt.show()


In [52]:
classificationSummary(y_train, fullClassTree.predict(X_train))
print('')
print('')
classificationSummary(y_valid, fullClassTree.predict(X_valid))

Confusion Matrix (Accuracy 0.8023)

       Prediction
Actual    0    1
     0 1059    0
     1  261    0


Confusion Matrix (Accuracy 0.8104)

       Prediction
Actual   0   1
     0 714   0
     1 167   0


9.2B the tree would not be useful from the data would nee more labels to show detailed information
not enough information from the training and validation set, yes the probability is available 
with a %80.23 accuracy from training set and 81.04 from valdation set
9.2 C fit the same tree excluding weather prediction

In [53]:
small_df = df.drop(columns=['Weather'])

In [54]:
trainData, validData = train_test_split(small_df, test_size=0.4, random_state=1)

print(trainData.shape, validData.shape)
X = small_df.drop(columns=['Flight Status_ontime', 'Flight Status_delayed'])  # Features
y = small_df['Flight Status_delayed']  # Target variable, assuming 'Flight Status' is the original column name
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.4, random_state=1)
fullClassTree = DecisionTreeClassifier(criterion="entropy", max_depth=8,  min_impurity_decrease=0.01,random_state=1)
fullClassTree.fit(X_train, y_train)
#Plot the decision tree
plt.figure(figsize=(10, 5))
tree.plot_tree(fullClassTree)
plt.show()

(1320, 607) (881, 607)


  plt.show()


In [55]:
print(export_text(fullClassTree,
                  #feature_names=train_X.columns,
                  show_weights=True))

|--- feature_2 <= 2257.50
|   |--- feature_2 <= 1609.50
|   |   |--- feature_0 <= 1377.50
|   |   |   |--- weights: [56.00, 10.00] class: 0
|   |   |--- feature_0 >  1377.50
|   |   |   |--- weights: [17.00, 22.00] class: 1
|   |--- feature_2 >  1609.50
|   |   |--- weights: [474.00, 40.00] class: 0
|--- feature_2 >  2257.50
|   |--- weights: [512.00, 189.00] class: 0



In [56]:
toyota_df = pd.read_csv(DATA / 'ToyotaCorolla.csv')


In [57]:
trainData, validData = train_test_split(small_df, test_size=0.4, random_state=1)

print(trainData.shape, validData.shape)
X = small_df.drop(columns=['Flight Status_ontime', 'Flight Status_delayed'])  # Features
y = small_df['Flight Status_delayed']  # Target variable, assuming 'Flight Status' is the original column name
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=1, random_state=1)
smallTree = DecisionTreeClassifier(criterion="entropy", max_depth=1, min_impurity_decrease=0.01)
smallTree.fit(X_train, y_train)

# Plot the decision tree
plt.figure(figsize=(10, 5))
plot_tree(smallTree)
plt.show()


(1320, 607) (881, 607)


  plt.show()


In [58]:
toyotaCorolla_df = pd.read_csv(DATA / 'ToyotaCorolla.csv').iloc[:1000,:]
toyotaCorolla_df = toyotaCorolla_df.rename(columns={'Age_08_04': 'Age', 'Quarterly_Tax': 'Tax'})

predictors = ['Age', 'KM', 'Fuel_Type', 'HP', 'Automatic', 'Doors', 'Tax', 'Mfr_Guarantee', 'Guarantee_Period', 'Airco', 
              'Automatic_airco', 'CD_Player', 'Powered_Windows', 'Sport_Model', 'Tow_Bar'] 
outcome = ['Price']

X = pd.get_dummies(toyotaCorolla_df[predictors], drop_first=True)
y = toyotaCorolla_df[outcome]

train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.4, random_state=1)
# use grid search to find optimized tree
param_grid = {
    'max_depth': [5, 10, 15, 20, 25],  
    'min_samples_split': [10, 20, 30, 40, 50],
    'min_impurity_decrease': [0, 0.001, 0.005, 0.01],
}
gridSearch = GridSearchCV(DecisionTreeRegressor(), param_grid, cv=5, n_jobs=-1)
gridSearch.fit(train_X, train_y)
print('Initial score: ', gridSearch.best_score_)
print('Initial parameters: ', gridSearch.best_params_)


param_grid = {
    'max_depth': list(range(2, 16)),  
    'min_samples_split': list(range(10,22)), 
    'min_impurity_decrease': [0.0009, 0.001, 0.0011],
}
gridSearch = GridSearchCV(DecisionTreeRegressor(), param_grid, cv=5, n_jobs=-1)
gridSearch.fit(train_X, train_y)
print('Improved score: ', gridSearch.best_score_)
print('Improved parameters: ', gridSearch.best_params_)

bestClassTree = gridSearch.best_estimator_
print('Training Set')
regressionSummary(train_y, bestClassTree.predict(train_X))
print()
print('Validation Set')
regressionSummary(valid_y, bestClassTree.predict(valid_X))



Initial score:  0.8397949905785465
Initial parameters:  {'max_depth': 5, 'min_impurity_decrease': 0.01, 'min_samples_split': 10}
Improved score:  0.8462828452936046
Improved parameters:  {'max_depth': 6, 'min_impurity_decrease': 0.001, 'min_samples_split': 11}
Training Set

Regression statistics

                      Mean Error (ME) : 0.0000
       Root Mean Squared Error (RMSE) : 995.9067
            Mean Absolute Error (MAE) : 738.5757
          Mean Percentage Error (MPE) : -0.7719
Mean Absolute Percentage Error (MAPE) : 6.6060

Validation Set

Regression statistics

                      Mean Error (ME) : 33.0162
       Root Mean Squared Error (RMSE) : 1443.0202
            Mean Absolute Error (MAE) : 975.3079
          Mean Percentage Error (MPE) : -1.2384
Mean Absolute Percentage Error (MAPE) : 8.8731


In [59]:
toyotaCorolla_df

Unnamed: 0,Id,Model,Price,Age,Mfg_Month,Mfg_Year,KM,Fuel_Type,HP,Met_Color,...,Powered_Windows,Power_Steering,Radio,Mistlamps,Sport_Model,Backseat_Divider,Metallic_Rim,Radio_cassette,Parking_Assistant,Tow_Bar
0,1,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13500,23,10,2002,46986,Diesel,90,1,...,1,1,0,0,0,1,0,0,0,0
1,2,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13750,23,10,2002,72937,Diesel,90,1,...,0,1,0,0,0,1,0,0,0,0
2,3,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13950,24,9,2002,41711,Diesel,90,1,...,0,1,0,0,0,1,0,0,0,0
3,4,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,14950,26,7,2002,48000,Diesel,90,0,...,0,1,0,0,0,1,0,0,0,0
4,5,TOYOTA Corolla 2.0 D4D HATCHB SOL 2/3-Doors,13750,30,3,2002,38500,Diesel,90,0,...,1,1,0,1,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1000,TOYOTA Corolla 1.6 16V HATCHB LINEA TERRA 2/3-...,9950,68,1,1999,42750,Petrol,110,1,...,0,1,0,0,0,1,0,0,0,0
996,1001,TOYOTA Corolla 1.6 16V LIFTB LINEA LUNA 4/5-Doors,9950,67,2,1999,42102,Petrol,110,1,...,1,1,0,1,0,1,1,0,0,0
997,1002,TOYOTA Corolla 1.6 LB Linea Terra 4/5-Doors,9950,63,6,1999,41586,Petrol,110,1,...,0,1,0,0,0,0,0,0,0,0
998,1003,TOYOTA Corolla 1.6 4/5-Doors,9900,64,5,1999,41200,Petrol,110,0,...,0,1,0,0,0,0,0,0,0,0


In [60]:
fig = plt.figure(figsize=(70,30))
_ = plot_tree(bestClassTree, 
                   feature_names=train_X.columns,                    
                   filled=True)
#fig.savefig('9.3RegTree.png')

In [61]:
#fine tuned tree training
#classificationSummary(train_y, bestClassTree.predict(train_X))
# Five-fold cross-validation of the full decision tree classifier
treeClassifier = DecisionTreeClassifier()

scores = cross_val_score(treeClassifier, train_X, train_y, cv=5)
print('Accuracy scores of each fold: ', [f'{acc:.3f}' for acc in scores])
print(f'Accuracy: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})')
print(f'Accuracy: {scores.mean():.3f} (+/- {scores.std():.3f})')

Accuracy scores of each fold:  ['0.075', '0.092', '0.083', '0.058', '0.083']
Accuracy: 0.078 (+/- 0.023)
Accuracy: 0.078 (+/- 0.011)




In [62]:
scores

array([0.075     , 0.09166667, 0.08333333, 0.05833333, 0.08333333])