In [32]:
#Question 9.3ab
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, mean_squared_error, mean_absolute_error
import matplotlib.pylab as plt
from dmba import plotDecisionTree, classificationSummary, regressionSummary

car = pd.read_csv('ToyotaCorolla.csv')

categorical_cols = car.select_dtypes(include=['object']).columns
categorical_cols = categorical_cols.drop('Price', errors='ignore')

car_df = pd.get_dummies(car, columns=categorical_cols, drop_first=True)

predictors = [col for col in car_df.columns if col not in ['Price', 'Binned_Price']]
outcome = 'Price'

car_df = car_df.dropna()

train_df, valid_df = train_test_split(car_df, test_size=0.4, random_state=1)

print('Part A(i)') #Part A(i)
X_train = train_df[predictors]
y_train = train_df[outcome]
X_valid = valid_df[predictors]
y_valid = valid_df[outcome]

full_tree = DecisionTreeRegressor(random_state=1)
full_tree.fit(X_train, y_train)

train_pred = full_tree.predict(X_train)
valid_pred = full_tree.predict(X_valid)

print("Full Tree - Training Set Performance:")
regressionSummary(y_train, train_pred)
print("Full Tree - Validation Set Performance:")
regressionSummary(y_valid, valid_pred)

print("-----")
print('Part A(ii)') #Part A(ii)
train_residuals = y_train - train_pred
valid_residuals = y_valid - valid_pred

plt.figure(figsize=(8, 6))
plt.boxplot([train_residuals, valid_residuals], labels=['Training', 'Validation'])
plt.title('Residuals: Training vs. Validation Sets')
plt.ylabel('Residuals')
plt.grid(True)
plt.tight_layout()
plt.savefig('Q9_residuals_boxplot.jpg')
plt.close()

importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': full_tree.feature_importances_
})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

print("Top 4 important features:")
print(importance_df.head(4))

plt.figure(figsize=(10, 6))
importance_df_sorted = importance_df.sort_values(by='Importance', ascending=True)
plt.barh(importance_df_sorted['Feature'], importance_df_sorted['Importance'])
plt.title('Feature Importances')
plt.xlabel('Importance')
plt.ylabel('Features')
plt.tight_layout()
plt.savefig('Q9_feature_importances.jpg')
plt.close()

plt.figure(figsize=(20, 15))
plot_tree(full_tree, feature_names=X_train.columns, filled=True, rounded=True, fontsize=10)
plt.savefig('Q9_full_decision_tree.jpg', dpi=300)
plt.close()


print(f"Tree depth: {full_tree.get_depth()}")
print(f"Number of leaves: {full_tree.get_n_leaves()}")

print("-----")
print('Part A(iv)') #Part A(iv)
param_grid = {
    'max_depth': range(2, 10), 
    'min_samples_split': range(5, 15)
}

grid_search = GridSearchCV(DecisionTreeRegressor(random_state=1), param_grid, cv=5)
grid_search.fit(X_train, y_train)

print("Best parameters from GridSearchCV: ", grid_search.best_params_)

best_tree = grid_search.best_estimator_
train_pred_best = best_tree.predict(X_train)
valid_pred_best = best_tree.predict(X_valid)

print("Best Tree - Training Set Performance:")
regressionSummary(y_train, train_pred_best)
print("Best Tree - Validation Set Performance:")
regressionSummary(y_valid, valid_pred_best)

plt.figure(figsize=(20, 15))
plot_tree(best_tree, feature_names=X_train.columns, filled=True, rounded=True, fontsize=10)
plt.savefig('Q9_best_decision_tree.jpg', dpi=300)
plt.close()

print(f"Tree depth: {best_tree.get_depth()}")
print(f"Number of leaves: {best_tree.get_n_leaves()}")

print("-----")
#Part B(i)
car_df['Binned_Price'], bin_edges = pd.cut(car_df['Price'], bins=20, labels=False, retbins=True)

X_train_clf, X_valid_clf, y_train_binned, y_valid_binned = train_test_split(
    car_df[predictors], car_df['Binned_Price'], test_size=0.4, random_state=1
)
print("-----")
print('Part B(ii)') #Part B(ii)
classification_tree = DecisionTreeClassifier(random_state=1)
classification_tree.fit(X_train_clf, y_train_binned)

train_pred_ct = classification_tree.predict(X_train_clf)
valid_pred_ct = classification_tree.predict(X_valid_clf)

print("Classification Tree - Training Set Performance:")
classificationSummary(y_train_binned, train_pred_ct)
print("Classification Tree - Validation Set Performance:")
classificationSummary(y_valid_binned, valid_pred_ct)

print("-----")
print('Part B(iii)') #Part B(iii)
param_grid_clf = {
    'max_depth': range(2, 10), 
    'min_samples_split': range(5, 15)
}

grid_search_ct = GridSearchCV(DecisionTreeClassifier(random_state=1), param_grid_clf, cv=5)
grid_search_ct.fit(X_train_clf, y_train_binned)

best_ct = grid_search_ct.best_estimator_

train_pred_best_ct = best_ct.predict(X_train_clf)
valid_pred_best_ct = best_ct.predict(X_valid_clf)

print("Fine-tuned Classification Tree - Training Set Performance:")
classificationSummary(y_train_binned, train_pred_best_ct)
print("Fine-tuned Classification Tree - Validation Set Performance:")
classificationSummary(y_valid_binned, valid_pred_best_ct)

plt.figure(figsize=(20, 15))
plot_tree(best_ct, feature_names=X_train_clf.columns, filled=True, rounded=True, fontsize=10)
plt.savefig('Q9_best_classification_tree.jpg', dpi=300)
plt.close()

car_specifications = pd.DataFrame(columns=predictors)
car_specifications.loc[0] = 0

car_specifications.loc[0, 'Age_08_04'] = 77
car_specifications.loc[0, 'KM'] = 117000
car_specifications.loc[0, 'HP'] = 110
car_specifications.loc[0, 'Automatic'] = 0
car_specifications.loc[0, 'Doors'] = 5
car_specifications.loc[0, 'Quarterly_Tax'] = 100
car_specifications.loc[0, 'Mfr_Guarantee'] = 0
car_specifications.loc[0, 'Guarantee_Period'] = 3
car_specifications.loc[0, 'Airco'] = 1
car_specifications.loc[0, 'Automatic_airco'] = 0
car_specifications.loc[0, 'CD_Player'] = 0
car_specifications.loc[0, 'Powered_Windows'] = 0
car_specifications.loc[0, 'Sport_Model'] = 0
car_specifications.loc[0, 'Tow_Bar'] = 1
car_specifications.loc[0, 'Fuel_Type_Petrol'] = 1

car_specifications = car_specifications.fillna(0)

predicted_price_rt = best_tree.predict(car_specifications)
print("\nPredicted Price from Regression Tree:", predicted_price_rt[0])

predicted_bin_ct = best_ct.predict(car_specifications)
print("Predicted Price Bin from Classification Tree:", predicted_bin_ct[0])

bin_start = bin_edges[int(predicted_bin_ct[0])]
bin_end = bin_edges[int(predicted_bin_ct[0]) + 1]
print(f"Predicted Price Range from Classification Tree: {bin_start:.2f} - {bin_end:.2f}")

Part A(i)
Full Tree - Training Set Performance:

Regression statistics

                      Mean Error (ME) : 0.0000
       Root Mean Squared Error (RMSE) : 0.0000
            Mean Absolute Error (MAE) : 0.0000
          Mean Percentage Error (MPE) : 0.0000
Mean Absolute Percentage Error (MAPE) : 0.0000
Full Tree - Validation Set Performance:

Regression statistics

                      Mean Error (ME) : -19.3078
       Root Mean Squared Error (RMSE) : 1500.0279
            Mean Absolute Error (MAE) : 1107.2035
          Mean Percentage Error (MPE) : -1.1716
Mean Absolute Percentage Error (MAPE) : 11.1827
-----
Part A(ii)
Top 4 important features:
     Feature  Importance
1  Age_08_04    0.665403
0         Id    0.139642
5         HP    0.048441
3   Mfg_Year    0.036904
Tree depth: 27
Number of leaves: 717
-----
Part A(iv)
Best parameters from GridSearchCV:  {'max_depth': 7, 'min_samples_split': 8}
Best Tree - Training Set Performance:

Regression statistics

                      M



Fine-tuned Classification Tree - Training Set Performance:
Confusion Matrix (Accuracy 0.6272)

       Prediction
Actual   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16
     0   1   1   0   1   0   1   0   0   0   0   0   0   0   0   0   0   0
     1   0  40  14   8   0   0   0   0   0   0   0   0   0   0   0   0   0
     2   1  19 107  52   0   0   0   0   0   0   0   0   0   0   0   0   0
     3   0   2  26 187  31   4   1   0   0   0   0   0   0   0   0   0   0
     4   0   0   3  31  61  10   4   0   0   0   0   0   0   0   0   0   0
     5   0   0   0   5  19  45  14   0   0   0   0   0   0   0   0   0   0
     6   0   0   0   1   3  12  42   0   0   0   0   0   0   0   0   0   0
     7   0   0   0   0   0   1   7   3   2   0   1   0   0   0   0   0   0
     8   0   0   0   0   0   0   1   0  26   0   4   1   0   0   0   0   0
     9   0   0   0   0   0   0   0   0   2   5   2   4   0   0   0   0   0
    10   0   0   0   0   1   0   0   0   1   0  13   8   0   0

In [46]:
# Question 10.4
import pandas as pd
import numpy as np
import os
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.metrics import roc_curve
import statsmodels.api as sm
from dmba import classificationSummary, gainsChart, liftChart
from dmba.metric import AIC_score
import matplotlib.pylab as plt

ebay_data = pd.read_csv('eBayAuctions.csv')

print('Part A') #Part A
category_pivot = ebay_data.pivot_table(values='Competitive?', index='Category', aggfunc='mean')
currency_pivot = ebay_data.pivot_table(values='Competitive?', index='currency', aggfunc='mean')
endday_pivot = ebay_data.pivot_table(values='Competitive?', index='endDay', aggfunc='mean')
duration_pivot = ebay_data.pivot_table(values='Competitive?', index='Duration', aggfunc='mean')

print("Category Pivot Table:\n", category_pivot)
print("\nCurrency Pivot Table:\n", currency_pivot)
print("\nEndDay Pivot Table:\n", endday_pivot)
print("\nDuration Pivot Table:\n", duration_pivot)

ebay_data_dummies = pd.get_dummies(ebay_data, columns=['Category', 'currency', 'endDay', 'Duration'], drop_first=True)

X = ebay_data_dummies.drop(columns=['Competitive?'])
y = ebay_data_dummies['Competitive?']

print("-----")
print('Part B') #Part B
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.4, random_state=42, stratify=y
)

logistic_model = LogisticRegression(max_iter=1000, solver='liblinear')
logistic_model.fit(X_train, y_train)

y_pred_prob = logistic_model.predict_proba(X_valid)[:, 1]
y_pred = (y_pred_prob >= 0.5).astype(int)

print("\nClassification Summary with All Predictors:")
classificationSummary(y_valid, y_pred)

print("-----")
print('Part C') #Part C
X_without_price = X.drop(columns=['ClosePrice'])
X_train_no_price, X_valid_no_price, y_train_no_price, y_valid_no_price = train_test_split(
    X_without_price, y, test_size=0.4, random_state=42, stratify=y
)

logistic_model_no_price = LogisticRegression(max_iter=1000, solver='liblinear')
logistic_model_no_price.fit(X_train_no_price, y_train_no_price)

y_pred_prob_no_price = logistic_model_no_price.predict_proba(X_valid_no_price)[:, 1]
y_pred_no_price = (y_pred_prob_no_price >= 0.5).astype(int)

print("\nClassification Summary without 'ClosePrice':")
classificationSummary(y_valid_no_price, y_pred_no_price)

print("-----")
print('Part D') #Part D
X_train_with_const = sm.add_constant(X_train)

bool_columns = X_train_with_const.select_dtypes(include=['bool']).columns
X_train_with_const[bool_columns] = X_train_with_const[bool_columns].astype(int)

print("\nData types after converting 'bool' to 'int':")
print(X_train_with_const.dtypes)

print("\nData type of y_train before conversion:", y_train.dtype)
if y_train.dtype == 'bool':
    y_train = y_train.astype(int)
print("Data type of y_train after conversion:", y_train.dtype)

logit_model = sm.Logit(y_train, X_train_with_const)
result = logit_model.fit()

closing_price_coef = result.params['ClosePrice']
closing_price_pvalue = result.pvalues['ClosePrice']

print(f"\nClosing Price Coefficient: {closing_price_coef:.4f}, P-value: {closing_price_pvalue:.4e}")

print("-----")
print('Part E') #Part E
rfe_selector = RFE(estimator=LogisticRegression(max_iter=1000, solver='liblinear'), n_features_to_select=5)
rfe_selector.fit(X_train, y_train)
selected_features_train = X_train.columns[rfe_selector.support_]

print("\nSelected Predictors from Stepwise Regression on Training Data:")
print(selected_features_train)

logistic_model_rfe = LogisticRegression(max_iter=1000, solver='liblinear')
logistic_model_rfe.fit(X_train[selected_features_train], y_train)

y_pred_prob_rfe = logistic_model_rfe.predict_proba(X_valid[selected_features_train])[:, 1]
y_pred_rfe = (y_pred_prob_rfe >= 0.5).astype(int)

print("\nClassification Summary with Selected Features (Training Data):")
classificationSummary(y_valid, y_pred_rfe)

print("-----")
print('Part F') #Part F
rfe_selector_valid = RFE(estimator=LogisticRegression(max_iter=1000, solver='liblinear'), n_features_to_select=5)
rfe_selector_valid.fit(X_valid, y_valid)
selected_features_valid = X_valid.columns[rfe_selector_valid.support_]
print("\nSelected Predictors from Stepwise Regression on Validation Data:")
print(selected_features_valid)

print("-----")
print('Part I') #Part I
logistic_l1 = LogisticRegressionCV(
    Cs=10, cv=5, penalty='l1', solver='liblinear', max_iter=1000, random_state=42
)
logistic_l1.fit(X_train, y_train)

y_pred_prob_l1 = logistic_l1.predict_proba(X_valid)[:, 1]
y_pred_l1 = (y_pred_prob_l1 >= 0.5).astype(int)

print("\nClassification Summary with L1 Regularization:")
classificationSummary(y_valid, y_pred_l1)

selected_features_l1 = X_train.columns[logistic_l1.coef_[0] != 0]
print("\nSelected Features from L1 Regularization:")
print(selected_features_l1)

print("-----")
print('Part J') #Part J
fpr, tpr, thresholds = roc_curve(y_valid, y_pred_prob)
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
print(f"\nOptimal Cutoff Value: {optimal_threshold:.4f}")

y_pred_optimal = (y_pred_prob >= optimal_threshold).astype(int)
print("\nClassification Summary with Optimal Cutoff:")
classificationSummary(y_valid, y_pred_optimal)

#Part K
def plot_gains(y_true, y_pred_prob):
    df = pd.DataFrame({'y_true': y_true.values, 'y_pred_prob': y_pred_prob})
    df = df.sort_values(by='y_pred_prob', ascending=False).reset_index(drop=True)
    df['cum_positive'] = df['y_true'].cumsum()
    total_positives = df['y_true'].sum()
    df['gain'] = df['cum_positive'] / total_positives
    df['percentage'] = (df.index + 1) / len(df)
    plt.figure(figsize=(8, 6))
    plt.plot(df['percentage'], df['gain'], label='Model', color='blue')
    plt.plot([0, 1], [0, 1], linestyle='--', label='Random', color='grey')
    plt.xlabel('Percentage of Sample')
    plt.ylabel('Percentage of Positive Responses')
    plt.title('Gains Chart')
    plt.legend()
    plt.grid(True)
    plt.savefig('Q10_Gains_Chart.jpg', dpi=300)
    plt.close()

def plot_lift(y_true, y_pred_prob, n_bins=10):
    df = pd.DataFrame({'y_true': y_true.values, 'y_pred_prob': y_pred_prob})
    df['decile'] = pd.qcut(df['y_pred_prob'], q=n_bins, duplicates='drop', labels=False)
    overall_response_rate = df['y_true'].mean()
    lift_df = df.groupby('decile').apply(
        lambda x: x['y_true'].mean() / overall_response_rate
    ).reset_index(name='lift')
    lift_df = lift_df.sort_values(by='decile', ascending=False).reset_index(drop=True)
    lift_df['decile'] = lift_df.index + 1
    plt.figure(figsize=(8, 6))
    plt.plot(lift_df['decile'], lift_df['lift'], marker='o', linestyle='-')
    plt.xlabel('Deciles (1 = Highest Probability)')
    plt.ylabel('Lift')
    plt.title('Lift Chart')
    plt.xticks(lift_df['decile'])
    plt.grid(True)
    plt.savefig('Q10_Lift_Chart.jpg', dpi=300)
    plt.close()

plot_gains(y_valid, y_pred_prob)
plot_lift(y_valid, y_pred_prob)

Part A
Category Pivot Table:
                       Competitive?
Category                          
Antique/Art/Craft         0.564972
Automotive                0.353933
Books                     0.500000
Business/Industrial       0.666667
Clothing/Accessories      0.504202
Coins/Stamps              0.297297
Collectibles              0.577406
Computer                  0.666667
Electronics               0.800000
EverythingElse            0.235294
Health/Beauty             0.171875
Home/Garden               0.656863
Jewelry                   0.365854
Music/Movie/Game          0.602978
Photography               0.846154
Pottery/Glass             0.350000
SportingGoods             0.725806
Toys/Hobbies              0.529915

Currency Pivot Table:
           Competitive?
currency              
EUR           0.551595
GBP           0.687075
US            0.519350

EndDay Pivot Table:
         Competitive?
endDay              
Fri         0.466899
Mon         0.673358
Sat         0.427350
Sun 

  lift_df = df.groupby('decile').apply(


In [65]:
#Question 11.1
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler 
from dmba import classificationSummary

data = {
    'Years': [4, 18, 1, 3, 15, 6],
    'Salary': [43, 65, 53, 95, 88, 112],
    'Used Credit': [0, 1, 0, 0, 1, 1]
}
df = pd.DataFrame(data)

scaler = MinMaxScaler()
X = df[['Years', 'Salary']]
X_scaled = scaler.fit_transform(X)
y = df['Used Credit']

clf = MLPClassifier(hidden_layer_sizes=(3), activation='logistic', 
    solver='lbfgs', max_iter=1, random_state=1)
clf.fit(X_scaled, y)

print('Initial Weights:')
print(clf.coefs_)
print()
print('Initial Intercepts:')
print(clf.intercepts_)

predictions = clf.predict(X_scaled)
print('\nPredictions:', predictions)

probabilities = clf.predict_proba(X_scaled)
print('\nProbabilities:')
print(probabilities)

Initial Weights:
[array([[-0.09766229,  0.26219066, -0.6018463 ],
       [-0.24500208, -0.45806961, -0.49489149]]), array([[0.12627914],
       [0.00429165],
       [0.2564571 ]])]

Initial Intercepts:
[array([-0.39379937, -0.20284179, -0.12227882]), array([-0.14535905])]

Predictions: [1 0 1 0 0 0]

Probabilities:
[[0.49484899 0.50515101]
 [0.50564764 0.49435236]
 [0.49447033 0.50552967]
 [0.50146065 0.49853935]
 [0.50693581 0.49306419]
 [0.50529471 0.49470529]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
