In [16]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor


In [17]:
# Load the raw transaction log into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file location exists.
csv_path = 'C:/Users/dines/Downloads/PS_20174392719_1491204439457_log.csv'
payment_df2 = pd.read_csv(csv_path)

In [18]:
# Map each distinct value of the 'type' column to an integer code and store
# the result in a new 'type_encoded' column.
# NOTE(review): 'type_encoded' is not part of the feature list selected in the
# next cell — confirm whether it was meant to be used as a model input.
label_encoder = LabelEncoder()
label_encoder.fit(payment_df2['type'])
payment_df2['type_encoded'] = label_encoder.transform(payment_df2['type'])

In [19]:
# Columns used as model inputs.
col_select = [
    'amount',
    'oldbalanceOrg',
    'newbalanceOrig',
    'oldbalanceDest',
    'newbalanceDest',
]

# Feature matrix and target column taken from the loaded DataFrame.
X = payment_df2[col_select]
y = payment_df2['isFraud']

In [20]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)
# Disabled classifier experiment, kept for reference:
#classifier = DecisionTreeClassifier()
#classifier.fit(X_train, y_train)

In [21]:

# Candidate hyperparameter values explored by the grid search
# (4 x 3 x 3 = 36 combinations).
param_grid = dict(
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 10, 20],
    min_samples_leaf=[1, 5, 10],
)


In [22]:
#fraud_probabilities = classifier.predict_proba(X_test)[:, 1]

In [23]:
#X_test['FraudRiskScore'] = fraud_probabilities

In [None]:
# Base model whose hyperparameters are tuned; seeded for reproducibility.
# NOTE(review): the target y is taken from the 'isFraud' column while a
# regressor scored by MSE is fitted — confirm a regressor (rather than a
# classifier) is intended.
dt_regressor = DecisionTreeRegressor(random_state=42)

# Exhaustive search over param_grid with 5-fold cross-validation, ranking
# candidates by negated mean squared error (higher is better).
grid_search = GridSearchCV(
    estimator=dt_regressor,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search.fit(X_train, y_train)

In [None]:
# Report the winning hyperparameter combination and its cross-validated score.
best_params, best_score = grid_search.best_params_, grid_search.best_score_
print(f"Best Decision Tree Params: {best_params}")
print(f"Best CV (neg MSE) Score: {best_score}")


In [None]:
# Take the tuned model from the grid search.
# GridSearchCV was created without a `refit` argument, so it uses the default
# refit=True: best_estimator_ has already been refitted on the full training
# data passed to grid_search.fit. Fitting it again here would only repeat an
# expensive training run, so no second fit is performed.
best_tree_model = grid_search.best_estimator_

In [None]:
# Score the held-out test features with the tuned tree.
y_pred = best_tree_model.predict(X=X_test)

In [None]:
# Compare test-set predictions against the true targets.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(f"Test MSE: {mse}")
print(f"Test R2 Score: {r2}")

In [None]:
# Scatter of true targets against predictions, with a dashed red diagonal
# marking where predictions would equal the true values.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, color='blue', alpha=0.6, label='Predictions')

diagonal = [y_test.min(), y_test.max()]
ax.plot(diagonal, diagonal, color='red', linestyle='--', lw=2)

ax.set_xlabel('True Values')
ax.set_ylabel('Predicted Values')
ax.set_title('True vs Predicted Values (Decision Tree Regressor)')
ax.legend()
plt.show()

# Visualize feature importance
# Requires `import numpy as np` in the notebook's import cell; without it the
# argsort call below raises NameError on a fresh kernel.
feature_importance = best_tree_model.feature_importances_
features = X.columns

# Ascending order of importance, so the horizontal bars grow from the least
# important feature (first position) to the most important (last position).
indices = np.argsort(feature_importance)
positions = range(len(indices))

plt.figure(figsize=(10, 6))
plt.title('Feature Importance')
plt.barh(positions, feature_importance[indices], align='center')
# Label each bar with the feature name matching its sorted position.
plt.yticks(positions, list(features[indices]))
plt.xlabel('Relative Importance')
plt.show()