In [145]:
import numpy as np
import pandas as pd
import matplotlib as mp
import statsmodels.api as sm
from statsmodels.sandbox.regression.gmm import IV2SLS 
from statsmodels.sandbox.regression.gmm import GMM

## Part One

In [146]:
# Load the dataset
input_table = pd.read_csv('/Users/liu/Desktop/5110 midterm/midterm_partone.csv')
input_table.head()

Unnamed: 0,Constant,Stock Change,Inventory Turnover,Operating Profit,Interaction Effect,Current Ratio,Quick Ratio,Debt Asset Ratio
0,1,0.870332,1.795946,0.115846,0.208053,1.672527,0.255171,0.473317
1,1,-0.047347,1.395501,0.436967,0.609788,1.637261,0.221763,0.489967
2,1,0.001176,1.664563,0.541016,0.900555,1.640619,0.189141,0.374269
3,1,-0.9012,1.605738,0.539399,0.866133,1.436221,0.131944,0.224399
4,1,-0.176353,1.591451,0.539938,0.859285,1.43314,0.183095,0.213446


In [147]:
# Fit an OLS model for instrument endogenous variable
model_iv = sm.OLS(input_table["Inventory Turnover"], input_table[["Constant", "Current Ratio", "Quick Ratio", "Debt Asset Ratio"]]).fit()
endog_predict = model_iv.predict(input_table[["Constant", "Current Ratio", "Quick Ratio", "Debt Asset Ratio"]])
input_table["Endogenous Param"] = endog_predict

In [148]:
# Fit the 2SLS model
model_2sls = sm.OLS(input_table["Stock Change"], input_table[["Constant", "Endogenous Param", "Operating Profit", "Interaction Effect"]]).fit()
model_2sls.summary()

0,1,2,3
Dep. Variable:,Stock Change,R-squared:,0.015
Model:,OLS,Adj. R-squared:,0.013
Method:,Least Squares,F-statistic:,8.53
Date:,"Thu, 09 Nov 2023",Prob (F-statistic):,1.27e-05
Time:,15:22:49,Log-Likelihood:,-1186.5
No. Observations:,1696,AIC:,2381.0
Df Residuals:,1692,BIC:,2403.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Constant,-0.0176,0.020,-0.896,0.370,-0.056,0.021
Endogenous Param,0.0011,0.001,1.827,0.068,-7.76e-05,0.002
Operating Profit,-0.1201,0.028,-4.319,0.000,-0.175,-0.066
Interaction Effect,0.0014,0.000,3.621,0.000,0.001,0.002

0,1,2,3
Omnibus:,368.832,Durbin-Watson:,2.243
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3433.92
Skew:,0.742,Prob(JB):,0.0
Kurtosis:,9.811,Cond. No.,109.0


In [149]:
# Extract data
y_vals = np.array(input_table["Stock Change"])
x_vals = np.array(input_table[["Inventory Turnover", "Operating Profit", "Interaction Effect"]])
iv_vals = np.array(input_table[["Current Ratio", "Quick Ratio", "Debt Asset Ratio"]])

In [150]:
# Define an updated GMM class
class gmm_updated(GMM):
    def momcond(self, params):
        p0, p1, p2, p3, delta = params
        endog = self.endog
        exog = self.exog
        inst = self.instrument

        error0 = endog - p0 - p1 * exog[:, 0] - p2 * exog[:, 1] - p3 * exog[:, 2]
        error1 = (endog - p0 - p1 * exog[:, 0] - p2 * exog[:, 1] - p3 * exog[:, 2]) * exog[:, 1]
        error2 = (endog - p0 - p1 * exog[:, 0] - p2 * exog[:, 1] - p3 * exog[:, 2]) * exog[:, 2]
        error3 = (endog - p0 - p1 * exog[:, 0] - p2 * exog[:, 1] - p3 * exog[:, 2]) * inst[:, 0]- delta
        error4 = (endog - p0 - p1 * exog[:, 0] - p2 * exog[:, 1] - p3 * exog[:, 2]) * inst[:, 1]- delta
        error5 = (endog - p0 - p1 * exog[:, 0] - p2 * exog[:, 1] - p3 * exog[:, 2]) * inst[:, 2]- delta

        g = np.column_stack((error0, error1, error2, error3, error4, error5))
        return g                     

In [151]:
# Initialize parameters including delta
beta0_updated = np.array([0.1, 0.1, 0.1, 0.1, 0.1])
# Fit the updated GMM model
res_updated = gmm_updated(endog=y_vals, exog=x_vals, instrument=iv_vals, k_moms=6, k_params=5).fit(beta0_updated)


Optimization terminated successfully.
         Current function value: 0.000031
         Iterations: 10
         Function evaluations: 15
         Gradient evaluations: 15
Optimization terminated successfully.
         Current function value: 0.000345
         Iterations: 9
         Function evaluations: 11
         Gradient evaluations: 11
Optimization terminated successfully.
         Current function value: 0.000346
         Iterations: 7
         Function evaluations: 10
         Gradient evaluations: 10
Optimization terminated successfully.
         Current function value: 0.000346
         Iterations: 2
         Function evaluations: 5
         Gradient evaluations: 5


In [152]:
# Print the summary table
print(res_updated.summary())

                             gmm_updated Results                              
Dep. Variable:                      y   Hansen J:                       0.5862
Model:                    gmm_updated   Prob (Hansen J):                 0.444
Method:                           GMM                                         
Date:                Thu, 09 Nov 2023                                         
Time:                        15:22:49                                         
No. Observations:                1696                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
p 0           -0.0208      0.021     -0.986      0.324      -0.062       0.020
p 1            0.0011      0.001      1.839      0.066   -7.31e-05       0.002
p 2           -0.1062      0.032     -3.316      0.001      -0.169      -0.043
p 3            0.0011      0.000      2.688      0.0

## Part Two

In [153]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import seaborn as sns
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, classification_report

In [154]:
# Load the dataset
df = pd.read_csv('/Users/liu/Desktop/5110 midterm/midterm_parttwo.csv')

In [155]:
df.head()

Unnamed: 0,Years of Education after High School,Requested Credit Amount,Number of Dependents,Monthly Income,Monthly Expense,Marital Status,Credit Rating
0,1,Low,No dependent,Very low,Very low,Married,Positive
1,2,Low,No dependent,Very low,Very low,Single,Positive
2,1,Low,No dependent,Very low,Very low,Single,Positive
3,3,Low,No dependent,Very low,Very low,Married,Positive
4,3,Low,No dependent,Very low,Very low,Single,Negative


### Question1

In [156]:
# Define features and target
X = df.drop('Credit Rating', axis=1)
y = df['Credit Rating']

In [157]:
# Convert categorical variables into dummy/indicator variables
X = pd.get_dummies(X, drop_first=True)

In [158]:
# Split the dataset into training and test sets (50% each)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

In [159]:
# Fit a logistic regression model on the training set
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train, y_train)

In [160]:
# Logistic Regression feature importance (coefficients)
lr_importance = lr_model.coef_
print(f"Logistic Regression Coefficients: {lr_importance}")

Logistic Regression Coefficients: [[ 0.06941315 -0.02388211 -0.14014207  0.24709413 -0.12840459 -0.09745196
  -0.26099784  0.32373164  0.10491338  0.29118167  0.04638876 -0.91906818
   0.03917557  0.12796411  0.14508364]]


In [161]:
# Apply the model to the test set
predicted_y = lr_model.predict(X_test)

In [162]:
# Calculate confusion matrix, recall, precision, and F1 score
conf_matrix = confusion_matrix(y_test, predicted_y)
recall = recall_score(y_test, predicted_y, pos_label='Positive')
precision = precision_score(y_test, predicted_y, pos_label='Positive')
f1 = f1_score(y_test, predicted_y, pos_label='Positive')

# Print the results
print("Confusion Matrix:")
print(conf_matrix)
print("\nRecall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)

Confusion Matrix:
[[   0  577]
 [   0 3464]]

Recall: 1.0
Precision: 0.8572135609997525
F1 Score: 0.9231179213857428


### Question 2

In [163]:
# Adjust the credit approval threshold to approve only 15% of applications
threshold = np.percentile(lr_model.predict_proba(X_test)[:, 1], 85)

In [164]:
# Update predictions with the adjusted threshold
y_pred_adjusted = (lr_model.predict_proba(X_test)[:, 1] >= threshold).astype('str')

In [165]:
# Calculate the confusion matrix, recall, precision, and F1 score with the adjusted threshold
conf_matrix_adjusted = confusion_matrix(y_test, y_pred_adjusted)
recall_adjusted = recall_score(y_test, y_pred_adjusted, average='weighted')
precision_adjusted = precision_score(y_test, y_pred_adjusted, average='weighted')
f1_score_adjusted = f1_score(y_test, y_pred_adjusted, average='weighted')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [166]:
print("\nResults with 15% Approval Rate:")
print("Confusion Matrix (Adjusted):\n", conf_matrix_adjusted)
print("Recall (Adjusted):", recall_adjusted)
print("Precision (Adjusted):", precision_adjusted)
print("F1 Score (Adjusted):", f1_score_adjusted)


Results with 15% Approval Rate:
Confusion Matrix (Adjusted):
 [[   0    0    0    0]
 [ 495    0    0   82]
 [2936    0    0  528]
 [   0    0    0    0]]
Recall (Adjusted): 0.0
Precision (Adjusted): 0.0
F1 Score (Adjusted): 0.0


In [167]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.preprocessing import LabelEncoder

In [168]:
# Assuming 'Negative' is class 0 and 'Positive' is class 1
label_encoder = LabelEncoder()
y_test_encoded = label_encoder.fit_transform(y_test)

In [169]:
# Apply class-specific threshold to the predicted probabilities
custom_predicted_target = (lr_model.predict_proba(X_test)[:, 1] > class_thresholds[0]).astype(int)


In [170]:
# Update class-specific confusion matrix, recall, precision, and F1 score
class_custom_metrics = {}
class_custom_conf_matrix = confusion_matrix(y_test_encoded, custom_predicted_target)
class_custom_precision, class_custom_recall, class_custom_f1, _ = precision_recall_fscore_support(
    y_test_encoded, custom_predicted_target, average='weighted', zero_division=1)

class_custom_metrics[0] = {
    'Confusion Matrix': class_custom_conf_matrix,
    'Recall': class_custom_recall,
    'Precision': class_custom_precision,
    'F1 Score': class_custom_f1
}

In [171]:
# Print class-specific results with the custom threshold
for i, metrics in class_custom_metrics.items():
    print(f"\nResults for Class {i} with {15}% Approval Rate:")
    print("Confusion Matrix (Adjusted):")
    print(metrics['Confusion Matrix'])
    print(f"Recall (Adjusted): {metrics['Recall']}")
    print(f"Precision (Adjusted): {metrics['Precision']}")
    print(f"F1 Score (Adjusted): {metrics['F1 Score']}")


Results for Class 0 with 15% Approval Rate:
Confusion Matrix (Adjusted):
[[   0  577]
 [   0 3464]]
Recall (Adjusted): 0.8572135609997525
Precision (Adjusted): 0.877601528162124
F1 Score (Adjusted): 0.7913092006137622
