In [1]:
import pandas as pd

# Read the fraud data; the first CSV column becomes the row index.
df = pd.read_csv('/content/sample_data/fraud.csv', index_col=0)

# Labels are pulled out before the frame is trimmed.
y = df['Class'].to_numpy()

# Discard the first remaining column by position, then build the feature
# matrix from every column except the label.
df = df.iloc[:, 1:]
X = df.drop(columns=['Class']).to_numpy()

In [3]:
# Preview the first rows of `df` (display only; nothing is assigned).
df.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,1.176563,0.323798,0.536927,1.047002,-0.368652,-0.728586,0.084678,-0.069246,-0.266389,0.155315,...,-0.109627,-0.341365,0.057845,0.49918,0.415211,-0.581949,0.015472,0.018065,4.67,0
1,0.681109,-3.934776,-3.801827,-1.147468,-0.73554,-0.501097,1.038865,-0.626979,-2.274423,1.527782,...,0.652202,0.272684,-0.982151,0.1659,0.360251,0.195321,-0.256273,0.056501,912.0,0
2,1.140729,0.453484,0.24701,2.383132,0.343287,0.432804,0.09338,0.17331,-0.808999,0.775436,...,-0.003802,0.058556,-0.121177,-0.304215,0.645893,0.1226,-0.012115,-0.005945,1.0,0
3,-1.107073,-3.298902,-0.184092,-1.795744,2.137564,-1.684992,-2.015606,-0.007181,-0.16576,0.869659,...,0.130648,0.329445,0.927656,-0.04956,-1.892866,-0.575431,0.266573,0.414184,62.1,0
4,-0.314818,0.866839,-0.124577,-0.627638,2.651762,3.428128,0.194637,0.670674,-0.442658,0.133499,...,-0.312774,-0.799494,-0.064488,0.953062,-0.42955,0.158225,0.076943,-0.015051,2.67,0


In [2]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
# NOTE(review): the split is not stratified. The recorded confusion matrices
# show a rare positive class, so consider stratify=y — confirm before changing,
# since every downstream number would move.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.40, random_state=1
)

In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

#--------------------------------------------------
## ------------Logistic Regression---------------##
#--------------------------------------------------

from sklearn.linear_model import LogisticRegression

# Standardise the features, then fit an L1-penalised logistic regression
# (liblinear solver, C=1).
steps = [
    ('scaler', StandardScaler()),
    ('logReg', LogisticRegression(penalty="l1", C=1, solver='liblinear')),
]
LR_pipeline = Pipeline(steps)

# Left as the last expression so the fitted pipeline is displayed by the cell.
LR_pipeline.fit(X_train, y_train)

In [5]:
#--------------------------------------------------
## Model Evaluation ##
#--------------------------------------------------
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score


# Hard class predictions on the held-out split.
ypred_test = LR_pipeline.predict(X_test)

mat_clf = confusion_matrix(y_test, ypred_test)
print(mat_clf)

report_clf = classification_report(y_test, ypred_test)
print(report_clf)

# ROC AUC is scored on the second predict_proba column (index 1), not on the
# hard labels.
ypred_testP = LR_pipeline.predict_proba(X_test)
auc = roc_auc_score(y_test, ypred_testP[:, 1])
print(auc)

[[8535    6]
 [  27  110]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      8541
           1       0.95      0.80      0.87       137

    accuracy                           1.00      8678
   macro avg       0.97      0.90      0.93      8678
weighted avg       1.00      1.00      1.00      8678

0.982880344444188


In [6]:
# Same evaluation as for the test split, repeated on the training split.
# NOTE: `mat_clf`, `report_clf` and `auc` are reassigned here, so after this
# cell they hold the training-set values.
ypred_train = LR_pipeline.predict(X_train)

mat_clf = confusion_matrix(y_train, ypred_train)
print(mat_clf)

report_clf = classification_report(y_train, ypred_train)
print(report_clf)

ypred_trainP = LR_pipeline.predict_proba(X_train)
auc = roc_auc_score(y_train, ypred_trainP[:, 1])
print(auc)

[[12793     3]
 [   46   173]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     12796
           1       0.98      0.79      0.88       219

    accuracy                           1.00     13015
   macro avg       0.99      0.89      0.94     13015
weighted avg       1.00      1.00      1.00     13015

0.9796711586526041


#  Linear Regression

In [8]:
from sklearn.linear_model import LinearRegression
import numpy as np
# Regression setup: the 'Amount' column is the target, and every column other
# than 'Class' and 'Amount' is a feature.
y_amount = df['Amount'].to_numpy()
X_amount = df.drop(columns=['Class', 'Amount']).to_numpy()

In [9]:
# 60/40 train/test split for the regression task, seeded for reproducibility.
X_train_amt, X_test_amt, y_train_amt, y_test_amt = train_test_split(
    X_amount,
    y_amount,
    test_size=0.4,
    random_state=1,
)

In [10]:
# Pipeline stages for the baseline regressor: standardise, then fit a
# LinearRegression model.
linear_steps = [
    ('scaler', StandardScaler()),
    ('linear', LinearRegression()),
]

In [11]:
# Assemble the scaler + linear-regression stages into a single estimator.
linear_pipeline = Pipeline(linear_steps)

In [12]:
# Fit on the training split only; as the last expression, the fitted pipeline
# is also displayed by the cell.
linear_pipeline.fit(X_train_amt, y_train_amt)

In [13]:
# Predictions for the held-out regression rows; scored later with MSE and R².
y_pred_linear = linear_pipeline.predict(X_test_amt)

In [14]:
# Display the held-out regression feature matrix (inspection only; nothing is
# assigned).
X_test_amt

array([[-0.40622598,  0.97550636,  1.13722513, ...,  0.106631  ,
         0.2537768 ,  0.08003874],
       [-1.47786101, -0.04766579,  2.59275043, ...,  0.53417522,
         0.17618306, -0.05558966],
       [ 1.25510238, -0.59158358,  0.4615481 , ...,  0.34167155,
        -0.02035939,  0.00551909],
       ...,
       [-2.93688289,  3.14111389, -2.53056963, ..., -0.38776471,
        -0.41038818,  0.00610202],
       [-0.01002683,  0.68553816, -0.53059335, ..., -0.33403481,
         0.0761074 ,  0.15946343],
       [ 1.98995027, -0.19281301, -0.10669695, ...,  0.23824931,
        -0.05211263, -0.0471446 ]])

# Polynomial Regression Section

In [17]:
from sklearn.preprocessing import PolynomialFeatures
# Degree-2 polynomial expansion without the constant (bias) column.
poly_features = PolynomialFeatures(degree=2, include_bias=False)

# Small subset: the first 1000 rows and the first two feature columns.
# NOTE(review): neither this subset nor its transform is used by any later
# visible cell — confirm whether it is still needed.
X_poly_simple = X_amount[:1000, :2]

In [18]:
# Fit the polynomial expander on the small subset and transform it in one step.
X_poly_transformed = poly_features.fit_transform(X_poly_simple)

# Model Evaluation

In [19]:
from sklearn.metrics import mean_squared_error, r2_score

In [20]:
# Mean squared error of the linear model on the held-out regression split.
linear_mse = mean_squared_error(y_test_amt, y_pred_linear)

In [21]:
# Report the MSE computed in the previous cell.
print(f"Linear Regression MSE: {linear_mse:.2f}")

# R² of the linear model on the same held-out split; printed in the next cell.
linear_r2 = r2_score(y_test_amt, y_pred_linear)

Linear Regression MSE: 9444.49


In [22]:
# Report the linear model's R² to four decimals.
print(f"Linear Regression R²: {linear_r2:.4f}")

Linear Regression R²: 0.8495


In [23]:
# Polynomial regressor: standardise, expand to degree-2 terms (no bias column),
# then fit a LinearRegression on the expanded features.
poly_pipeline = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=2, include_bias=False)),
        ('regressor', LinearRegression()),
    ]
)

In [25]:
# Fit on the training split and predict the held-out rows; fit() returns the
# pipeline itself, so the two calls can be chained.
y_pred_poly = poly_pipeline.fit(X_train_amt, y_train_amt).predict(X_test_amt)

# Mean squared error on the same held-out split used for the linear model.
poly_mse = mean_squared_error(y_test_amt, y_pred_poly)

In [27]:
# Report the MSE computed in the previous cell.
print(f"Polynomial Regression MSE: {poly_mse:.2f}")

# R² of the polynomial model on the held-out split; printed in the next cell.
poly_r2 = r2_score(y_test_amt, y_pred_poly)

Polynomial Regression MSE: 11933.45


In [28]:
# Report the polynomial model's R² to four decimals.
print(f"Polynomial Regression R²: {poly_r2:.4f}")

Polynomial Regression R²: 0.8098


In [29]:
# Side-by-side table of both regressors' held-out metrics.
regression_comparison = pd.DataFrame(
    {
        'Model': ['Linear Regression', 'Polynomial Regression'],
        'MSE': [linear_mse, poly_mse],
        'R²': [linear_r2, poly_r2],
    }
)

In [30]:
# Print the comparison table with values rounded to four decimals.
print(regression_comparison.round(4))

                   Model         MSE      R²
0      Linear Regression   9444.4850  0.8495
1  Polynomial Regression  11933.4511  0.8098


In [31]:
# First five true labels from the classification test split.
print(f"Actual classes: {y_test[:5]}")

Actual classes: [0 0 0 0 0]


In [32]:
# First five hard predictions from the logistic pipeline on the test split.
print(f"Predicted classes: {ypred_test[:5]}")

Predicted classes: [0 0 0 0 0]


In [33]:
# First five values from column 1 of the test-set predict_proba output.
print(f"Predicted probabilities for class 1: {ypred_testP[:5, 1]}")

Predicted probabilities for class 1: [0.0014161  0.0003633  0.00091741 0.00563375 0.00080894]


In [40]:

# Final summary of all three models.
#
# The metrics are taken from the values computed in the cells above instead of
# being re-typed as literals: hard-coded numbers overwrite the real results
# with rounded copies and silently go stale whenever the data, the split or a
# model changes.
#
# The logistic AUC is recomputed from the held-out probabilities because the
# `auc` variable was last assigned the *training* AUC by the train-set
# evaluation cell.
logistic_auc = roc_auc_score(y_test, ypred_testP[:, 1])

print("-" * 40)
print(f"Linear Regression    - MSE: {linear_mse:,.2f}, R²: {linear_r2:.4f}")
print(f"Polynomial Regression - MSE: {poly_mse:,.2f}, R²: {poly_r2:.4f}")
print(f"Logistic Regression  - AUC: {logistic_auc:.4f}")
print()

----------------------------------------
Linear Regression    - MSE: 9,444.49, R²: 0.8495
Polynomial Regression - MSE: 11,933.45, R²: 0.8098
Logistic Regression  - AUC: 0.9829

