In [37]:
import os
import ray
import joblib
import numpy as np
import modin.pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pingouin as pg
import torch
import torch.nn as nn
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVR
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import mean_absolute_error
from ray.util.joblib import register_ray
from matplotlib_inline.backend_inline import set_matplotlib_formats

In [2]:
env = {
    'data_path': os.path.join(os.getcwd(), 'Banking.csv'),
}

In [3]:
def config() -> None:
    ray.init(runtime_env={'env_vars': {'__MODIN_AUTOIMPORT_PANDAS__': '1'}})
    register_ray()
    sns.set_style('ticks')
    sns.set_context('notebook', rc={
        'figure.dpi': 100,
    })
    set_matplotlib_formats('retina')


config()

2023-07-02 16:36:34,795	INFO worker.py:1627 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


In [19]:
def load_data(path: str) -> pd.DataFrame:
    return pd.read_csv(path)


df = load_data(env['data_path'])

In [20]:
df = load_data(env['data_path'])
df.drop('Loan_ID', axis=1, inplace=True)
df['Loan_Amount_Requested'] = df['Loan_Amount_Requested'].str.replace(',','').astype(int)
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)
df.reset_index(inplace=True)

In [40]:
# Create pipelines
def create_pipeline(features):
    num_selector = make_column_selector(dtype_include='number')
    cat_selector = make_column_selector(dtype_include='category')

    num_cols = num_selector(features)
    cat_cols = cat_selector(features)

    num_preprocessor = StandardScaler()
    cat_preprocessor = OneHotEncoder()

    preprocessor = ColumnTransformer([
        ('StandardScaler', num_preprocessor, num_cols),
        ('OneHotEncoder', cat_preprocessor, cat_cols)
    ])

    return make_pipeline(preprocessor, BaggingRegressor(LinearSVR(dual='auto')))
    

X = df.drop('Annual_Income', axis=1)
y = df['Annual_Income']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
pipe = create_pipeline(X)

In [41]:
with joblib.parallel_backend('ray'):
    pipe.fit(X_train, y_train)
# pipe.fit(X_train, y_train)

pred = pipe.predict(X_train)
print(f'Mean absolute error of training set: {mean_absolute_error(y_pred=pred, y_true=y_train)}')
pred = pipe.predict(X_test)
print(f'Mean absolute error of testing set: {mean_absolute_error(y_pred=pred, y_true=y_test)}')

Mean absolute error of training set: 42551.71688268458
Mean absolute error of testing set: 42331.023349399096


In [None]:
class LinearRegressionModel(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(LinearRegressionModel, self).__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        return self.linear(x)

class LinearRegressionTorch:
    def __init__(self, input_dim=1, output_dim=1, learning_rate=0.01, epochs=100):
        self.model = LinearRegressionModel(input_dim, output_dim)
        self.device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
        self.model = self.model.to(self.device)
        self.criterion = nn.MSELoss()
        self.learning_rate = learning_rate
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=learning_rate)
        self.epochs = epochs

    def fit(self, X_train, y_train):
        for epoch in range(self.epochs):
            epoch += 1

            inputs = torch.from_numpy(X_train).to(self.device)
            labels = torch.from_numpy(y_train).to(self.device)

            self.optimizer.zero_grad()

            outputs = self.model(inputs)

            loss = self.criterion(outputs, labels)
            loss.backward()

            self.optimizer.step()

    def predict(self, X):
        return self.model(torch.from_numpy(X).to(self.device)).data.numpy()


model = LinearRegressionTorch()
%time model.fit(X, y)