In [89]:
import pandas as pd

c_df = pd.read_csv('./datasets/car_purchasing.csv')
c_df

Unnamed: 0,Customer Name,Customer e-mail,Country,Gender,Age,Annual Salary,Credit Card Debt,Net Worth,Car Purchase Amount
0,Martina Avila,cubilia.Curae.Phasellus@quisaccumsanconvallis.edu,USA,0,42,62812.09301,11609.380910,238961.2505,35321.45877
1,Harlan Barnes,eu.dolor@diam.co.uk,USA,0,41,66646.89292,9572.957136,530973.9078,45115.52566
2,Naomi Rodriquez,vulputate.mauris.sagittis@ametconsectetueradip...,USA,1,43,53798.55112,11160.355060,638467.1773,42925.70921
3,Jade Cunningham,malesuada@dignissim.com,USA,1,58,79370.03798,14426.164850,548599.0524,67422.36313
4,Cedric Leach,felis.ullamcorper.viverra@egetmollislectus.net,USA,1,57,59729.15130,5358.712177,560304.0671,55915.46248
...,...,...,...,...,...,...,...,...,...
495,Walter,ligula@Cumsociis.ca,USA,0,41,71942.40291,6995.902524,541670.1016,48901.44342
496,Vanna,Cum.sociis.natoque@Sedmolestie.edu,USA,1,38,56039.49793,12301.456790,360419.0988,31491.41457
497,Pearl,penatibus.et@massanonante.com,USA,1,54,68888.77805,10611.606860,764531.3203,64147.28888
498,Nell,Quisque.varius@arcuVivamussit.net,USA,1,59,49811.99062,14013.034510,337826.6382,45442.15353


In [90]:
# columns = ['Gender', 'Age']
columns = ['Gender', 'Age', 'Annual Salary','Net Worth', 'Car Purchase Amount']
pre_c_df = c_df[columns]
pre_c_df

Unnamed: 0,Gender,Age,Annual Salary,Net Worth,Car Purchase Amount
0,0,42,62812.09301,238961.2505,35321.45877
1,0,41,66646.89292,530973.9078,45115.52566
2,1,43,53798.55112,638467.1773,42925.70921
3,1,58,79370.03798,548599.0524,67422.36313
4,1,57,59729.15130,560304.0671,55915.46248
...,...,...,...,...,...
495,0,41,71942.40291,541670.1016,48901.44342
496,1,38,56039.49793,360419.0988,31491.41457
497,1,54,68888.77805,764531.3203,64147.28888
498,1,59,49811.99062,337826.6382,45442.15353


### Multivariate Linear Regression Task
- 구매자의 연수입(Annual Salary)과 순자산(자산 - 부채, Net Worth), 신용카드 부채(Credit Card Debt)를 통해 다변량 회귀 분석 진행

In [84]:
pre_c_df.corr()

Unnamed: 0,Gender,Age,Annual Salary,Credit Card Debt,Net Worth,Car Purchase Amount
Gender,1.0,-0.066488,-0.036499,0.024193,-0.008395,-0.066408
Age,-0.066488,1.0,0.000361,0.031748,0.021794,0.633273
Annual Salary,-0.036499,0.000361,1.0,0.049599,0.014767,0.617862
Credit Card Debt,0.024193,0.031748,0.049599,1.0,-0.049378,0.028882
Net Worth,-0.008395,0.021794,0.014767,-0.049378,1.0,0.48858
Car Purchase Amount,-0.066408,0.633273,0.617862,0.028882,0.48858,1.0


In [33]:
import torch
from torch.optim import SGD
from sklearn.model_selection import train_test_split

torch.manual_seed(124)

features, targets = pre_c_df.iloc[:, :-1], pre_c_df.iloc[:, -1]

X_train, X_test, y_train, y_test = \
train_test_split(features, targets, test_size=0.2, random_state=124)

X_train = torch.FloatTensor(X_train.values)
y_train = torch.FloatTensor(y_train.values).view(-1, 1)

X_test = torch.FloatTensor(X_test.values)
y_test = torch.FloatTensor(y_test.values).view(-1, 1)

W = torch.zeros((4, 1), requires_grad=True)
b = torch.zeros(1, requires_grad=True)

optimizer = SGD([W, b], lr=1e-12)

epochs = 10000

for epoch in range(1, epochs + 1):
    H = X_train.matmul(W) + b
    loss = torch.mean((y_train - H) ** 2)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch % 1000 == 0:
        print('{:4d}/{}: W1: {:.4f}, W2: {:.4f}, W3: {:.4f},W4: {:.4f},  b: {:.4f}, loss: {:.4f}'\
              .format(epoch, epochs, W[0].item(), W[1].item(), W[2].item(), W[3].item(), b.item(), loss.item()))

1000/10000: W1: 0.0000, W2: 0.0003, W3: 0.3705,W4: 0.0463,  b: 0.0000, loss: 58676008.0000
2000/10000: W1: 0.0000, W2: 0.0005, W3: 0.4733,W4: 0.0335,  b: 0.0000, loss: 46553344.0000
3000/10000: W1: 0.0000, W2: 0.0006, W3: 0.5027,W4: 0.0298,  b: 0.0000, loss: 45559940.0000
4000/10000: W1: 0.0000, W2: 0.0007, W3: 0.5112,W4: 0.0288,  b: 0.0000, loss: 45478532.0000
5000/10000: W1: 0.0000, W2: 0.0008, W3: 0.5136,W4: 0.0285,  b: 0.0000, loss: 45471852.0000
6000/10000: W1: 0.0000, W2: 0.0009, W3: 0.5143,W4: 0.0284,  b: 0.0000, loss: 45471292.0000
7000/10000: W1: 0.0000, W2: 0.0010, W3: 0.5145,W4: 0.0283,  b: 0.0000, loss: 45471236.0000
8000/10000: W1: 0.0000, W2: 0.0011, W3: 0.5145,W4: 0.0283,  b: 0.0000, loss: 45471220.0000
9000/10000: W1: 0.0000, W2: 0.0012, W3: 0.5145,W4: 0.0283,  b: 0.0000, loss: 45471220.0000
10000/10000: W1: -0.0000, W2: 0.0013, W3: 0.5145,W4: 0.0283,  b: 0.0000, loss: 45471212.0000


In [None]:
# H = 0.0525 * X_test1 + 0.2234 * X_test2 + 0.0228 * X_test3 +0.0228 * X_test3 + 0.0106
# loss = torch.mean((y_test - H) ** 2)
# print(loss.item())

In [62]:
from torch.nn import Module, Linear

class LinearRegressionModel(Module):
    def __init__(self):
        super().__init__()
        self.linear = Linear(1, 1)

    def forward(self, x):
        return self.linear(x)

In [63]:
import torch
from torch.nn.functional import mse_loss
from torch.optim import SGD
from sklearn.model_selection import train_test_split

torch.manual_seed(124)

features, targets = pre_c_df.iloc[:, :-1], pre_c_df.iloc[:, -1]

X_train, X_test, y_train, y_test = \
train_test_split(features, targets, test_size=0.2, random_state=124)

X_train = torch.FloatTensor(X_train.values)
y_train = torch.FloatTensor(y_train.values).view(-1, 1)

X_test = torch.FloatTensor(X_test.values)
y_test = torch.FloatTensor(y_test.values).view(-1, 1)

l_r = LinearRegressionModel()
optimizer = SGD(l_r.parameters(), lr=1e-10)

epochs = 100000

for epoch in range(1, epochs + 1):
    H = l_r(X_train)
    loss = mse_loss(y_train, H)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch % 10000 == 0:
        print('{:4d}/{}: '\
              .format(epoch, epochs, W[0].item(), W[1].item(), b.item(), loss.item()), end='')
        for i, w in enumerate(list(l_r.parameters())[0][0]):
            print('W{}: {:.4f}, '\
                  .format(i + 1, w.item()), end='')
        print('b: {:.4f}, loss: {:.4f}'.format(list(l_r.parameters())[1].item(), loss.item()))

10000/100000: W1: -0.6948, b: 0.2623, loss: 2193.6890
20000/100000: W1: -0.6948, b: 0.2623, loss: 2193.6890
30000/100000: W1: -0.6948, b: 0.2623, loss: 2193.6890
40000/100000: W1: -0.6948, b: 0.2623, loss: 2193.6890
50000/100000: W1: -0.6948, b: 0.2623, loss: 2193.6890
60000/100000: W1: -0.6948, b: 0.2623, loss: 2193.6890
70000/100000: W1: -0.6948, b: 0.2623, loss: 2193.6890
80000/100000: W1: -0.6948, b: 0.2623, loss: 2193.6890
90000/100000: W1: -0.6948, b: 0.2623, loss: 2193.6890
100000/100000: W1: -0.6948, b: 0.2623, loss: 2193.6890


In [91]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

features, targets = pre_c_df.iloc[:, :-1], pre_c_df.iloc[:, -1]

X_train, X_test, y_train, y_test = \
train_test_split(features, targets, test_size=0.2, random_state=124)


# for i, feature_name in enumerate(X_train.columns):
    # print('{} 피쳐의 계수: {:.4f}'.format(feature_name, l_r.coef_[i]))


# print(l_r.intercept_)

l_r = LinearRegression()
l_r.fit(X_train, y_train)
print('W: {:.4f}, b: {:.4f}'.format(l_r.coef_[0], l_r.intercept_))

W: 21.0428, b: -41953.7262


In [93]:
from sklearn.metrics import mean_squared_error

prediction = l_r.predict(X_test)
print('MSE loss: {:.4f}, RMSE loss: {:.4f}'\
      .format(mean_squared_error(y_test, prediction), 
      np.sqrt(mean_squared_error(y_test, prediction))))

MSE loss: 55372.6682, RMSE loss: 235.3140


In [83]:
# import matplotlib.pyplot as plt

# plt.rcParams['font.family'] ='Malgun Gothic'
# plt.rcParams['axes.unicode_minus'] = False

# plt.scatter(X_train, y_train)
# plt.plot(X_train, 0.0468 * X_train + 7.1863 , color="red")
# plt.grid(visible=True, linestyle='--')
# plt.show()