# Linear regression using pytorch
# Agenda
1. ANN with pytorch
2. feature engineering
3. categorical features -- embedding layers, continous variables 
4. pythonic class to create feed forward neural network - linear regression

In [1]:
import pandas as pd

In [16]:
data=pd.read_csv('datasets/houseprice.csv', usecols=['SalePrice','MSSubClass','MSZoning','LotFrontage','LotArea','Street','YearBuilt','LotShape','1stFlrSF','2ndFlrSF']).dropna()

In [18]:
data.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,YearBuilt,1stFlrSF,2ndFlrSF,SalePrice
0,60,RL,65.0,8450,Pave,Reg,2003,856,854,208500
1,20,RL,80.0,9600,Pave,Reg,1976,1262,0,181500
2,60,RL,68.0,11250,Pave,IR1,2001,920,866,223500
3,70,RL,60.0,9550,Pave,IR1,1915,961,756,140000
4,60,RL,84.0,14260,Pave,IR1,2000,1145,1053,250000


In [14]:
data.shape

(1201, 9)

In [20]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1201 entries, 0 to 1459
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MSSubClass   1201 non-null   int64  
 1   MSZoning     1201 non-null   object 
 2   LotFrontage  1201 non-null   float64
 3   LotArea      1201 non-null   int64  
 4   Street       1201 non-null   object 
 5   LotShape     1201 non-null   object 
 6   YearBuilt    1201 non-null   int64  
 7   1stFlrSF     1201 non-null   int64  
 8   2ndFlrSF     1201 non-null   int64  
 9   SalePrice    1201 non-null   int64  
dtypes: float64(1), int64(6), object(3)
memory usage: 103.2+ KB


# pytorch for tabular dataset
1. categorical features -- embedding Layers 
2. Continous features 

In [27]:
for i in data.columns:
    print(f'column name {i} and unique value are {len(data[i].unique())}')

column name MSSubClass and unique value are 15
column name MSZoning and unique value are 5
column name LotFrontage and unique value are 110
column name LotArea and unique value are 869
column name Street and unique value are 2
column name LotShape and unique value are 4
column name YearBuilt and unique value are 112
column name 1stFlrSF and unique value are 678
column name 2ndFlrSF and unique value are 368
column name SalePrice and unique value are 597


### identify the categorical features from above

In [34]:
import datetime

In [36]:
datetime.datetime.now().year

2024

In [42]:
data['YearBuilt']

0       2003
1       1976
2       2001
3       1915
4       2000
        ... 
1455    1999
1456    1978
1457    1941
1458    1950
1459    1965
Name: YearBuilt, Length: 1201, dtype: int64

In [44]:
data['TotalYears']=datetime.datetime.now().year-data['YearBuilt']

In [46]:
data.drop('YearBuilt',axis=1,inplace=True)

In [52]:
data.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'LotShape', '1stFlrSF', '2ndFlrSF', 'SalePrice', 'TotalYears'],
      dtype='object')

In [85]:
categorical_features=['MSSubClass','MSZoning','Street','LotShape']
out_feature='SalePrice'

## Categorical Features -- Embedding Layers
1. Label Encoding 
2. take all categorical features ---> numpy->torch-> tensor
3. lets take all the continuous values
4. embedding layer -- categorical features

In [87]:
from sklearn.preprocessing import LabelEncoder
lbl_encoder={}
lbl_encoder['MSSubClass']=LabelEncoder()
lbl_encoder['MSSubClass'].fit_transform(data['MSSubClass'])

array([5, 0, 5, ..., 6, 0, 0], dtype=int64)

In [58]:
data['MSSubClass'].unique()

array([ 60,  20,  70,  50, 190,  45,  90, 120,  30,  80, 160,  75, 180,
        40,  85], dtype=int64)

In [70]:
from sklearn.preprocessing import LabelEncoder
lbl_encoder={}
for feature in categorical_features:
    lbl_encoder[feature]=LabelEncoder()
    data[feature]=lbl_encoder[feature].fit_transform(data[feature])

In [72]:
data

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,1stFlrSF,2ndFlrSF,SalePrice,TotalYears
0,5,3,65.0,8450,1,3,856,854,208500,21
1,0,3,80.0,9600,1,3,1262,0,181500,48
2,5,3,68.0,11250,1,0,920,866,223500,23
3,6,3,60.0,9550,1,0,961,756,140000,109
4,5,3,84.0,14260,1,0,1145,1053,250000,24
...,...,...,...,...,...,...,...,...,...,...
1455,5,3,62.0,7917,1,3,953,694,175000,25
1456,0,3,85.0,13175,1,3,2073,0,210000,46
1457,6,3,66.0,9042,1,3,1188,1152,266500,83
1458,0,3,68.0,9717,1,3,1078,0,142125,74


In [76]:
import numpy as np
cat_features=np.stack([data['MSSubClass'],data['MSZoning'], data['Street'], data['LotShape']],1)
cat_features

array([[5, 3, 1, 3],
       [0, 3, 1, 3],
       [5, 3, 1, 0],
       ...,
       [6, 3, 1, 3],
       [0, 3, 1, 3],
       [0, 3, 1, 3]], dtype=int64)

In [78]:
import torch
cat_features=torch.tensor(cat_features, dtype=torch.int64)
cat_features 

tensor([[5, 3, 1, 3],
        [0, 3, 1, 3],
        [5, 3, 1, 0],
        ...,
        [6, 3, 1, 3],
        [0, 3, 1, 3],
        [0, 3, 1, 3]])

## Categorical Features -- Embedding Layers
1. Label Encoding 
2. take all categorical features ---> numpy->torch-> tensor
3. lets take all the continuous values
4. continous variable --> numpy --> torch--> tensors
5. embedding layer 

In [93]:
continous_features=[]
for i in data.columns:
    if i in ['MSSubClass','MSZoning','Street','LotShape','SalePrice']:
        pass
    else:
        continous_features.append(i)

In [95]:
continous_features

['LotFrontage', 'LotArea', '1stFlrSF', '2ndFlrSF', 'TotalYears']

In [97]:
### stacking continous variables to tensor

In [108]:
cont_values=np.stack([data[i].values for i in continous_features],axis=1)
cont_values=torch.tensor(cont_values,dtype=torch.float)
cont_values
# data[i].values - will give values in the form of numpy arrays

tensor([[   65.,  8450.,   856.,   854.,    21.],
        [   80.,  9600.,  1262.,     0.,    48.],
        [   68., 11250.,   920.,   866.,    23.],
        ...,
        [   66.,  9042.,  1188.,  1152.,    83.],
        [   68.,  9717.,  1078.,     0.,    74.],
        [   75.,  9937.,  1256.,     0.,    59.]])

In [114]:
y=torch.tensor(data['SalePrice'].values, dtype=torch.float).reshape(-1,1)
y

tensor([[208500.],
        [181500.],
        [223500.],
        ...,
        [266500.],
        [142125.],
        [147500.]])

In [116]:
data.info

<bound method DataFrame.info of       MSSubClass  MSZoning  LotFrontage  LotArea  Street  LotShape  1stFlrSF  \
0              5         3         65.0     8450       1         3       856   
1              0         3         80.0     9600       1         3      1262   
2              5         3         68.0    11250       1         0       920   
3              6         3         60.0     9550       1         0       961   
4              5         3         84.0    14260       1         0      1145   
...          ...       ...          ...      ...     ...       ...       ...   
1455           5         3         62.0     7917       1         3       953   
1456           0         3         85.0    13175       1         3      2073   
1457           6         3         66.0     9042       1         3      1188   
1458           0         3         68.0     9717       1         3      1078   
1459           0         3         75.0     9937       1         3      1256   

      2

## compare the number of columns from the original dataset

In [126]:
cat_features.shape, cont_values.shape, y.shape

(torch.Size([1201, 4]), torch.Size([1201, 5]), torch.Size([1201, 1]))

In [124]:
data.shape

(1201, 10)

## embedding technique

In [140]:
len(data['MSZoning'].unique())

5

In [142]:
cat_dims=[len(data[col].unique()) for col in ['MSSubClass','MSZoning','Street','LotShape']]
cat_dims

[15, 5, 2, 4]

In [None]:
### thumb - rule 
### - output dimension should be setbased on the input dimension (min(50,features dimension/2))
embedding_dim=[(x, min(50,(x+1)//2)) for x in cat_dims]