# Getting the weights for the regressor

In [1]:
import pandas as pd
import numpy as np
import json
# import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

### Importing data

In [2]:
# Load the apartment listings. JSON text is UTF-8 by specification, so the
# encoding is pinned instead of relying on the platform's default codec.
with open('apartments.json', encoding='utf-8') as json_file:
    apartments = json.load(json_file)

# Sanity check: show which Python type the top-level JSON value parsed to.
# Done after the file is closed; the handle is no longer needed here.
print("Type:", type(apartments))

Type: <class 'list'>


### Making the DataFrame

In [3]:
# Project every listing down to the six fields used in this notebook.
# Indexing (rather than .get) is kept on purpose: a record missing any of
# these keys raises KeyError instead of silently producing a gap.
apts = [
    {field: record[field]
     for field in ("lat", "long", "date", "sq_mt", "rooms", "price")}
    for record in apartments
]

In [4]:
# One row per listing, one column per selected field.
apt_df = pd.DataFrame(data=apts)

In [5]:
# Display the frame to eyeball the parsed columns and their values.
apt_df

Unnamed: 0,lat,long,date,sq_mt,rooms,price
0,44.770200,20.419700,29.08.2022.,134.00,4.5,339000
1,44.798600,20.472300,29.08.2022.,60.00,2.0,169000
2,44.793400,20.492800,29.08.2022.,110.00,3.5,239000
3,44.843383,20.484003,29.08.2022.,49.00,2.5,67800
4,44.755283,20.453176,29.08.2022.,40.00,1.5,73000
...,...,...,...,...,...,...
469,44.776801,20.532719,01.08.2022.,51.82,3.0,101049
470,44.805053,20.474391,01.08.2022.,66.00,3.0,208000
471,44.792964,20.492485,31.07.2022.,42.00,1.0,130000
472,44.795135,20.437267,31.07.2022.,110.00,5.0,140000


In [6]:
# plt.plot(apt_df.long, apt_df.lat, ".")

### Preparing data for ML

In [7]:
# Feature matrix: every column except the target ("price") and the listing
# "date", which is not used as a predictor here.
X = apt_df.drop(columns=["price", "date"]).to_numpy()
# Target vector: the asking price of each listing.
y = apt_df["price"].to_numpy()

#### Linear regressor

In [8]:
from sklearn.linear_model import LinearRegression

# Fixed seed so the train/test split (and therefore the scores) is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Ordinary least-squares baseline on the raw, unscaled features.
linreg = LinearRegression()
linreg.fit(X_train, y_train)

# Report the fitted parameters and R-squared on both splits.
print('linear model coeff (w): {}'.format(linreg.coef_))
print('linear model intercept (b): {:.3f}'.format(linreg.intercept_))
print('R-squared score (training): {:.3f}'.format(linreg.score(X_train, y_train)))
print('R-squared score (test): {:.3f}'.format(linreg.score(X_test, y_test)))

linear model coeff (w): [601494.25997073 138160.25398513   5720.21981834 -77965.5288013 ]
linear model intercept (b): -29791316.084
R-squared score (training): 0.737
R-squared score (test): 0.472


#### Linear regressor with ridge and data scaling

In [9]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Ridge

# Same seed as the plain linear regression cell, so both models are scored
# on the same train/test split.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                   random_state = 0)

# Fit the scaler on the training rows only and reuse it for the test rows,
# so no test-set statistics enter the preprocessing.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

linridge = Ridge(alpha=20.0).fit(X_train_scaled, y_train)

# The header names the dataset actually modelled in this notebook
# (apartment listings); it previously read 'Crime dataset'.
print('Apartments dataset')
print('ridge regression linear model intercept: {}'
     .format(linridge.intercept_))
print('ridge regression linear model coeff:\n{}'
     .format(linridge.coef_))
print('R-squared score (training): {:.3f}'
     .format(linridge.score(X_train_scaled, y_train)))
print('R-squared score (test): {:.3f}'
     .format(linridge.score(X_test_scaled, y_test)))
# Counts coefficients that are not exactly zero.
print('Number of non-zero features: {}'
     .format(np.sum(linridge.coef_ != 0)))

Crime dataset
ridge regression linear model intercept: 63749.612928978866
ridge regression linear model coeff:
[ 29926.31640935  -9006.59321678 254823.17930099 218096.23811708]
R-squared score (training): 0.293
R-squared score (test): 0.344
Number of non-zero features: 4


#### Finding the best alpha parameter

In [10]:
print('Ridge regression: effect of alpha regularization parameter\n')

# Refit the ridge model for each candidate alpha and report R-squared on both
# splits, plus how many coefficients exceed 1.0 in absolute value.
# NOTE(review): the candidates are compared on the test split, so the test
# score of the alpha picked from this table is an optimistic estimate.
for this_alpha in [0, 1, 10, 20, 50, 100, 1000]:
    linridge = Ridge(alpha=this_alpha)
    linridge.fit(X_train_scaled, y_train)
    r2_train = linridge.score(X_train_scaled, y_train)
    r2_test = linridge.score(X_test_scaled, y_test)
    num_coeff_bigger = np.sum(np.abs(linridge.coef_) > 1.0)
    print(('Alpha = {:.2f}\nnum abs(coeff) > 1.0: {}, '
           'r-squared training: {:.2f}, r-squared test: {:.2f}\n')
          .format(this_alpha, num_coeff_bigger, r2_train, r2_test))

# `linridge` now holds the last model fitted in the loop (alpha = 1000), so
# these are that model's coefficients only.
linridge.coef_

Ridge regression: effect of alpha regularization parameter

Alpha = 0.00
num abs(coeff) > 1.0: 4, r-squared training: 0.74, r-squared test: 0.47

Alpha = 1.00
num abs(coeff) > 1.0: 4, r-squared training: 0.66, r-squared test: 0.56

Alpha = 10.00
num abs(coeff) > 1.0: 4, r-squared training: 0.38, r-squared test: 0.41

Alpha = 20.00
num abs(coeff) > 1.0: 4, r-squared training: 0.29, r-squared test: 0.34

Alpha = 50.00
num abs(coeff) > 1.0: 4, r-squared training: 0.19, r-squared test: 0.23

Alpha = 100.00
num abs(coeff) > 1.0: 4, r-squared training: 0.12, r-squared test: 0.14

Alpha = 1000.00
num abs(coeff) > 1.0: 4, r-squared training: 0.01, r-squared test: 0.00



array([  689.58641122,  -202.49696275,  7421.00072051, 10360.88115458])

#### Using the best alpha parameter

In [11]:
# Refit with alpha = 1 and keep the test-set predictions for inspection below.
linridge = Ridge(alpha=1)
linridge.fit(X_train_scaled, y_train)
predictions = linridge.predict(X_test_scaled)

# R-squared of the refitted model on both splits.
r2_train = linridge.score(X_train_scaled, y_train)
r2_test = linridge.score(X_test_scaled, y_test)

In [12]:
# Start from an empty frame; columns are attached in the following cell.
new_df = pd.DataFrame()

In [13]:
# Put predicted and actual prices side by side, one row per test sample.
# assign() returns a new frame, so re-running this cell gives the same result.
new_df = new_df.assign(predictions=predictions, y_test=y_test)

In [14]:
# Compare predictions against the true prices (shows at most the first 50 rows).
new_df.head(50)

Unnamed: 0,predictions,y_test
0,301565.913992,249000
1,254082.855715,175000
2,125426.90527,250000
3,39868.694094,88000
4,219266.662621,305000
5,98507.335149,45000
6,88075.338908,125000
7,319641.890009,350104
8,150689.416302,149000
9,113847.432341,120000


In [15]:
# Coefficients of the alpha = 1 ridge model, one per scaled feature.
# (A bare `X_train_scaled` expression used to precede this line; only the last
# expression of a cell is displayed, so it had no effect and was dropped.)
linridge.coef_

array([ 189080.88062186,   11720.96837901, 1605422.31442403,
         -9944.09381746])

In [16]:
# Intercept (bias term) of the fitted ridge model.
linridge.intercept_

-190078.00275754

In [17]:
# Scaled feature vector of the first test sample, used in the manual check below.
X_test_scaled[0]

array([0.68089477, 0.53179326, 0.2262931 , 0.66666667])

In [18]:
# True price of the first test sample, for comparison with the prediction.
y_test[0]

249000

In [19]:
# Recompute the first test prediction by hand: multiply each scaled feature
# of test row 0 by its coefficient and collect the per-feature products.
sum1 = []
# zip pairs every feature value with its weight, so the check follows the
# model's actual number of features instead of a hard-coded count of 4.
for feature_value, weight in zip(X_test_scaled[0], linridge.coef_):

    print(feature_value)
    print(weight)
    print("==================")
    mul_res = feature_value * weight
    print(mul_res)
    print("******************")
    sum1.append(mul_res)
print(sum1)

0.6808947700063044
189080.88062185762
128744.18272360924
******************
0.5317932637487885
11720.968379006605
6233.13202856827
******************
0.22629310344827586
1605422.3144240326
363295.99787612807
******************
0.6666666666666665
-9944.093817455787
-6629.395878303856
******************
[128744.18272360924, 6233.13202856827, 363295.99787612807, -6629.395878303856]


In [21]:
# Sum of the per-feature products plus the intercept: the hand-computed
# prediction for test row 0, to compare with predictions[0].
sum(sum1)+linridge.intercept_

301565.91399246175

### Exporting the coeffs and intercept

In [22]:
# Container for the exported model parameters; starts with the intercept.
weights = {"intercept": linridge.intercept_}

In [24]:
# ndarray.tolist() converts the coefficients to native Python floats, which is
# the form json.dump expects; list() would keep NumPy scalar elements.
# NOTE(review): the model was fitted on MinMax-scaled features and the scaler's
# parameters are not exported here - confirm the consumer of weights.json
# applies the same scaling before using these coefficients.
weights["coefficients"] = linridge.coef_.tolist()

In [26]:
# Persist the intercept and coefficients as JSON; 'w' overwrites any
# existing weights.json.
with open('weights.json', mode='w') as weights_file:
    json.dump(weights, fp=weights_file)