In [1]:
import pandas as pd

In [2]:
# Read only the first line of the input file. It carries two space-separated
# values, labelled here as the feature count and the record count.
info = pd.read_csv(
    'predicting-office-space-price-testcases/input/input03.txt',
    sep=' ',
    names=['features', 'records'],
    nrows=1,
)

In [3]:
# Show the parsed header row (columns: 'features' and 'records').
info

Unnamed: 0,features,records
0,2,100


In [4]:
# Number of feature columns, taken from the single header row.
F = info.loc[0, 'features']
F

2

In [5]:
# Number of training records, taken from the single header row.
N = info.loc[0, 'records']
N

100

In [6]:
# Load the N training rows that follow the header line. The rows carry no
# column names, so pandas labels the columns 0, 1, 2, ...
train = pd.read_csv(
    'predicting-office-space-price-testcases/input/input03.txt',
    sep=' ',
    header=None,
    skiprows=1,
    nrows=N,
)

In [7]:
# Display the training frame: the first F columns are features, column F is the target.
train

Unnamed: 0,0,1,2
0,0.44,0.68,511.14
1,0.99,0.23,717.10
2,0.84,0.29,607.91
3,0.28,0.45,270.40
4,0.07,0.83,289.88
...,...,...,...
95,0.99,0.13,636.22
96,0.28,0.46,272.12
97,0.87,0.36,696.65
98,0.23,0.87,434.53


In [8]:
# Training features: the first F columns of the training frame.
X_train = train.iloc[:, :F]
X_train

Unnamed: 0,0,1
0,0.44,0.68
1,0.99,0.23
2,0.84,0.29
3,0.28,0.45
4,0.07,0.83
...,...,...
95,0.99,0.13
96,0.28,0.46
97,0.87,0.36
98,0.23,0.87


In [9]:
# Training target: the column at position F, i.e. the one right after the features.
y_train = train[train.columns[F]]
y_train

0     511.14
1     717.10
2     607.91
3     270.40
4     289.88
       ...  
95    636.22
96    272.12
97    696.65
98    434.53
99    593.86
Name: 2, Length: 100, dtype: float64

In [10]:
# Skip everything that precedes the test rows: the header line, the N training
# rows, and the line holding the number of test cases. Whatever remains in the
# file is read as the test feature matrix.
X_test = pd.read_csv(
    'predicting-office-space-price-testcases/input/input03.txt',
    sep=' ',
    header=None,
    skiprows=N + 2,
)

X_test

Unnamed: 0,0,1
0,0.05,0.54
1,0.91,0.91
2,0.31,0.76
3,0.51,0.31


In [11]:
# Expected target values for the test cases come from the matching output
# file; it has no header row, so the single column is labelled 0.
y_test = pd.read_csv(
    'predicting-office-space-price-testcases/output/output03.txt',
    sep=' ',
    header=None,
)

y_test

Unnamed: 0,0
0,180.38
1,1312.07
2,440.13
3,343.72


# Machine learning

The problem statement says:

> The prices per square foot are (approximately) a polynomial function of the features in the observation table.
> This polynomial always has an order less than 4.

### SVR with polynomial kernel and degree 3 (default).

In [12]:
from sklearn.svm import SVR

In [13]:
# Support vector regressor with a cubic polynomial kernel; every other
# hyper-parameter keeps its library default.
# NOTE(review): with the default coef0=0.0 the polynomial kernel has no
# lower-order terms — confirm this is intended given the low scores below.
svr_poly = SVR(
    degree=3,
    kernel='poly',
)

In [14]:
# Fit on the training features exactly as sliced from the file (no scaling step);
# fit() returns the estimator, so its repr becomes the cell output.
svr_poly.fit(X_train, y_train)



SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma='auto_deprecated', kernel='poly', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False)

In [15]:
# Coefficient of determination (R^2) of the polynomial-kernel SVR on the training data.
svr_poly.score(X_train, y_train)

0.0064965312257381225

In [16]:
# R^2 of the polynomial-kernel SVR on the test cases (y_test is a one-column DataFrame).
svr_poly.score(X_test, y_test)

0.006145508978183112

### SVR with rbf kernel (`degree=3` is the default, but it only affects the `poly` kernel)

In [17]:
# RBF-kernel baseline. `degree` is passed for symmetry with the polynomial
# model, but scikit-learn ignores it for every kernel other than 'poly'.
svr_rbf = SVR(kernel='rbf', degree=3).fit(X_train, y_train)
svr_rbf



SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False)

In [18]:
# R^2 of the RBF-kernel SVR on the training data.
svr_rbf.score(X_train, y_train)

0.019990861546749783

In [19]:
# R^2 of the RBF-kernel SVR on the test cases.
svr_rbf.score(X_test, y_test)

0.013445417282863968

### Linear regression with polynomial features

In [20]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# Expand the training features into polynomial terms up to degree 3, then fit
# an ordinary least-squares model on the expanded matrix.
poly_reg = PolynomialFeatures(degree=3)
X_poly = poly_reg.fit(X_train).transform(X_train)
lin_reg = LinearRegression().fit(X_poly, y_train)
lin_reg

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [21]:
# R^2 of the polynomial-feature linear regression on the training data.
lin_reg.score(X_poly, y_train)

0.9998382728466821

In [22]:
# Evaluate the fitted model on the test cases.
#
# The test features must pass through the transformer that was fitted on the
# training data, so call transform() rather than fit_transform(): a
# preprocessing step should never be refitted on test data, and transform()
# also raises if the test frame's column count differs from the training one.
X_poly_test = poly_reg.transform(X_test)

# R^2 of the predictions against the expected outputs.
lin_reg.score(X_poly_test, y_test)


0.999999999926814