In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np

from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn import linear_model
from sklearn.neural_network import MLPRegressor

from sklearn.metrics import mean_squared_error

In [3]:
train = pd.read_csv('/content/drive/MyDrive/Facultate/year2-sem2/data-vis/project/data/train.csv')
test = pd.read_csv('/content/drive/MyDrive/Facultate/year2-sem2/data-vis/project/data/test.csv')

In [4]:
categorical_cols = ['Address', 'Suburb', 'Type', 'Method', 'SellerG', 'CouncilArea', 'Regionname']

In [None]:
for col in categorical_cols:
  train[col] = pd.factorize(train[col])[0]
  test[col] = pd.factorize(test[col])[0]

train['Date'] = pd.to_datetime(train['Date'], infer_datetime_format=True)
test['Date']= pd.to_datetime(test['Date'], infer_datetime_format=True)

train['Date'] = train['Date'].apply(lambda x:x.toordinal())
test['Date']= test['Date'].apply(lambda x:x.toordinal())

In [6]:
models = [SVR(), LinearRegression(), DecisionTreeRegressor(), linear_model.Lasso(), MLPRegressor(max_iter=1000)]
evaluation_df = pd.DataFrame(columns=['feature_names'] + models)
evaluation_df

Unnamed: 0,feature_names,SVR(),LinearRegression(),DecisionTreeRegressor(),Lasso(),MLPRegressor(max_iter=1000)


In [7]:
def build_eval_row(X_train, X_test, y_train, y_test, models, row_label):
  scores = [row_label]
  for model in models:
    classifier = model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    scores.append(f'{mean_squared_error(y_test, y_pred, squared=False):.2f}')
  return pd.DataFrame([scores], columns=['feature_names'] + models)

In [8]:
X_train, X_test = np.array(train.loc[:, train.columns != 'Price']), np.array(test.loc[:, train.columns != 'Price'])
y_train, y_test = np.array(train.loc[:, train.columns == 'Price']), np.array(test.loc[:, train.columns == 'Price'])

In [9]:
eval_row_proccessed = build_eval_row(X_train, X_test, y_train, y_test, models, 'All features preprocessed')
eval_row_proccessed

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Unnamed: 0,feature_names,SVR(),LinearRegression(),DecisionTreeRegressor(),Lasso(),MLPRegressor(max_iter=1000)
0,All features preprocessed,650382.41,450503.91,458424.15,450503.5,560468.02


In [10]:
train_orig = pd.read_csv('/content/drive/MyDrive/Facultate/year2-sem2/data-vis/project/data/train_orig.csv')
test_orig = pd.read_csv('/content/drive/MyDrive/Facultate/year2-sem2/data-vis/project/data/test_orig.csv')

In [14]:
train_orig = train_orig.fillna(train_orig.mean()).fillna(train_orig.mode().iloc[0])
test_orig = test_orig.fillna(test_orig.mean()).fillna(test_orig.mode().iloc[0])

In [15]:
train_orig

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,0,0,2,0,855000.0,0,0,736969,3.4,3031.0,...,1.000000,1.709092,110.000000,79.000000,1910.000000,0,-37.793720,144.928460,0,5263.0
1,1,1,1,1,421000.0,1,1,736231,7.9,3079.0,...,1.000000,1.000000,0.000000,58.000000,1992.000000,1,-37.775300,145.041900,1,5549.0
2,2,2,2,0,800000.0,2,2,736098,2.6,3121.0,...,1.587345,1.709092,604.594338,158.810125,1966.588047,2,-37.807081,144.996236,0,14949.0
3,3,3,3,0,1725000.0,2,3,736581,15.2,3191.0,...,1.000000,2.000000,493.000000,117.000000,1915.000000,3,-37.949010,145.011670,2,4497.0
4,4,4,3,0,1345000.0,2,4,736545,6.5,3071.0,...,2.000000,4.000000,584.000000,158.810125,1966.588047,4,-37.755500,145.015200,0,8870.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21792,4,17664,3,0,1720000.0,1,1,736644,7.0,3071.0,...,1.000000,6.000000,664.000000,179.000000,1915.000000,4,-37.760200,145.005400,0,8870.0
21793,243,21467,3,0,295000.0,1,115,736476,31.7,3337.0,...,1.000000,4.000000,600.000000,122.860000,1975.000000,21,-37.687340,144.580010,7,3600.0
21794,1,21468,3,2,850000.0,1,31,736623,7.8,3079.0,...,1.587345,1.709092,604.594338,158.810125,1966.588047,1,-37.807081,144.996236,1,5549.0
21795,124,21469,3,0,2550000.0,2,6,736567,10.2,3147.0,...,2.000000,3.000000,1170.000000,202.000000,1950.000000,23,-37.872530,145.070500,2,3052.0


In [16]:
for col in categorical_cols:
  train_orig[col] = pd.factorize(train_orig[col])[0]
  test_orig[col] = pd.factorize(test_orig[col])[0]

train_orig['Date'] = pd.to_datetime(train_orig['Date'], infer_datetime_format=True)
test_orig['Date']= pd.to_datetime(test_orig['Date'], infer_datetime_format=True)

train_orig['Date'] = train_orig['Date'].apply(lambda x:x.toordinal())
test_orig['Date']= test_orig['Date'].apply(lambda x:x.toordinal())

In [17]:
X_train, X_test = np.array(train_orig.loc[:, train_orig.columns != 'Price']), np.array(test_orig.loc[:, test_orig.columns != 'Price'])
y_train, y_test = np.array(train_orig.loc[:, train_orig.columns == 'Price']), np.array(test_orig.loc[:, test_orig.columns == 'Price'])

In [18]:
eval_row_orig = build_eval_row(X_train, X_test, y_train, y_test, models, 'All features no processing')
eval_row_orig

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Unnamed: 0,feature_names,SVR(),LinearRegression(),DecisionTreeRegressor(),Lasso(),MLPRegressor(max_iter=1000)
0,All features no processing,650382.41,454490.48,453435.15,454490.68,546424.16


In [19]:
final_df = evaluation_df.append([
    eval_row_orig, 
    eval_row_proccessed
])
final_df

  final_df = evaluation_df.append([


Unnamed: 0,feature_names,SVR(),LinearRegression(),DecisionTreeRegressor(),Lasso(),MLPRegressor(max_iter=1000)
0,All features no processing,650382.41,454490.48,453435.15,454490.68,546424.16
0,All features preprocessed,650382.41,450503.91,458424.15,450503.5,560468.02
