# Final Project

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression

In [4]:
data = pd.read_csv("/Users/kozhahmet/Desktop/202212-08-Real-Estate-Price-Prediction-main/datasets/data_cleaned.csv", index_col=0)

In [5]:
# Split X, y into test and training data.

X = data.drop(columns="price")           
y = data["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Split X, y.

X = data.drop(columns="price")           
y = data["price"]

# 1) Linear Regression

In [None]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
score_lr = r2_score(y_test, y_pred)

print(score_lr)

# 2) Decision Tree Regressor

In [None]:
#Basic situation using mse criterion(squared_error)
DT = DecisionTreeRegressor(criterion='squared_error')
DT.fit(X_train, y_train)
y_pred = DT.predict(X_test)
print("DecisionTreeRegressor R2 Score:", r2_score(y_test, y_pred))
print("DecisionTreeRegressor MSE:", mean_squared_error(y_test, y_pred))

In [None]:
#Finding the best max_depth for model
test = []
for i in range(20):
  DTR = DecisionTreeRegressor(criterion='squared_error', splitter='best', max_depth = i+1)
  DTR.fit(X_train, y_train)
  y_pred = DTR.predict(X_test)
  score = r2_score(y_test, y_pred)
  test.append(score)

plt.figure(figsize = (20,10), dpi = 80)
plt.plot(range(1,21), test)
plt.show()

In [None]:
#Finding the best min_samples_split for model
test = []
for i in range(10):
  DTR = DecisionTreeRegressor(criterion='squared_error', splitter='best', max_depth = 8, min_samples_split=i+2)
  DTR.fit(X_train, y_train)
  y_pred = DTR.predict(X_test)
  score = r2_score(y_test, y_pred)
  test.append(score)

plt.figure(figsize = (20,10), dpi = 80)
plt.plot(range(1,11), test)
plt.show()

In [None]:
# Finding the best min_samples_leaf for model

test = []
for i in range(5):
  DTR = DecisionTreeRegressor(criterion='squared_error', splitter='best', max_depth = 8, min_samples_split=3, min_samples_leaf=i+1)
  DTR.fit(X_train, y_train)
  y_pred = DTR.predict(X_test)
  score = r2_score(y_test, y_pred)
  test.append(score)

plt.figure(figsize = (20,10), dpi = 80)
plt.plot(range(1,6), test)
plt.show()

In [None]:
# Final Performance

DT = DecisionTreeRegressor(criterion='squared_error', splitter='best', max_depth = 8, min_samples_split=3, min_samples_leaf=4)
DT.fit(X_train, y_train)
y_pred = DT.predict(X_test)

print("DecisionTreeRegressor R2 Score:", r2_score(y_test, y_pred))
print("DecisionTreeRegressor MSE:", mean_squared_error(y_test, y_pred))

# 3) KNN

In [None]:
# Standardizing our features except target variable.

from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
stand_scale = data.drop(['price'],axis = 1)
col_trans = make_column_transformer(
            (StandardScaler(), stand_scale.columns),
            remainder = 'passthrough')

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

knn_scores = []
for k in range(1, 31):
    knn_classifier = KNeighborsClassifier(n_neighbors = k)
    pipe = make_pipeline(col_trans, knn_classifier)
    knn_scores.append(cross_val_score(pipe, X_train, y_train, cv=5, scoring='accuracy').mean())

In [None]:
plt.figure(figsize=(16,16))
plt.plot([k for k in range(1, 31)], knn_scores, color = 'red')
for i in range(1,31):
    plt.text(i, knn_scores[i-1], (i, round(knn_scores[i-1]*100,2)))
plt.xticks([i for i in range(1, 31)])
plt.xlabel('Number of Neighbors (K)')
plt.ylabel('Scores')
plt.title('K Neighbors Classifier scores for different K values')

In [None]:
from sklearn.metrics import accuracy_score
from sklearn import metrics

KN_model = KNeighborsClassifier(n_neighbors = 1)
KN_model = make_pipeline(col_trans, KN_model)
KN_model.fit(X_train, y_train)

y_pred = KN_model.predict(X_test)

print('Accuracy score: {:.4f}%.'.format(metrics.accuracy_score(y_test,y_pred)*100))

# Random Forest Req

In [None]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()

rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
print('R2: {:.2f}%.'.format(r2_score(y_test, y_pred)*100))

# XGB Regressor

In [None]:
!pip3 install xgboost

In [None]:
from xgboost import XGBRegressor

In [None]:
XG = XGBRegressor()
XG.fit(X_train, y_train)
y_pred = XG.predict(X_test)

In [None]:
#Finding the best n_estimators for model
test = []
for i in range(10):
  XGBR = XGBRegressor(n_estimators = i*25 + 1600)
  XGBR.fit(X_train, y_train)
  y_pred = XGBR.predict(X_test)
  score = r2_score(y_test, y_pred)
  test.append(score)

plt.figure(figsize = (20,10), dpi = 80)
plt.plot(range(1,11), test)
plt.show()

In [None]:
#Finding the best max_depth for model
test = []
for i in range(10):
  XGBR = XGBRegressor(n_estimators = 1625, max_depth = 3 + i)
  XGBR.fit(X_train, y_train)
  y_pred = XGBR.predict(X_test)
  score = r2_score(y_test, y_pred)
  test.append(score)

plt.figure(figsize = (20,10), dpi = 80)
plt.plot(range(1,11), test)
plt.show()

In [None]:
#Finding the best min_child_weight for model
test = []
for i in range(5):
  XGBR = XGBRegressor(n_estimators = 1625, max_depth = 5, min_child_weight = 4, learning_rate = 0.01*i + 0.01)
  XGBR.fit(X_train, y_train)
  y_pred = XGBR.predict(X_test)
  score = r2_score(y_test, y_pred)
  test.append(score)

plt.figure(figsize = (20,10), dpi = 80)
plt.plot(range(1,6), test)
plt.show()

In [None]:
XG = XGBRegressor(n_estimators = 1625, max_depth = 5, min_child_weight = 4, learning_rate = 0.03)
XG.fit(X_train, y_train)
y_pred = XG.predict(X_test)
print("XGBRegressor score:", XG.score(X_test, y_test))
print("XGBRegressor MSE:", mean_squared_error(y_test, y_pred))

In [None]:
import torch
import torchvision.transforms as transforms

transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])


trainset = torch.utils.data.TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
trainloader = torch.utils.data.DataLoader(trainset, batch_size=128, shuffle=True, num_workers=2)

testset = torch.utils.data.TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))
testloader = torch.utils.data.DataLoader(testset, batch_size=128, shuffle=False, num_workers=2)

In [None]:
import torchvision.transforms


trainloader = torch.utils.data.DataLoader(trainset, batch_size=64,
                                          shuffle=True, num_workers=2)

testloader = torch.utils.data.DataLoader(testset, batch_size=64,
                                         shuffle=False, num_workers=2)

In [None]:
for epoch in range(40):

    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        inputs, labels = data

        optimizer.zero_grad()

        outputs = net(inputs)
        loss = criterion(outputs, labels.long())
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 50 == 49:
            print('Epoch -', epoch + 1, 'and', i + 1, '=> loss: %.3f' % (running_loss / 50))
            running_loss = 0.0

In [None]:
correct = 0
total = 0
with torch.no_grad():
    for data in testloader:
        images, labels = data
        outputs = net(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print('Accuracy is %d %%' % (100 * correct / total))

In [30]:
import torch, torch.nn as nn

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.bn1 = nn.BatchNorm2d(6)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.bn2 = nn.BatchNorm2d(16)
        self.fc1 = nn.Linear(16 * 4 * 4, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)
        self.conv1 = nn.Conv2d(1, 6, kernel_size=(3, 3))

    def forward(self, x):
        x = x.float()  # convert input tensor to float
        x = self.conv1(x)
        x = self.bn1(x) 
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.conv2(x)
        x = self.bn2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        x = self.fc3(x)
        return x


net = Net()
optimizer = torch.optim.SGD(net.parameters(), lr=0.01, momentum=0.9, weight_decay=1e-4)
criterion = nn.CrossEntropyLoss()

In [11]:
import torch
import torchvision.transforms as transforms

transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])


# trainset = torch.utils.data.TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
trainset = torch.utils.data.TensorDataset(torch.from_numpy(X_train.values), torch.from_numpy(y_train.values))

trainloader = torch.utils.data.DataLoader(trainset, batch_size=128, shuffle=True, num_workers=2)

# testset = torch.utils.data.TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))
testset = torch.utils.data.TensorDataset(torch.from_numpy(X_test.values), torch.from_numpy(y_test.values))
testloader = torch.utils.data.DataLoader(testset, batch_size=128, shuffle=False, num_workers=2)

In [12]:
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])


In [13]:
import torchvision.transforms


trainloader = torch.utils.data.DataLoader(trainset, batch_size=64,
                                          shuffle=True, num_workers=2)

testloader = torch.utils.data.DataLoader(testset, batch_size=64,
                                         shuffle=False, num_workers=2)

In [39]:
import torch.nn.functional as F

for epoch in range(10):

    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        inputs, labels = data

        optimizer.zero_grad()
        
        inputs = inputs.unsqueeze(2).unsqueeze(3)
        inputs = inputs.permute(0, 3, 2, 1)
        transform = torchvision.transforms.Resize((28, 28))
        inputs = transform(inputs)

        outputs = net(inputs)
        
#         loss = criterion(outputs, labels.long())
#         loss.backward()
#         optimizer.step()

#         # print statistics
#         running_loss += loss.item()
#         if i % 50 == 49:
#             print('Epoch -', epoch + 1, 'and', i + 1, '=> loss: %.3f' % (running_loss / 50))
#             running_loss = 0.0

In [42]:
import torch.nn.functional as F

for epoch in range(10):

    running_loss = 0.0
    total = 0
    correct = 0
    
    for i, data in enumerate(trainloader, 0):
        inputs, labels = data

        optimizer.zero_grad()
        
        if inputs.dim() == 4:
            # Reshape the tensor to add an extra dimension for the batch size
            inputs = inputs.unsqueeze(0)

        transform = torchvision.transforms.Resize((28, 28))
        inputs = transform(inputs)

        # Remove the unnecessary calls to unsqueeze() and permute()
        outputs = net(inputs)
        
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if i % 2000 == 1999:
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 2000))
            running_loss = 0.0

    accuracy = 100 * correct / total
    print('Epoch %d accuracy: %d %%' % (epoch + 1, accuracy))


ValueError: Input and output must have the same number of spatial dimensions, but got input with spatial dimensions of [18] and output size of [28, 28]. Please provide input tensor in (N, C, d1, d2, ...,dK) format and output size in (o1, o2, ...,oK) format.

In [43]:
correct = 0
total = 0
with torch.no_grad():
    for data in testloader:
        images, labels = data
        inputs = inputs.unsqueeze(2).unsqueeze(3)
        inputs = inputs.permute(0, 3, 2, 1)
        transform = torchvision.transforms.Resize((28, 28))
        inputs = transform(inputs)
        outputs = net(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print('Accuracy is %d %%' % (100 * correct / total))


RuntimeError: permute(sparse_coo): number of dimensions in the tensor input does not match the length of the desired ordering of dimensions i.e. input.dim() = 6 is not equal to len(dims) = 4