In [None]:
# data analysis and wrangling
import pandas as pd
import numpy as np

# visualization
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.utils import shuffle
import tensorflow as tf

In [None]:
# Load the per-city modelling table; the CSV file is named after the city.
city_name = '厦门'
raw_data = pd.read_csv(f'Data/model_data/{city_name}.csv')

# Positional hold-out split: the leading 85% of the rows are used for
# fitting and the trailing 15% for evaluation. Rows are not shuffled.
split = int(len(raw_data) * 0.85)
train = raw_data.iloc[:split]
test = raw_data.iloc[split:]

# "modifiedUnitPrice" is the regression target; every other column is a feature.
X_train = train.drop(columns="modifiedUnitPrice")
Y_train = train["modifiedUnitPrice"]

X_test = test.drop(columns="modifiedUnitPrice")
Y_actual = test["modifiedUnitPrice"]

In [None]:
# Degree-2 polynomial regression: expand the training features into
# polynomial terms, then fit ordinary least squares on the expansion.
# (Fitting LinearRegression directly on X_train is the plain linear
# alternative that was tried earlier.)
poly_reg = PolynomialFeatures(degree=2)
X_ploy = poly_reg.fit_transform(X_train)

linreg = LinearRegression()
linreg.fit(X_ploy, Y_train)
# fit() returns the estimator itself, so `model` and `linreg` are one object.
model = linreg

# Show the learned coefficient of every polynomial term.
print(model.coef_)

In [None]:
# Sample 100 random hold-out records (with replacement) and collect the
# model's prediction next to the true price for each of them.
#
# The whole test set is predicted ONCE up front. The polynomial expansion
# uses transform(), not fit_transform(): the transformer was already fitted
# on the training features and must not be re-fitted on test data.
test_predictions = linreg.predict(poly_reg.transform(X_test))
test_labels = Y_actual.tolist()

predict_price = []
actual_price = []

for draw in range(100):
    # Random row position inside the hold-out set.
    n = np.random.randint(len(test))
    Y_pred = test_predictions[n]
    predict_price.append(Y_pred)
    Y_label = test_labels[n]
    actual_price.append(Y_label)


In [None]:
# Overlay predicted (blue) and actual (red) prices of the sampled records;
# the x axis is simply the position of the sample in the lists.
fig, ax = plt.subplots(figsize=(15, 12))
ax.plot(range(len(predict_price)), predict_price, 'b', label="predict")
ax.plot(range(len(actual_price)), actual_price, 'r', label="label")
ax.legend(loc="upper right")


In [None]:
# Data analysis and wrangling
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
%matplotlib inline

# Machine learning
from sklearn.utils import shuffle
import tensorflow as tf

In [None]:
# Load the per-city modelling table; the CSV file is named after the city.
city_name = '厦门'
raw_data = pd.read_csv(f'Data/model_data/{city_name}.csv')

# Split off the price label: "modifiedUnitPrice" is the target and all
# remaining columns are features. The full table is used (no hold-out here).
Y_train = raw_data["modifiedUnitPrice"]
X_train = raw_data.drop(columns="modifiedUnitPrice")

# Copy into numpy arrays; these copies are rescaled in place further down.
x_data = np.array(X_train)
y_data = np.array(Y_train)

# Remember the original price range before the labels are rescaled.
former_y_min = y_data.min()
former_y_max = y_data.max()

# Min-max scale every feature column into [0, 1]. The value -999 marks a
# missing entry:
#   * a column that is entirely missing becomes all 0;
#   * in a partially missing column the missing entries become 1 and the
#     present values are scaled with the min/max of the present values only;
#   * a column without missing entries is scaled with its own min/max.

# Work on float copies: if the CSV columns were parsed as integers, writing
# scaled fractions back into an integer array would silently truncate them.
x_data = x_data.astype(float)
y_data = y_data.astype(float)

for i in range(x_data.shape[1]):
    column = x_data[:, i]          # view: writes go straight into x_data
    missing = column == -999

    # All missing in the column
    if missing.all():
        column[:] = 0
        continue

    # Take min and range once, from the present values only, BEFORE any
    # entry is overwritten. Recomputing the max while the column is being
    # rewritten in place would scale later entries against already-scaled
    # values.
    present = column[~missing]     # boolean indexing yields a copy
    col_min = present.min()
    col_range = present.max() - col_min

    if col_range == 0:
        # Constant column: avoid 0/0 and map every present value to 0.
        column[~missing] = 0
    else:
        column[~missing] = (present - col_min) / col_range

    # Missing entries are encoded as 1 (no-op when nothing is missing).
    column[missing] = 1

# Transform house price into [0,1] using the range recorded before scaling.
y_range = former_y_max - former_y_min
if y_range == 0:
    # All prices identical: avoid 0/0.
    y_data[:] = 0
else:
    y_data = (y_data - former_y_min) / y_range

print(x_data)
    

In [None]:
# Graph inputs: a batch of 27-feature rows and the matching price labels.
x = tf.placeholder(tf.float32, [None, 27], name="X")
y = tf.placeholder(tf.float32, [None, 1], name="Y")

# y = xW + b, where W holds one weight per feature.
with tf.name_scope('Model'):

    # w: small random normal values, shape = (27, 1).
    # dtype and name are passed by keyword to tf.Variable: the second
    # positional parameter of tf.Variable is `trainable`, not the dtype, and
    # the name belongs to the variable rather than to the initializer op.
    w = tf.Variable(tf.random_normal([27, 1], stddev=0.01),
                    dtype=tf.float32, name='W')
    # b: scalar bias initialised to 1.0
    b = tf.Variable(1.0, name='b')

    def model(x, w, b):
        """Return the linear prediction `x @ w + b`.

        x: feature tensor, one row per record.
        w: weight tensor with one row per feature.
        b: bias added to every prediction.
        """
        return tf.matmul(x, w) + b

    # Forward propagation
    pred = model(x, w, b)

In [None]:
# Training hyper-parameters.
train_epochs = 50
learning_rate = 0.01

# Mean squared error between the fed labels and the model output.
with tf.name_scope('LossFunction'):
    squared_error = tf.pow(y - pred, 2)
    loss_function = tf.reduce_mean(squared_error)

# One run of `optimizer` applies a single gradient-descent step on the loss.
gradient_descent = tf.train.GradientDescentOptimizer(learning_rate)
optimizer = gradient_descent.minimize(loss_function)

In [None]:
# Create the op that initialises all graph variables, open a session, and
# run the op so the variables hold their starting values.
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

In [None]:
for epoch in range(train_epochs):
    loss_sum = 0.0

    # One gradient step per record (batch size 1).
    for features, label in zip(x_data, y_data):
        # The placeholders expect 2-D input: (1, 27) features, (1, 1) label.
        feed = {x: features.reshape(1, 27), y: label.reshape(1, 1)}
        _, loss = sess.run([optimizer, loss_function], feed_dict=feed)
        loss_sum += loss

    # Present the records in a different order in the next epoch.
    x_data, y_data = shuffle(x_data, y_data)

    # Read back the current parameters and report the mean per-record loss.
    b0temp, w0temp = sess.run([b, w])
    loss_average = loss_sum / len(y_data)

    print("epoch=", epoch+1, "loss=", loss_average, "b=", b0temp, "w=", w0temp)


In [None]:
# Draw 100 random records (with replacement) from the scaled data set, run
# each through the trained model and keep the prediction next to its
# scaled label for comparison.
predict_price = []
actual_price = []

for draw in range(100):
    n = np.random.randint(len(raw_data))

    # The placeholder expects a 2-D batch, so feed the row as shape (1, 27).
    x_test = x_data[n].reshape(1, 27)
    predict = sess.run(pred, feed_dict={x: x_test})

    # `predict` has shape (1, 1); take out the single value.
    predict_price.append(predict[0][0])
    actual_price.append(y_data[n])


# Overlay predicted (blue) and actual (red) scaled prices of the sampled
# records; the x axis is the position of the sample in the lists.
fig, ax = plt.subplots(figsize=(15, 12))
ax.plot(range(len(predict_price)), predict_price, 'b', label="predict")
ax.plot(range(len(actual_price)), actual_price, 'r', label="label")
ax.legend(loc="upper right")