In [1]:
from google.colab import drive

# Mount Google Drive at /content/gdrive; the dataset CSV and the model weights
# loaded later in this notebook are read from paths under this mount point.
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
import pandas
import numpy as np
import regex as re

# Load the SIM listings and normalise dtypes: integer price, string number.
df = pandas.read_csv('/content/gdrive/MyDrive/Colab Notebooks/sim/train_dataset.csv')
df = df.astype({'price_vnd': int, 'sim_number': str})

# Inclusive upper price bound of classes 0..5; anything above the last bound
# falls into class 6. With side='left', searchsorted returns the number of
# bounds strictly below the price, which is exactly the class id.
price_bounds = [450000, 750000, 1500000, 4000000, 7500000, 30000000]
label = np.searchsorted(price_bounds, df['price_vnd'].to_numpy(), side='left').tolist()
df['label'] = label


def _one_hot_digits(number):
    """Encode a digit string as a (9, 10) one-hot grid, one row per position.

    Rows beyond the end of a shorter string stay all-zero; a string longer
    than nine characters raises IndexError and a non-digit raises ValueError.
    """
    grid = np.zeros(shape=(9, 10))
    for position, digit in enumerate(number):
        grid[position, int(digit)] = 1
    return grid


# Stack the per-number grids into one array with a leading sample axis.
X = np.array([_one_hot_digits(number) for number in df['sim_number']])

from sklearn.model_selection import train_test_split
# Targets are the price-class ids; hold out 20% of the samples, with a fixed
# seed so the partition is the same on every run.
y = np.array(df['label'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [3]:
# Recover the metadata rows (number, price, label) that belong to X_train.
# The split is re-derived positionally with the same test_size/random_state
# that produced X_train, instead of decoding every one-hot matrix back into a
# string and looking it up by value: a value lookup returns extra rows when a
# sim_number occurs more than once (breaking the row-for-row alignment with
# X_train) and cannot find numbers shorter than nine digits, whose zero
# padding decodes as '0'.
train_positions, _ = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=42
)
# set_index + reset_index keeps the original layout: sim_number as the first
# column and a fresh 0..n-1 index.
df_train = df.iloc[train_positions].set_index('sim_number').reset_index()
sim_train = df_train['sim_number'].tolist()

# Fail loudly if the re-derived split ever stops matching X_train / y_train
# (e.g. because the split parameters were changed in one place only).
assert len(df_train) == len(X_train), "df_train is not aligned with X_train"
assert np.array_equal(df_train['label'].to_numpy(), y_train), \
    "df_train labels differ from y_train"
df_train.head()

Unnamed: 0,sim_number,price_vnd,label
0,908888939,56000000,6
1,988275577,5000000,4
2,329143188,450000,0
3,344240682,450000,0
4,343128345,450000,0


In [4]:
# Recover the metadata rows (number, price, label) that belong to X_test.
# The split is re-derived positionally with the same test_size/random_state
# that produced X_test, instead of decoding every one-hot matrix back into a
# string and looking it up by value: a value lookup returns extra rows when a
# sim_number occurs more than once (breaking the row-for-row alignment with
# X_test) and cannot find numbers shorter than nine digits, whose zero
# padding decodes as '0'.
_, test_positions = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=42
)
# set_index + reset_index keeps the original layout: sim_number as the first
# column and a fresh 0..n-1 index.
df_test = df.iloc[test_positions].set_index('sim_number').reset_index()
sim_test = df_test['sim_number'].tolist()

# Fail loudly if the re-derived split ever stops matching X_test / y_test
# (e.g. because the split parameters were changed in one place only).
assert len(df_test) == len(X_test), "df_test is not aligned with X_test"
assert np.array_equal(df_test['label'].to_numpy(), y_test), \
    "df_test labels differ from y_test"
df_test.head()

Unnamed: 0,sim_number,price_vnd,label
0,862935693,450000,0
1,866231006,450000,0
2,338833595,10000000,5
3,336043199,450000,0
4,982409498,500000,1


In [5]:
from tensorflow import keras
from keras.layers import LSTM, Bidirectional, TimeDistributed, Conv1D, MaxPooling1D, BatchNormalization

# 7-class price-bucket classifier over one-hot digit sequences of shape (9, 10).
model = keras.models.Sequential()

# Convolutional front end: two width-2 convolutions over neighbouring digit
# positions, then batch norm and a stride-2 max-pool that shortens the
# sequence before the recurrent layers.
model.add(Conv1D(64, 2, activation='relu', padding='same', input_shape=(9,10)))
model.add(Conv1D(128, 2, activation='relu', padding='same'))
model.add(BatchNormalization())
model.add(MaxPooling1D(2))

# No input_shape here: only the first layer of a Sequential model declares
# the input, and this layer receives the pooled conv features rather than the
# raw (9, 10) encoding, so the previous input_shape=(9,10) argument was both
# ignored and misleading.
model.add(Bidirectional(LSTM(200, return_sequences=True)))

model.add(keras.layers.Dropout(0.2))

# Second recurrent layer returns only its final state (no return_sequences),
# giving one fixed-size vector per SIM number.
model.add(Bidirectional(LSTM(200)))

model.add(keras.layers.Dropout(0.2))

# One softmax unit per price class (labels 0..6).
model.add(keras.layers.Dense(7, activation='softmax'))

# Sparse loss because the targets are integer class ids, not one-hot vectors.
model.compile( loss='sparse_categorical_crossentropy', optimizer = keras.optimizers.Adam(), metrics=['accuracy'] )


In [6]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Restore the trained 7-class weights and score the held-out split.
model.load_weights('/content/gdrive/MyDrive/Colab Notebooks/sim/model-7-class.h5')
aa = model.predict(X_test, batch_size = 64)

# The highest-probability column of each row is the predicted price class.
y_pred = aa.argmax(axis=1)
for report in (confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred)):
    print(report)

# Keep the predicted class next to each test row for the regression stage.
df_test['pred'] = y_pred

[[8709  251   63   53   56    4    0]
 [ 475 4930  525  116   28    8    1]
 [ 204  568 6401  380   66   43    3]
 [  68  143  310 4095 1035  259    5]
 [  56   39   69 1014 2449  553    7]
 [  11   19  157  253  508 4234   68]
 [   0    2    5   13   12   92 1640]]
              precision    recall  f1-score   support

           0       0.91      0.95      0.93      9136
           1       0.83      0.81      0.82      6083
           2       0.85      0.84      0.84      7665
           3       0.69      0.69      0.69      5915
           4       0.59      0.58      0.59      4187
           5       0.82      0.81      0.81      5250
           6       0.95      0.93      0.94      1764

    accuracy                           0.81     40000
   macro avg       0.81      0.80      0.80     40000
weighted avg       0.81      0.81      0.81     40000



In [7]:
from keras.models import Model

# Feature extractor: the trained network truncated at model.layers[-3], the
# layer that feeds the final Dropout + Dense pair.
layer_output = model.layers[-3].output
intermediate_model = Model(inputs=model.input,outputs=layer_output)
intermediate_prediction = intermediate_model.predict(X_train, batch_size = 64)

# A plain ndarray is all DataFrame needs; np.matrix is discouraged by NumPy
# and added nothing here.
tmp = np.asarray(intermediate_prediction)
train_features = pandas.DataFrame(tmp)

# The concat below pairs rows by index, so both frames must have one row per
# training sample; a mismatch would otherwise be padded with NaN silently.
assert len(train_features) == len(df_train), "df_train is not aligned with X_train"

# Drop feature columns left over from a previous execution of this cell so a
# re-run replaces the features instead of appending a second copy.
df_train = pandas.concat(
    [df_train.drop(columns=train_features.columns, errors='ignore'), train_features],
    axis=1,
)



In [8]:
# Same feature extraction as for the training rows, applied to the test split.
intermediate_prediction = intermediate_model.predict(X_test,  batch_size = 64)

# A plain ndarray is all DataFrame needs; np.matrix is discouraged by NumPy
# and added nothing here.
tmp = np.asarray(intermediate_prediction)
test_features = pandas.DataFrame(tmp)

# The concat below pairs rows by index, so both frames must have one row per
# test sample; a mismatch would otherwise be padded with NaN silently.
assert len(test_features) == len(df_test), "df_test is not aligned with X_test"

# Drop feature columns left over from a previous execution of this cell so a
# re-run replaces the features instead of appending a second copy.
df_test = pandas.concat(
    [df_test.drop(columns=test_features.columns, errors='ignore'), test_features],
    axis=1,
)



In [13]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error as MSE

def KNN(label, neighbors):
    """Fit a k-nearest-neighbours price regressor for one price class.

    Training rows are the ``df_train`` rows whose true ``label`` equals
    `label`; the rows to predict are the ``df_test`` rows whose predicted
    class ``pred`` equals `label`. The features are every column that remains
    after the metadata columns (``label``, ``price_vnd``, ``sim_number`` and,
    for the test frame, ``pred``) are dropped. The RMSE against the true test
    prices is printed.

    Args:
        label: Price-class id used to select the rows.
        neighbors: ``n_neighbors`` passed to ``KNeighborsRegressor``.

    Returns:
        dict mapping each selected test ``sim_number`` to its predicted
        ``price_vnd``. Empty when no test row was predicted into `label`.
    """
    tr = df_train[df_train['label'] == label]
    te = df_test[df_test['pred'] == label]

    # Nothing to predict for this class: sklearn would raise on a zero-row
    # input, so return an empty mapping instead.
    if te.empty:
        return {}

    X_train = tr.drop(columns=['label', 'price_vnd', 'sim_number']).values
    y_train = tr['price_vnd'].values
    X_test = te.drop(columns=['label', 'price_vnd', 'sim_number', 'pred']).values
    y_test = te['price_vnd'].values

    knn = KNeighborsRegressor(n_neighbors=neighbors)
    knn.fit(X_train, y_train)
    pred = knn.predict(X_test)

    rmse = np.sqrt(MSE(y_test, pred))
    print("RMSE : % f" %(rmse))
    return dict(zip(te['sim_number'].values, pred))

# Per-number price predictions for the two highest price classes (5 and 6),
# each from a 5-neighbour regressor; both are dicts keyed by sim_number.
predict_label_5 = KNN(5, 5)
predict_label_6 = KNN(6, 5)

RMSE :  15220607.878067
RMSE :  409147831.065657


In [14]:
from sklearn.metrics import mean_absolute_error

# Constant price emitted for the classes that have no KNN regressor.
fixed_price = {0: 450000, 1: 500000, 2: 1000000, 3: 3000000, 4: 5000000}

final_predict = []
# Walk only the two columns that are needed; iterrows() would build a full
# Series out of every column for each row just to read these two fields.
for pred_class, sim in zip(df_test['pred'], df_test['sim_number']):
    if pred_class in fixed_price:
        final_predict.append(fixed_price[pred_class])
    elif pred_class == 5:
        final_predict.append(predict_label_5[sim])
    else:
        # Every remaining class id is treated as class 6.
        final_predict.append(predict_label_6[sim])

# sklearn metrics take (y_true, y_pred). Both metrics are symmetric, so the
# reported numbers are the same as with the arguments swapped.
true_price = df_test['price_vnd'].values
rmse = np.sqrt(MSE(true_price, final_predict))
print("RMSE : % f" %(rmse))

mae = mean_absolute_error(true_price, final_predict)
print('MAE: %.3f' % mae)

RMSE :  88892168.388900
MAE: 5597789.599
