# Import

In [5]:
# %pip install tensorflow
# %pip install tensorflowjs

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
import json
import os
from tensorflow.keras import layers, models

# Read CSV

In [9]:
# Raw listings file, given as a path relative to the working directory
file_path = 'house_price_jabodetabek_dataset.csv'

# Load the listings into a DataFrame
df = pd.read_csv(filepath_or_buffer=file_path)

# Last expression of the cell, so the notebook renders a preview of the frame
df

Unnamed: 0,District,City,Latitude,Longitude,Price,Land_Size_M2,Building_Size_M2,Floors,Bedrooms,Bathrooms,Carport/Garage
0,Babelan,Bekasi,-6.227721,107.001649,3.450000e+08,60.0,40.0,1.0,2.0,1.0,1.0
1,Pondokmelati,Bekasi,-6.296441,106.921566,1.265000e+09,90.0,110.0,2.0,3.0,3.0,2.0
2,Bekasi Utara,Bekasi,-6.211765,107.003851,1.300000e+09,80.0,120.0,2.0,3.0,3.0,2.0
3,Cikarang Pusat,Bekasi,-6.365165,107.182391,1.090000e+09,98.0,102.0,2.0,4.0,2.0,2.0
4,Mustikajaya,Bekasi,-6.303947,107.026896,8.000000e+08,96.0,75.0,1.0,3.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
10176,Pantai Mutiara,Jakarta Utara,-6.107730,106.791458,1.850000e+10,583.0,1300.0,1.0,5.0,6.0,1.0
10177,Kemang,Jakarta Selatan,-6.277783,106.811422,5.400000e+09,156.0,262.0,4.0,4.0,3.0,2.0
10178,Daan Mogot,Jakarta Barat,-6.170157,106.762833,1.700000e+09,90.0,120.0,2.0,2.0,2.0,1.0
10179,Semanan,Jakarta Barat,-6.164927,106.705773,2.800000e+09,120.0,136.0,2.0,3.0,3.0,2.0


In [10]:
# Print the frame summary: column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10181 entries, 0 to 10180
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   District          10181 non-null  object 
 1   City              10181 non-null  object 
 2   Latitude          10181 non-null  float64
 3   Longitude         10181 non-null  float64
 4   Price             10181 non-null  float64
 5   Land_Size_M2      10181 non-null  float64
 6   Building_Size_M2  10181 non-null  float64
 7   Floors            10181 non-null  float64
 8   Bedrooms          10181 non-null  float64
 9   Bathrooms         10181 non-null  float64
 10  Carport/Garage    10181 non-null  float64
dtypes: float64(9), object(2)
memory usage: 875.1+ KB


# Export Unique District and City

In [11]:
# Build one "City, District" label per row with vectorized string
# concatenation instead of a row-wise apply. The labels live in their own
# Series, so `df` never gains (and never has to lose) a helper column.
city_district = (
    df['City'].astype(str) + ', ' + df['District'].astype(str)
).rename('City, District')

# Keep the first occurrence of every distinct label
df_unique = city_district.drop_duplicates().to_frame()

# The export folder is only created by a later cell; create it here so this
# cell also works on a fresh checkout (exist_ok makes re-runs harmless).
os.makedirs('encoding', exist_ok=True)

# Export the unique labels as a one-column CSV
df_unique.to_csv('encoding/city_district.csv', index=False)

print("CSV file 'city_district.csv' has been created with unique 'City, District' values.")

# Display the (unchanged) source DataFrame
df

CSV file 'city_district.csv' has been created with unique 'City, District' values.


Unnamed: 0,District,City,Latitude,Longitude,Price,Land_Size_M2,Building_Size_M2,Floors,Bedrooms,Bathrooms,Carport/Garage
0,Babelan,Bekasi,-6.227721,107.001649,3.450000e+08,60.0,40.0,1.0,2.0,1.0,1.0
1,Pondokmelati,Bekasi,-6.296441,106.921566,1.265000e+09,90.0,110.0,2.0,3.0,3.0,2.0
2,Bekasi Utara,Bekasi,-6.211765,107.003851,1.300000e+09,80.0,120.0,2.0,3.0,3.0,2.0
3,Cikarang Pusat,Bekasi,-6.365165,107.182391,1.090000e+09,98.0,102.0,2.0,4.0,2.0,2.0
4,Mustikajaya,Bekasi,-6.303947,107.026896,8.000000e+08,96.0,75.0,1.0,3.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
10176,Pantai Mutiara,Jakarta Utara,-6.107730,106.791458,1.850000e+10,583.0,1300.0,1.0,5.0,6.0,1.0
10177,Kemang,Jakarta Selatan,-6.277783,106.811422,5.400000e+09,156.0,262.0,4.0,4.0,3.0,2.0
10178,Daan Mogot,Jakarta Barat,-6.170157,106.762833,1.700000e+09,90.0,120.0,2.0,2.0,2.0,1.0
10179,Semanan,Jakarta Barat,-6.164927,106.705773,2.800000e+09,120.0,136.0,2.0,3.0,3.0,2.0


: 

# Encoding

## Encoding District

Districts are encoded by the rank of their median price: the district with the highest median price gets code 1, the next one 2, and so on.

In [4]:
# Rank-encode districts by price level: the district with the highest median
# "Price" gets code 1, the next one 2, and so on.
# NOTE(review): the medians are computed on the full dataset, before the
# train/test split further down, so the codes carry target information from
# rows that later land in the test set.

# Guard against re-running this cell on an already-encoded column: that would
# overwrite the exported JSON with a meaningless code -> code mapping.
if pd.api.types.is_numeric_dtype(df['District']):
    raise RuntimeError(
        "'District' is already numeric; re-run the notebook from the 'Read CSV' cell before encoding again."
    )

# Step 1: Calculate the median of "Price" for each district
district_medians = df.groupby('District')['Price'].median()

# Step 2: Sort the districts by their median "Price" in descending order
sorted_categories_district = district_medians.sort_values(ascending=False).index

# Step 3: Create a dictionary for encoding based on sorted order (codes start at 1)
encoding_district_dict = {district: idx for idx, district in enumerate(sorted_categories_district, start=1)}

# Print the encoding dictionary
print("Encoding Dictionary:", encoding_district_dict)

# Create the "encoding" folder; exist_ok replaces the separate existence
# check and keeps re-runs harmless
encoding_folder = 'encoding'
os.makedirs(encoding_folder, exist_ok=True)

# Export the encoding to a JSON file in the "encoding" folder
json_file_path = os.path.join(encoding_folder, 'encoding_district_dict.json')
with open(json_file_path, 'w') as json_file:
    json.dump(encoding_district_dict, json_file, indent=4)

print(f"Dictionary has been exported to {json_file_path}")

# Step 4: Replace the district names with their codes
df['District'] = df['District'].map(encoding_district_dict)

df

Encoding Dictionary: {'Menteng Atas': 1, 'Hasyim Ashari': 2, 'Mega Kuningan': 3, 'Menteng': 4, 'Senayan': 5, 'Scbd': 6, 'Blok M': 7, 'Simprug': 8, 'Permata Hijau': 9, 'Prapanca': 10, 'Slipi': 11, 'Senopati': 12, 'Menteng Dalam': 13, 'Kuningan': 14, 'Patra Kuningan': 15, 'Pakubuwono': 16, 'Sudirman': 17, 'Kebayoran Baru': 18, 'Bangka': 19, 'Pejompongan': 20, 'Pantai Mutiara': 21, 'Ancol': 22, 'Kebon Kacang': 23, 'Pondok Indah': 24, 'Lenteng Agung': 25, 'Bendungan Hilir': 26, 'Setia Budi': 27, 'Setiabudi': 28, 'Fatmawati': 29, 'Ragunan': 30, 'Cipulir': 31, 'Kedoya': 32, 'Cipete': 33, 'Guntur': 34, 'Intercon': 35, 'Ciganjur': 36, 'Mampang Prapatan': 37, 'Pulomas': 38, 'Tanjung Duren Utara': 39, 'Ampera': 40, 'Pantai Indah Kapuk': 41, 'Patal Senayan': 42, 'Panglima Polim': 43, 'Pluit': 44, 'Kebayoran Lama': 45, 'Tawakal': 46, 'Lebak Bulus': 47, 'Kedoya Baru': 48, 'Gajah Mada': 49, 'Tanah Abang': 50, 'Harmoni': 51, 'Pondok Hijau Golf': 52, 'Tanjung Duren Selatan': 53, 'Veteran': 54, 'Angke'

Unnamed: 0,District,City,Latitude,Longitude,Price,Land_Size_M2,Building_Size_M2,Floors,Bedrooms,Bathrooms,Carport/Garage
0,294,Bekasi,-6.227721,107.001649,3.450000e+08,60.0,40.0,1.0,2.0,1.0,1.0
1,211,Bekasi,-6.296441,106.921566,1.265000e+09,90.0,110.0,2.0,3.0,3.0,2.0
2,273,Bekasi,-6.211765,107.003851,1.300000e+09,80.0,120.0,2.0,3.0,3.0,2.0
3,267,Bekasi,-6.365165,107.182391,1.090000e+09,98.0,102.0,2.0,4.0,2.0,2.0
4,275,Bekasi,-6.303947,107.026896,8.000000e+08,96.0,75.0,1.0,3.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
10176,21,Jakarta Utara,-6.107730,106.791458,1.850000e+10,583.0,1300.0,1.0,5.0,6.0,1.0
10177,60,Jakarta Selatan,-6.277783,106.811422,5.400000e+09,156.0,262.0,4.0,4.0,3.0,2.0
10178,171,Jakarta Barat,-6.170157,106.762833,1.700000e+09,90.0,120.0,2.0,2.0,2.0,1.0
10179,173,Jakarta Barat,-6.164927,106.705773,2.800000e+09,120.0,136.0,2.0,3.0,3.0,2.0


## Encoding City

Cities are encoded by the rank of their median price: the city with the highest median price gets code 1, the next one 2, and so on.

In [5]:
# Rank-encode cities by price level: the city with the highest median "Price"
# gets code 1, the next one 2, and so on.
# NOTE(review): the medians are computed on the full dataset, before the
# train/test split further down, so the codes carry target information from
# rows that later land in the test set.

# Guard against re-running this cell on an already-encoded column: that would
# overwrite the exported JSON with a meaningless code -> code mapping.
if pd.api.types.is_numeric_dtype(df['City']):
    raise RuntimeError(
        "'City' is already numeric; re-run the notebook from the 'Read CSV' cell before encoding again."
    )

# Step 1: Calculate the median of "Price" for each city
city_medians = df.groupby('City')['Price'].median()

# Step 2: Sort the cities by their median "Price" in descending order
sorted_categories_city = city_medians.sort_values(ascending=False).index

# Step 3: Create a dictionary for encoding based on sorted order (codes start at 1)
encoding_city_dict = {city: idx for idx, city in enumerate(sorted_categories_city, start=1)}

# Print the encoding dictionary
print("Encoding Dictionary:", encoding_city_dict)

# Export the encoding to a JSON file in the "encoding" folder
# (`encoding_folder` is defined by the district-encoding cell)
json_file_path = os.path.join(encoding_folder, 'encoding_city_dict.json')
with open(json_file_path, 'w') as json_file:
    json.dump(encoding_city_dict, json_file, indent=4)

print(f"Dictionary has been exported to {json_file_path}")

# Step 4: Replace the city names with their codes
df['City'] = df['City'].map(encoding_city_dict)

df

Encoding Dictionary: {'Jakarta Selatan': 1, 'Jakarta Utara': 2, 'Jakarta Pusat': 3, 'Jakarta Barat': 4, 'Jakarta Timur': 5, 'Tangerang': 6, 'Depok': 7, 'Bogor': 8, 'Bekasi': 9}
Dictionary has been exported to encoding\encoding_city_dict.json


Unnamed: 0,District,City,Latitude,Longitude,Price,Land_Size_M2,Building_Size_M2,Floors,Bedrooms,Bathrooms,Carport/Garage
0,294,9,-6.227721,107.001649,3.450000e+08,60.0,40.0,1.0,2.0,1.0,1.0
1,211,9,-6.296441,106.921566,1.265000e+09,90.0,110.0,2.0,3.0,3.0,2.0
2,273,9,-6.211765,107.003851,1.300000e+09,80.0,120.0,2.0,3.0,3.0,2.0
3,267,9,-6.365165,107.182391,1.090000e+09,98.0,102.0,2.0,4.0,2.0,2.0
4,275,9,-6.303947,107.026896,8.000000e+08,96.0,75.0,1.0,3.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
10176,21,2,-6.107730,106.791458,1.850000e+10,583.0,1300.0,1.0,5.0,6.0,1.0
10177,60,1,-6.277783,106.811422,5.400000e+09,156.0,262.0,4.0,4.0,3.0,2.0
10178,171,4,-6.170157,106.762833,1.700000e+09,90.0,120.0,2.0,2.0,2.0,1.0
10179,173,4,-6.164927,106.705773,2.800000e+09,120.0,136.0,2.0,3.0,3.0,2.0


# Outlier

In [6]:
# Display the frame as it stands after the district and city encoding
df

Unnamed: 0,District,City,Latitude,Longitude,Price,Land_Size_M2,Building_Size_M2,Floors,Bedrooms,Bathrooms,Carport/Garage
0,294,9,-6.227721,107.001649,3.450000e+08,60.0,40.0,1.0,2.0,1.0,1.0
1,211,9,-6.296441,106.921566,1.265000e+09,90.0,110.0,2.0,3.0,3.0,2.0
2,273,9,-6.211765,107.003851,1.300000e+09,80.0,120.0,2.0,3.0,3.0,2.0
3,267,9,-6.365165,107.182391,1.090000e+09,98.0,102.0,2.0,4.0,2.0,2.0
4,275,9,-6.303947,107.026896,8.000000e+08,96.0,75.0,1.0,3.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
10176,21,2,-6.107730,106.791458,1.850000e+10,583.0,1300.0,1.0,5.0,6.0,1.0
10177,60,1,-6.277783,106.811422,5.400000e+09,156.0,262.0,4.0,4.0,3.0,2.0
10178,171,4,-6.170157,106.762833,1.700000e+09,90.0,120.0,2.0,2.0,2.0,1.0
10179,173,4,-6.164927,106.705773,2.800000e+09,120.0,136.0,2.0,3.0,3.0,2.0


In [7]:
# # Define the columns from which to drop outliers
# columns = ['Land_Size_M2', 'Building_Size_M2', 'Floors', 'Bedrooms','Bathrooms','Price']

# # Function to remove outliers based on IQR method
# def remove_outliers(df, columns):
#     for column in columns:
#         Q1 = df[column].quantile(0.25)
#         Q3 = df[column].quantile(0.75)
#         IQR = Q3 - Q1
#         lower_bound = Q1 - 1.5 * IQR
#         upper_bound = Q3 + 1.5 * IQR
#         df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
#     return df

# # Remove outliers from the specified columns
# df = remove_outliers(df, columns)

# df

# Normalization

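Every feature column is rescaled to the range [0, 1] with min-max normalization, $x' = \frac{x - x_{\min}}{x_{\max} - x_{\min}}$. The target column `Price` is left unscaled, and each feature's minimum and maximum are exported to `max_min_values.json`.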

In [8]:
# Per-column extremes of the current frame
max_values = df.max()
min_values = df.min()

# Every column except the target contributes a "<name>_max" and a
# "<name>_min" entry, in column order
feature_names = [name for name in df.columns if name != "Price"]
result_dict = {}
for name in feature_names:
    result_dict.update({
        f"{name}_max": max_values[name],
        f"{name}_min": min_values[name],
    })

# Destination of the exported bounds inside the encoding folder
json_file_path = os.path.join(encoding_folder, 'max_min_values.json')

# Serialize the bounds as indented JSON
with open(json_file_path, 'w') as json_file:
    json.dump(result_dict, json_file, indent=4)

print(f"Dictionary has been exported to {json_file_path}")

Dictionary has been exported to encoding\max_min_values.json


In [9]:
# Function to normalize a column using min-max scaling
def min_max_normalize(column):
    """Rescale a numeric pandas Series to the range [0, 1] with min-max scaling.

    Each value is mapped to ``(value - min) / (max - min)``, where ``min`` and
    ``max`` are taken from the Series itself.

    Parameters
    ----------
    column : pandas.Series
        Numeric values to rescale.

    Returns
    -------
    pandas.Series
        The rescaled values on the same index. A column whose values are all
        identical has zero range; it is returned as all zeros rather than the
        NaN values a 0/0 division would produce.
    """
    min_val = column.min()
    max_val = column.max()
    value_range = max_val - min_val
    shifted = column - min_val
    if value_range == 0:
        # Constant column: every shifted value is already 0, so skip the division
        return shifted.astype(float)
    normalized_column = shifted / value_range
    return normalized_column

# Scale every column except the target "Price"
feature_columns = [name for name in df.columns if name != 'Price']
normalized_columns = df[feature_columns].apply(min_max_normalize)

# Re-attach the untouched "Price" column as the last column
df = normalized_columns.join(df['Price'])

df

Unnamed: 0,District,City,Latitude,Longitude,Land_Size_M2,Building_Size_M2,Floors,Bedrooms,Bathrooms,Carport/Garage,Price
0,0.901538,1.000,0.157189,0.994624,0.004161,0.004279,0.000000,0.018182,0.000000,0.066667,3.450000e+08
1,0.646154,1.000,0.147901,0.993879,0.007283,0.020045,0.076923,0.036364,0.036364,0.133333,1.265000e+09
2,0.836923,1.000,0.159346,0.994644,0.006242,0.022297,0.076923,0.036364,0.036364,0.133333,1.300000e+09
3,0.818462,1.000,0.138611,0.996304,0.008115,0.018243,0.076923,0.054545,0.018182,0.133333,1.090000e+09
4,0.843077,1.000,0.146886,0.994858,0.007907,0.012162,0.000000,0.036364,0.000000,0.066667,8.000000e+08
...,...,...,...,...,...,...,...,...,...,...,...
10176,0.061538,0.125,0.173409,0.992670,0.058573,0.288063,0.000000,0.072727,0.090909,0.066667,1.850000e+10
10177,0.181538,0.000,0.150422,0.992855,0.014149,0.054279,0.230769,0.054545,0.036364,0.133333,5.400000e+09
10178,0.523077,0.375,0.164970,0.992403,0.007283,0.022297,0.076923,0.018182,0.018182,0.066667,1.700000e+09
10179,0.529231,0.375,0.165677,0.991873,0.010404,0.025901,0.076923,0.036364,0.036364,0.133333,2.800000e+09


# Data Split

In [10]:
# from sklearn.model_selection import StratifiedShuffleSplit

# # Initialize the StratifiedShuffleSplit
# strat_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# # Split the DataFrame
# for train_index, test_index in strat_split.split(df, df['City']):
#     train_df = df.iloc[train_index]
#     test_df = df.iloc[test_index]

# df = df.drop(['District','City'],axis=1)

# # Define the feature columns (X) and target column (y)
# # Replace 'target_column' with the actual name of your target column
# X = df.drop('Price', axis=1)  # Features
# y = df['Price']  # Target

# y = y/1000000000

# # Perform the train-test split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Bin edges for "Price"; the final bin is open-ended
bins = [0, 250e6, 500e6, 750e6, 1e9, 1.5e9, 2e9, 2.5e9, 3e9, 4e9, 5e9, float('inf')]

# One human-readable label per bin (Indonesian: "juta" = million, "miliar" = billion)
labels = [
    '0-250 juta',
    '250-500 juta',
    '500-750 juta',
    '750 juta-1 miliar',
    '1-1.5 miliar',
    '1.5-2 miliar',
    '2-2.5 miliar',
    '2.5-3 miliar',
    '3-4 miliar',
    '4-5 miliar',
    'lebih dari 5 miliar',
]

# Integer category id -> label, plus the list of ids used as bin labels below
mapping = dict(enumerate(labels))
keys_list = list(mapping)

# Serialize the mapping (non-ASCII characters are kept as-is) and show it
mapping_json = json.dumps(mapping, indent=4, ensure_ascii=False)
print(mapping_json)

# Persist the same JSON text next to the other encoding files
with open('encoding/price_category_mapping.json', 'w', encoding='utf-8') as f:
    f.write(mapping_json)

# Replace each price with the id of its bin; right=False makes every bin
# include its lower edge and exclude its upper edge
df['Price'] = pd.cut(df['Price'], bins=bins, labels=keys_list, right=False)

# Split each city's rows 80/20 on its own, so the train and test sets keep
# the city proportions of the full dataset. The per-city pieces are collected
# in lists and concatenated once: growing a DataFrame with pd.concat inside
# the loop copies all previous rows on every iteration and starts from an
# empty frame, a concat pattern pandas has deprecated.
# (`train_test_split` comes from the import cell at the top of the notebook.)
train_parts = []
test_parts = []

for city in df['City'].unique():
    # Rows belonging to the current city
    city_df = df[df['City'] == city]

    # Fixed random_state keeps the split reproducible
    city_train, city_test = train_test_split(city_df, test_size=0.2, random_state=42)

    train_parts.append(city_train)
    test_parts.append(city_test)

train_df = pd.concat(train_parts)
test_df = pd.concat(test_parts)

# Every row must end up in exactly one of the two sets
assert len(train_df) + len(test_df) == len(df), "train/test split lost or duplicated rows"

# Report the number of rows per city in the full, train and test sets
for set_name, frame in (("All", df), ("Train", train_df), ("Test", test_df)):
    unique_values_counts = frame['City'].value_counts()
    print(f"\n{set_name} set unique value counts\n", unique_values_counts)

{
    "0": "0-250 juta",
    "1": "250-500 juta",
    "2": "500-750 juta",
    "3": "750 juta-1 miliar",
    "4": "1-1.5 miliar",
    "5": "1.5-2 miliar",
    "6": "2-2.5 miliar",
    "7": "2.5-3 miliar",
    "8": "3-4 miliar",
    "9": "4-5 miliar",
    "10": "lebih dari 5 miliar"
}

All set unique value counts
 City
1.000    1841
0.875    1678
0.750    1357
0.000    1240
0.625    1193
0.375     995
0.125     928
0.500     697
0.250     252
Name: count, dtype: int64

Train set unique value counts
 City
1.000    1472
0.875    1342
0.750    1085
0.000     992
0.625     954
0.375     796
0.125     742
0.500     557
0.250     201
Name: count, dtype: int64

Test set unique value counts
 City
1.000    369
0.875    336
0.750    272
0.000    248
0.625    239
0.375    199
0.125    186
0.500    140
0.250     51
Name: count, dtype: int64


In [12]:
# Display the training split (row labels are the original DataFrame index)
train_df

Unnamed: 0,District,City,Latitude,Longitude,Land_Size_M2,Building_Size_M2,Floors,Bedrooms,Bathrooms,Carport/Garage,Price
588,0.781538,1.000,0.145301,0.994624,0.008115,0.039414,0.076923,0.036364,0.018182,0.133333,6
949,0.766154,1.000,0.163480,0.994440,0.009571,0.016667,0.000000,0.036364,0.018182,0.066667,4
1631,0.836923,1.000,0.157189,0.994624,0.016646,0.035811,0.076923,0.054545,0.054545,0.133333,10
1719,0.827692,1.000,0.145776,0.994804,0.005202,0.007658,0.000000,0.018182,0.000000,0.066667,2
1338,0.766154,1.000,0.163480,0.994440,0.005618,0.020045,0.076923,0.054545,0.036364,0.066667,4
...,...,...,...,...,...,...,...,...,...,...,...
6726,0.495385,0.375,0.169174,0.992103,0.006866,0.017793,0.076923,0.018182,0.018182,0.000000,5
7503,0.495385,0.375,0.169476,0.992079,0.002601,0.015541,0.076923,0.018182,0.018182,0.000000,4
9574,0.166154,0.375,0.168310,0.992709,0.008115,0.082658,0.307692,0.290909,0.290909,0.133333,10
8010,0.301538,0.375,0.164346,0.992749,0.027778,0.051577,0.076923,0.036364,0.036364,0.200000,10


In [13]:
# Display the test split (row labels are the original DataFrame index)
test_df

Unnamed: 0,District,City,Latitude,Longitude,Land_Size_M2,Building_Size_M2,Floors,Bedrooms,Bathrooms,Carport/Garage,Price
1557,0.846154,1.000,0.143919,0.995776,0.002705,0.014189,0.000000,0.018182,0.018182,0.066667,2
1157,0.769231,1.000,0.156711,0.994235,0.012068,0.044369,0.076923,0.036364,0.036364,0.066667,7
352,0.781538,1.000,0.145301,0.994624,0.008115,0.039414,0.076923,0.054545,0.036364,0.066667,6
1018,0.843077,1.000,0.147941,0.994808,0.005410,0.007658,0.000000,0.018182,0.000000,0.066667,2
1394,0.836923,1.000,0.159346,0.994644,0.023928,0.022297,0.000000,0.036364,0.036364,0.200000,5
...,...,...,...,...,...,...,...,...,...,...,...
8174,0.449231,0.375,0.163093,0.992452,0.010092,0.079054,0.153846,0.054545,0.054545,0.066667,10
8046,0.387692,0.375,0.164702,0.992447,0.003121,0.022297,0.153846,0.036364,0.036364,0.000000,5
6540,0.236923,0.375,0.161708,0.992687,0.021432,0.080405,0.153846,0.163636,0.163636,0.000000,10
6961,0.449231,0.375,0.163093,0.992452,0.001561,0.019369,0.153846,0.036364,0.036364,0.000000,4


In [14]:
# Training features are every column except the target; the target is the
# binned "Price" category id
X_train, y_train = train_df.drop(columns=['Price']), train_df['Price']

In [15]:
# Test features are every column except the target; the target is the
# binned "Price" category id
X_test, y_test = test_df.drop(columns=['Price']), test_df['Price']

# Machine Learning

In [16]:
# Print the summary of the processed frame (column dtypes and non-null counts)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10181 entries, 0 to 10180
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   District          10181 non-null  float64 
 1   City              10181 non-null  float64 
 2   Latitude          10181 non-null  float64 
 3   Longitude         10181 non-null  float64 
 4   Land_Size_M2      10181 non-null  float64 
 5   Building_Size_M2  10181 non-null  float64 
 6   Floors            10181 non-null  float64 
 7   Bedrooms          10181 non-null  float64 
 8   Bathrooms         10181 non-null  float64 
 9   Carport/Garage    10181 non-null  float64 
 10  Price             10181 non-null  category
dtypes: category(1), float64(10)
memory usage: 805.8 KB


In [17]:
# %pip install tensorflow-ranking

In [18]:
# from sklearn.metrics import r2_score

# # Define the input shape
# input_shape = X_train.shape[1:]

# print(input_shape)

# # Define the model architecture with increased complexity
# model = models.Sequential([
#     layers.Dense(1024, activation="relu", input_shape=input_shape),
#     layers.Reshape((1024, 1)),  # Reshape to (sequence_length, features)
#     layers.Conv1D(filters=128, kernel_size=3, strides=1, activation="relu", padding='causal'),
#     layers.MaxPooling1D(2, 2),
#     layers.Conv1D(filters=64, kernel_size=3, strides=1, activation="relu", padding='causal'),
#     layers.MaxPooling1D(2, 2),
#     layers.Conv1D(filters=32, kernel_size=3, strides=1, activation="relu", padding='causal'),
#     layers.MaxPooling1D(2, 2),
#     layers.Flatten(),
#     layers.Dense(512, activation="relu"),
#     layers.Dense(1)
# ])

# # Compile the model with a lower learning rate
# optimizer = tf.keras.optimizers.Adam()  # Adjust learning rate
# model.compile(optimizer=optimizer, loss='mse', metrics=[tf.keras.metrics.R2Score()])

# # Train the model with more epochs
# history = model.fit(X_train, y_train, epochs=10000, verbose=1, validation_data=(X_test,y_test))

# # Calculate R-squared (R2) score
# y_train_pred = model.predict(X_train)
# train_r2_score = r2_score(y_train, y_train_pred)

# print("Training R2 score:", train_r2_score)

In [19]:
# from tensorflow.keras import models, layers
# from sklearn.metrics import accuracy_score
# import tensorflow as tf
# import tensorflow_ranking as tfr

# # Define the input shape
# input_shape = X_train.shape[1:]

# # Define the model architecture for classification with 21 classes
# model = models.Sequential([
#     layers.Dense(1024, activation="relu", input_shape=input_shape),
#     layers.Dense(21, activation="softmax")
# ])


# import tensorflow as tf

# def ordinal_loss(y_true, y_pred):
#     """
#     Ordinal loss function for ordinal regression.

#     Arguments:
#     y_true -- True labels, shape (batch_size, num_classes - 1)
#               The last class is reserved for values above the last threshold.
#     y_pred -- Predicted labels, shape (batch_size, num_classes - 1)

#     Returns:
#     loss -- Ordinal loss value
#     """

#     # Cast to float32 to ensure compatibility
#     y_true = tf.cast(y_true, dtype=tf.float32)
#     y_pred = tf.cast(y_pred, dtype=tf.float32)

#     # Ordinal loss computation
#     error = tf.subtract(y_true, y_pred)
#     squared_error = tf.square(error)
#     loss = tf.reduce_sum(squared_error, axis=1)  # Sum over the thresholds

#     return tf.reduce_mean(loss)

# # Example usage
# # Assuming y_true and y_pred are tensors of shape (batch_size, num_classes - 1)
# # where num_classes is the number of ordinal categories

# # loss = ordinal_loss(y_true, y_pred)


# # Compile the model with a lower learning rate
# optimizer = tf.keras.optimizers.Adam()  # Adjust learning rate as needed
# # loss = tfr.keras.losses.OrdinalLoss(ordinal_size=21)
# model.compile(optimizer=optimizer, loss=ordinal_loss, metrics=[tf.keras.metrics.RootMeanSquaredError()])

# # Train the model with more epochs
# history = model.fit(X_train, y_train, epochs=1000, verbose=1, validation_data=(X_test, y_test))

# # Calculate accuracy on the training set
# y_test_pred = model.predict(X_test)
# y_test_pred_classes = y_test_pred.argmax(axis=1)
# test_accuracy = accuracy_score(y_test, y_test_pred_classes)

# print("Testing accuracy:", test_accuracy)

# # Alternatively, you can save the entire model (architecture + weights) to a single H5 file
# model.save("model/model_h5.h5")

# print(y_test_pred)
# print(y_test)

# unique_values = np.unique(y_test)
# print("Unique values in y_test:", unique_values)

# print("Shape of y_test_pred:", y_test_pred.shape)
# print("Shape of y_test:", y_test.shape)

In [20]:
from tensorflow.keras import models, layers
import tensorflow as tf
from sklearn.metrics import mean_absolute_error

def custom_round(x):
    """Round each element of a tensor to a whole number, with halves going up.

    An element whose fractional part (measured from its floor) is below 0.5
    becomes its floor; every other element becomes its ceiling.
    """
    lower = tf.math.floor(x)
    fractional_part = x - lower
    return tf.where(fractional_part < 0.5, lower, tf.math.ceil(x))

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable between runs.
tf.keras.utils.set_random_seed(42)

# "Price" holds the bin ids produced by pd.cut as a pandas categorical.
# Hand Keras and scikit-learn plain float arrays instead of relying on an
# implicit conversion of the categorical dtype.
y_train_values = y_train.astype('float32').to_numpy()
y_test_values = y_test.astype('float32').to_numpy()

# Valid category ids run from 0 to len(labels) - 1
# (`labels` is defined in the price-binning cell).
max_category = len(labels) - 1

input_shape = X_train.shape[1:]

# Fully connected regression network: one output unit predicts the category
# id as a real number, which is rounded to the nearest id afterwards.
# An explicit Input layer replaces the `input_shape=` layer argument, which
# Keras 3 warns about.
model = models.Sequential([
    layers.Input(shape=input_shape),
    layers.Dense(1024, activation="relu"),
    layers.Dense(512, activation="relu"),
    layers.Dense(256, activation="relu"),
    layers.Dense(128, activation="relu"),
    layers.Dense(1),
])

optimizer = tf.keras.optimizers.Adam()
model.compile(optimizer=optimizer, loss='mae', metrics=['mae'])

# NOTE(review): the test split doubles as validation data here, so the
# validation curve is not an independent estimate.
history = model.fit(
    X_train,
    y_train_values,
    epochs=1000,
    verbose=1,
    validation_data=(X_test, y_test_values),
)


def _to_category_ids(raw_predictions):
    """Round raw model outputs half-up and clip them into the valid id range."""
    rounded = tf.cast(custom_round(raw_predictions), dtype=tf.int32).numpy()
    # The network is unbounded, so an output can round to an id below 0 or
    # above the last category; clip so every prediction is a real category.
    return np.clip(rounded, 0, max_category)


# Make predictions on the test data and convert them to category ids
y_test_pred = model.predict(X_test)
y_test_pred_rounded = _to_category_ids(y_test_pred)

print("Test Rounded:",y_test_pred_rounded)

# Evaluate the MAE (in category steps) using scikit-learn
test_mae = mean_absolute_error(y_test_values, y_test_pred_rounded)
print("Testing MAE:", test_mae)

# Make predictions on the training data and convert them to category ids
y_train_pred = model.predict(X_train)
y_train_pred_rounded = _to_category_ids(y_train_pred)

print("Train Rounded:",y_train_pred_rounded)

# Evaluate the MAE (in category steps) using scikit-learn
train_mae = mean_absolute_error(y_train_values, y_train_pred_rounded)
print("Training MAE:", train_mae)


Epoch 1/1000


Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/100

In [24]:
# Save the entire trained model (architecture + weights) to a single HDF5
# (.h5) file inside the model_h5/ folder
model.save("model_h5/model.h5")

In [None]:
# Save the entire model in SavedModel format.
# Older Keras releases write a SavedModel when `save()` is given a plain
# directory path. Keras 3 raises ValueError for any path that does not end in
# .keras or .h5 and provides `export()` for SavedModel output instead, so
# fall back to that when `save()` refuses the path.
try:
    model.save('model_tfserving')
except ValueError:
    model.export('model_tfserving')