In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [31]:
# Read the TSV file
df = pd.read_csv('dong.csv', delimiter=',', encoding='euc-kr')

# Read the 2nd TSV file
df2 = pd.read_csv('dong2.csv', delimiter=',', encoding='utf-8')

# Append the additional data to the existing data
df = pd.concat([df, df2], ignore_index=True)

# Select the required columns
df = df[['기준일ID', '시간대구분', '총생활인구수', '행정동코드']]

# Rename the columns for easier understanding
df.rename(columns={
    '기준일ID': 'date',
    '시간대구분': 'hour',
    '총생활인구수': 'total_population',
    '행정동코드': 'dong_code'
}, inplace=True)

# Filter the dataframe based on the 'dong' code
dong_code = '11710650'  # Replace this with the dong code you want to use
df = df[df['dong_code'].astype(str) == dong_code]

# Convert the 'date' column to datetime format
df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')

# Convert 'date' column to Unix timestamp (int64 first then division)
df['date'] = df['date'].astype('int64') // 10**9

# Extract the day of the week and create a new column 'DayOfWeek'
df['DayOfWeek'] = pd.to_datetime(df['date'], unit='s').dt.day_name()

# One-hot encode the 'DayOfWeek' column
df = pd.get_dummies(df, columns=['DayOfWeek'])

# One-hot encode the 'hour' column
df = pd.get_dummies(df, columns=['hour'])

# Create a new column 'IsWeekend'
df['IsWeekend'] = ((df['DayOfWeek_Saturday'] == 1) | (df['DayOfWeek_Sunday'] == 1)).astype(int)

# Convert 'date' column back to datetime format
df['date'] = pd.to_datetime(df['date'], unit='s')

# Define the specific holidays
holidays = ['2023-01-01','2023-01-23','2023-01-24','2023-03-01','2023-05-05', '2023-05-29', '2023-06-06']

# Update 'IsWeekend' column to include the holidays
df.loc[df['date'].dt.strftime('%Y-%m-%d').isin(holidays), 'IsWeekend'] = 1

# Convert 'date' column back to Unix timestamp
df['date'] = df['date'].astype('int64') // 10**9


In [32]:
print(df.head())
print(df.info())
print(len(df.index))

            date  total_population  dong_code  DayOfWeek_Friday   
400   1687996800        38120.9302   11710650             False  \
824   1687996800        37186.7679   11710650             False   
1248  1687996800        37093.6260   11710650             False   
1672  1687996800        37101.1457   11710650             False   
2096  1687996800        36992.8456   11710650             False   

      DayOfWeek_Monday  DayOfWeek_Saturday  DayOfWeek_Sunday   
400              False               False             False  \
824              False               False             False   
1248             False               False             False   
1672             False               False             False   
2096             False               False             False   

      DayOfWeek_Thursday  DayOfWeek_Tuesday  DayOfWeek_Wednesday  ...   
400                 True              False                False  ...  \
824                 True              False                False  

In [33]:
from sklearn.model_selection import train_test_split

# Our target variable is 'total_population'
X = df.drop('total_population', axis=1)
y = df['total_population']

# Split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=47)


In [34]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

# Define the parameter grid
param_grid = {
    'max_depth': [10, 20, 30, 40, 50],
     'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'criterion': ['absolute_error','friedman_mse','squared_error']
}

# Create a base model
dt = DecisionTreeRegressor(random_state=47)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, 
                           cv=3, n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_

print("Best parameters: ", best_params)

Fitting 3 folds for each of 135 candidates, totalling 405 fits
Best parameters:  {'criterion': 'friedman_mse', 'max_depth': 30, 'min_samples_leaf': 4, 'min_samples_split': 8}


In [36]:
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.metrics import mean_squared_error
from joblib import dump, load

# Train the final model with the best parameters found
model = DecisionTreeRegressor(max_depth=best_params['max_depth'], 
                              criterion=best_params['criterion'])

# Train the model using the training data
model.fit(X_train, y_train)

# Save model
dump(model, 'model.joblib')

# Make predictions on the test data
y_pred = model.predict(X_test)

# Calculate the mean squared error of our predictions
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Perform cross-validation
scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
avg_mse = np.mean(-scores)
print(f"Average MSE with 5-fold cross-validation: {avg_mse}")

def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

mape = mean_absolute_percentage_error(y_test, y_pred)
print(f"Mean Absolute Percentage Error: {mape}%")

# Print feature importances
importances = model.feature_importances_
feature_importances = pd.Series(importances, index=X_train.columns).sort_values(ascending=False)
print("\nFeature importances:")
print(feature_importances)



Mean Squared Error: 691619.7238237985
Average MSE with 5-fold cross-validation: 778788.9537372834
Mean Absolute Percentage Error: 1.629352283891124%

Feature importances:
date                   0.143606
hour_9                 0.082484
hour_14                0.077685
hour_11                0.071014
hour_8                 0.070264
hour_10                0.065000
IsWeekend              0.064174
hour_15                0.064143
hour_12                0.062472
hour_13                0.059788
hour_16                0.037846
hour_7                 0.026005
hour_17                0.022793
hour_20                0.018598
hour_19                0.016306
hour_21                0.015651
hour_23                0.013055
DayOfWeek_Sunday       0.012974
hour_18                0.011671
DayOfWeek_Friday       0.010217
DayOfWeek_Saturday     0.010098
hour_6                 0.009650
hour_22                0.007040
DayOfWeek_Tuesday      0.006520
hour_0                 0.006066
hour_1                 0.0037

In [37]:
import time
# Get the feature names from the training data
feature_names = X_train.columns.tolist()

# Create a list to store the predictions
predictions = []

# Get the current date and hour
current_date = pd.to_datetime(time.time(), unit='s')
current_date = current_date.tz_localize('UTC').tz_convert('Asia/Seoul')
current_hour = current_date.hour
print(current_hour)
print(current_date)

predicted_hours = 72
# Create new data points for the next x hours
for i in range(predicted_hours):
    new_data = pd.Series(0, index=feature_names)  # Initialize with zeros instead of NaNs
    new_data['date'] = (current_date + pd.DateOffset(hours=i)).strftime('%Y%m%d')
    
    # Set all hour features to 0
    for h in range(24):  # Change this to 24
        new_data['hour_' + str(h)] = 0
    new_data['hour_' + str((current_hour + i) % 24)] = 1  # Change this to 24

    # Set all DayOfWeek features to 0
    for dow in ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']:
        new_data['DayOfWeek_' + dow] = 0

    # Determine the day of the week
    day_of_week = (current_date + pd.DateOffset(hours=i)).day_name()
    new_data['DayOfWeek_' + day_of_week] = 1

    # Determine whether it's a weekend
    new_data['IsWeekend'] = 1 if day_of_week in ['Saturday', 'Sunday'] else 0

    # Convert the Series to a DataFrame
    new_data_df = new_data.to_frame().transpose()

    # Make a prediction for the new data point
    new_pred = model.predict(new_data_df)
    predictions.append(new_pred)

print(f"Predicted population for the next {predicted_hours} hours: {predictions}")




19
2023-07-05 19:05:44.212500992+09:00
Predicted population for the next 72 hours: [array([38424.1645]), array([39163.75]), array([37819.0439]), array([37464.5432]), array([38081.4459]), array([37130.5596]), array([36310.77563846]), array([36310.77563846]), array([36048.48155]), array([35977.73556667]), array([35793.4933]), array([35068.2997]), array([34136.9222]), array([32055.8121]), array([31372.7195]), array([32139.5563]), array([31363.1022]), array([32013.7091]), array([32136.949]), array([31926.4207]), array([32507.5426]), array([33666.3713]), array([34580.6158]), array([37189.20126667]), array([38196.4228]), array([39163.75]), array([37819.0439]), array([37591.6405]), array([38081.4459]), array([37130.5596]), array([36310.77563846]), array([36310.77563846]), array([36048.48155]), array([35977.73556667]), array([35793.4933]), array([35181.2191]), array([34136.9222]), array([32445.4723]), array([31534.4369]), array([32139.5563]), array([31814.4316]), array([32013.7091]), array([33

In [38]:
import math
for index, element in enumerate(predictions):
    future_date = current_date + pd.DateOffset(hours=index + 1)
    future_hour = future_date.hour
    future_day = future_date.day_name()
    print(f"Predicted population for {future_date.strftime('%Y-%m-%d')} ({future_day}) at hour {future_hour}: {math.ceil(element[0])}")


Predicted population for 2023-07-05 (Wednesday) at hour 20: 38425
Predicted population for 2023-07-05 (Wednesday) at hour 21: 39164
Predicted population for 2023-07-05 (Wednesday) at hour 22: 37820
Predicted population for 2023-07-05 (Wednesday) at hour 23: 37465
Predicted population for 2023-07-06 (Thursday) at hour 0: 38082
Predicted population for 2023-07-06 (Thursday) at hour 1: 37131
Predicted population for 2023-07-06 (Thursday) at hour 2: 36311
Predicted population for 2023-07-06 (Thursday) at hour 3: 36311
Predicted population for 2023-07-06 (Thursday) at hour 4: 36049
Predicted population for 2023-07-06 (Thursday) at hour 5: 35978
Predicted population for 2023-07-06 (Thursday) at hour 6: 35794
Predicted population for 2023-07-06 (Thursday) at hour 7: 35069
Predicted population for 2023-07-06 (Thursday) at hour 8: 34137
Predicted population for 2023-07-06 (Thursday) at hour 9: 32056
Predicted population for 2023-07-06 (Thursday) at hour 10: 31373
Predicted population for 2023-0