In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

print(pd.__version__)

2.0.1


In [3]:
import chardet

# Detect the encoding of the file
with open('dong2.csv', 'rb') as f:
    result = chardet.detect(f.read())

print(result['encoding'])

# Then use the detected encoding to read the file
df2 = pd.read_csv('dong2.csv', delimiter=',', encoding=result['encoding'])

ModuleNotFoundError: No module named 'chardet'

In [32]:
# Read the TSV file
df = pd.read_csv('dong.csv', delimiter=',', encoding='euc-kr')

# Read the 2nd TSV file
df2 = pd.read_csv('dong2.csv', delimiter=',', encoding='utf-8')

# Append the additional data to the existing data
df = pd.concat([df, df2], ignore_index=True)

# Select the required columns
df = df[['기준일ID', '시간대구분', '총생활인구수', '행정동코드']]

# Rename the columns for easier understanding
df.rename(columns={
    '기준일ID': 'date',
    '시간대구분': 'hour',
    '총생활인구수': 'total_population',
    '행정동코드': 'dong_code'
}, inplace=True)

# Convert the 'date' column to datetime format
df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')

# Convert 'date' column to Unix timestamp (int64 first then division)
df['date'] = df['date'].astype('int64') // 10**9

# Extract the day of the week and create a new column 'DayOfWeek'
df['DayOfWeek'] = pd.to_datetime(df['date'], unit='s').dt.day_name()

# One-hot encode the 'DayOfWeek' column
df = pd.get_dummies(df, columns=['DayOfWeek'])

# One-hot encode the 'hour' column
df = pd.get_dummies(df, columns=['hour'])

# One-hot encode the 'dong_code' column
df = pd.get_dummies(df, columns=['dong_code'])

# Create a new column 'IsWeekend'
df['IsWeekend'] = ((df['DayOfWeek_Saturday'] == 1) | (df['DayOfWeek_Sunday'] == 1)).astype(int)

# Convert 'date' column back to datetime format
df['date'] = pd.to_datetime(df['date'], unit='s')

# Define the specific holidays
holidays = ['2023-01-01','2023-01-23','2023-01-24','2023-03-01','2023-05-05', '2023-05-29', '2023-06-06']

# Update 'IsWeekend' column to include the holidays
df.loc[df['date'].dt.strftime('%Y-%m-%d').isin(holidays), 'IsWeekend'] = 1

# Convert 'date' column back to Unix timestamp
df['date'] = df['date'].astype('int64') // 10**9


In [33]:
print(df.head())
print(df.info())
print(len(df.index))

         date  total_population  DayOfWeek_Friday  DayOfWeek_Monday   
0  1687996800        15423.2230             False             False  \
1  1687996800        13110.2144             False             False   
2  1687996800         3755.5950             False             False   
3  1687996800        13896.7643             False             False   
4  1687996800        18671.3496             False             False   

   DayOfWeek_Saturday  DayOfWeek_Sunday  DayOfWeek_Thursday   
0               False             False                True  \
1               False             False                True   
2               False             False                True   
3               False             False                True   
4               False             False                True   

   DayOfWeek_Tuesday  DayOfWeek_Wednesday  hour_0  ...  dong_code_11740600   
0              False                False    True  ...               False  \
1              False                Fa

In [34]:
from sklearn.model_selection import train_test_split

# Our target variable is 'total_population'
X = df.drop('total_population', axis=1)
y = df['total_population']

# Split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=47)


In [35]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

# Define the parameter grid
param_grid = {
    'max_depth': [10, 20, 30, 40, 50],
     'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'criterion': ['absolute_error','friedman_mse','squared_error']
}

# Create a base model
dt = DecisionTreeRegressor(random_state=47)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, 
                           cv=3, n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_

print("Best parameters: ", best_params)

Fitting 3 folds for each of 135 candidates, totalling 405 fits


In [29]:
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.metrics import mean_squared_error
from joblib import dump, load

# Train the final model with the best parameters found
model = DecisionTreeRegressor(max_depth=best_params['max_depth'], 
                              criterion=best_params['criterion'])

# Train the model using the training data
model.fit(X_train, y_train)

# Save model
dump(model, 'model.joblib')

# Make predictions on the test data
y_pred = model.predict(X_test)

# Calculate the mean squared error of our predictions
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Perform cross-validation
scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
avg_mse = np.mean(-scores)
print(f"Average MSE with 5-fold cross-validation: {avg_mse}")

def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

mape = mean_absolute_percentage_error(y_test, y_pred)
print(f"Mean Absolute Percentage Error: {mape}%")

# Print feature importances
importances = model.feature_importances_
feature_importances = pd.Series(importances, index=X_train.columns).sort_values(ascending=False)
print("\nFeature importances:")
print(feature_importances)



Mean Squared Error: 31571610.033992548
Average MSE with 5-fold cross-validation: 34788283.323277056
Mean Absolute Percentage Error: 2.862414224988783%

Feature importances:
hour_16                0.077393
hour_14                0.076705
hour_13                0.076067
hour_15                0.075589
hour_10                0.075450
hour_11                0.074455
IsWeekend              0.074191
hour_12                0.071904
hour_9                 0.070707
hour_17                0.062349
hour_8                 0.050886
date                   0.047483
hour_18                0.045486
hour_19                0.027146
hour_7                 0.023272
hour_20                0.017172
hour_21                0.010792
DayOfWeek_Friday       0.010518
hour_6                 0.006954
hour_22                0.006868
DayOfWeek_Sunday       0.005557
DayOfWeek_Monday       0.003912
hour_23                0.001727
DayOfWeek_Thursday     0.001573
DayOfWeek_Wednesday    0.001524
DayOfWeek_Tuesday      0.00

In [30]:
import time
# Get the feature names from the training data
feature_names = X_train.columns.tolist()

# Create a list to store the predictions
predictions = []


# Get the current date and hour
current_date = pd.to_datetime(time.time(), unit='s')
current_hour = current_date.hour


# Create new data points for the next 24 hours
for i in range(24):
    new_data = pd.Series(index=feature_names)
    new_data['date'] = (current_date + pd.DateOffset(hours=i)).strftime('%Y%m%d')
    
    # Set all hour features to 0
    for h in range(24):
        new_data['hour_' + str(h)] = 0
    new_data['hour_' + str((current_hour + i) % 24)] = 1  # Set the corresponding hour feature to 1

    # Set all DayOfWeek features to 0
    for dow in ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']:
        new_data['DayOfWeek_' + dow] = 0

    # Determine the day of the week
    day_of_week = (current_date + pd.DateOffset(hours=i)).day_name()
    new_data['DayOfWeek_' + day_of_week] = 1

    # Determine whether it's a weekend
    new_data['IsWeekend'] = 1 if day_of_week in ['Saturday', 'Sunday'] else 0

    # Convert the Series to a DataFrame
    new_data_df = new_data.to_frame().transpose()

    # Make a prediction for the new data point
    new_pred = model.predict(new_data_df)
    predictions.append(new_pred)


print(f"Predicted population for the next 24 hours: {predictions}")


Predicted population for the next 24 hours: [array([34713.8528]), array([35913.4222]), array([37023.1308]), array([41683.2557]), array([52981.1075]), array([84004.9762]), array([101906.5143]), array([108714.8055]), array([114138.4549]), array([115486.343]), array([116039.5069]), array([114379.512]), array([112631.0905]), array([109423.6606]), array([98534.5294]), array([73978.7767]), array([57691.1701]), array([49381.8492]), array([44234.5094]), array([41141.1275]), array([36234.7926]), array([35362.2924]), array([35348.7667]), array([34862.9187])]


In [31]:
import math
for index, element in enumerate(predictions):
    print(f"Predicted population for {index + 1} hour(s) ahead: {math.ceil(element[0])}")


Predicted population for 1 hour(s) ahead: 34714
Predicted population for 2 hour(s) ahead: 35914
Predicted population for 3 hour(s) ahead: 37024
Predicted population for 4 hour(s) ahead: 41684
Predicted population for 5 hour(s) ahead: 52982
Predicted population for 6 hour(s) ahead: 84005
Predicted population for 7 hour(s) ahead: 101907
Predicted population for 8 hour(s) ahead: 108715
Predicted population for 9 hour(s) ahead: 114139
Predicted population for 10 hour(s) ahead: 115487
Predicted population for 11 hour(s) ahead: 116040
Predicted population for 12 hour(s) ahead: 114380
Predicted population for 13 hour(s) ahead: 112632
Predicted population for 14 hour(s) ahead: 109424
Predicted population for 15 hour(s) ahead: 98535
Predicted population for 16 hour(s) ahead: 73979
Predicted population for 17 hour(s) ahead: 57692
Predicted population for 18 hour(s) ahead: 49382
Predicted population for 19 hour(s) ahead: 44235
Predicted population for 20 hour(s) ahead: 41142
Predicted population 