In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [21]:
import pandas as pd

# Load the comma-separated, EUC-KR encoded population file.
# Raw string: the Windows path contains backslashes (\C, \W, \d) that are not
# valid escape sequences and trigger a warning in a normal string literal.
df = pd.read_csv(r'E:\CJH\Workspace\CrowdPredict\dong.csv', delimiter=',', encoding='euc-kr')

dongCode = '11110515'

# Keep only the rows for the selected code and the three columns we need.
# .copy() makes the subset an independent frame, so the column assignments
# below never act on a view of the raw data.
df = df.loc[
    df['행정동코드'].astype(str) == dongCode,
    ['기준일ID', '시간대구분', '총생활인구수'],
].copy()

if df.empty:
    raise ValueError(f"No rows found for 행정동코드 == {dongCode!r}")

# Rename the columns for easier understanding
df = df.rename(columns={
    '기준일ID': 'date',
    '시간대구분': 'hour',
    '총생활인구수': 'total_population'
})

# Parse the YYYYMMDD identifier once and derive every calendar feature from it,
# instead of converting datetime -> Unix -> datetime -> Unix repeatedly.
dates = pd.to_datetime(df['date'], format='%Y%m%d')

# Store 'date' as Unix seconds. Subtracting the epoch and floor-dividing by one
# second does not depend on the internal datetime resolution.
df['date'] = (dates - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)

# One-hot encode the day of the week, then the hour slot
# (this order keeps the resulting column order unchanged).
df['DayOfWeek'] = dates.dt.day_name()
df = pd.get_dummies(df, columns=['DayOfWeek'])
df = pd.get_dummies(df, columns=['hour'])

# Define the specific holidays
holidays = ['2023-05-05', '2023-05-29', '2023-06-06']

# 'IsWeekend' is 1 for Saturdays, Sundays and the listed holidays.
# It is computed from the parsed dates rather than from the dummy columns, so
# it also works when the subset happens to contain no Saturday or Sunday rows.
is_weekend = dates.dt.dayofweek >= 5  # Monday=0 ... Sunday=6
is_holiday = dates.dt.strftime('%Y-%m-%d').isin(holidays)
df['IsWeekend'] = (is_weekend | is_holiday).astype(int)


In [None]:
# Preview the first rows of the engineered feature table.
print(df.head())

# DataFrame.info() writes its summary to stdout itself and returns None;
# wrapping it in print() would only add a stray "None" line.
df.info()

In [23]:
from sklearn.model_selection import train_test_split

# Separate the regression target from the predictor columns.
target_col = 'total_population'
y = df[target_col]
X = df.drop(columns=[target_col])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=47,
)


In [27]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.metrics import mean_squared_error

# Build a depth-limited regression tree and fit it on the training split
# (fit() returns the estimator itself, so the calls can be chained).
tree_params = dict(max_depth=10, min_samples_split=10, random_state=47)
model = DecisionTreeRegressor(**tree_params).fit(X_train, y_train)

# Score the held-out test split with mean squared error.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# 5-fold cross-validation on the training split. scikit-learn reports the
# negated MSE, so the sign is flipped back when averaging.
scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)
avg_mse = -scores.mean()
print(f"Average MSE with 5-fold cross-validation: {avg_mse}")

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, broadcastable against ``y_true``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100`` computed over the entries
        whose true value is non-zero. A percentage error is undefined for a
        zero target, so those entries are skipped instead of turning the whole
        metric into ``inf``/``nan``. Returns ``nan`` when there is no non-zero
        target at all (including empty input).
    """
    # Converting to plain float arrays also discards any pandas index, so the
    # values are compared by position rather than by label.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    y_true, y_pred = np.broadcast_arrays(y_true, y_pred)

    nonzero = y_true != 0
    if not nonzero.any():
        return np.nan

    relative_error = (y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero]
    return np.mean(np.abs(relative_error)) * 100

# Relative error of the test-set predictions, in percent.
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f"Mean Absolute Percentage Error: {mape}%")

# Rank the features by the importance the fitted tree assigned to them.
importances = model.feature_importances_
importance_by_feature = pd.Series(importances, index=X_train.columns)
feature_importances = importance_by_feature.sort_values(ascending=False)
print("\nFeature importances:", feature_importances, sep="\n")



Mean Squared Error: 1836579.9591272639
Average MSE with 5-fold cross-validation: 1844481.2410134263

Feature importances:
hour_12                0.125175
hour_14                0.124104
hour_11                0.120227
hour_13                0.117659
hour_10                0.110623
hour_15                0.101470
date                   0.066741
hour_9                 0.064148
hour_16                0.048187
IsWeekend              0.044277
hour_8                 0.032905
hour_17                0.022192
DayOfWeek_Tuesday      0.008478
DayOfWeek_Saturday     0.005742
DayOfWeek_Sunday       0.003955
DayOfWeek_Wednesday    0.002459
DayOfWeek_Monday       0.001143
DayOfWeek_Thursday     0.000342
DayOfWeek_Friday       0.000170
hour_0                 0.000000
hour_1                 0.000000
hour_3                 0.000000
hour_7                 0.000000
hour_6                 0.000000
hour_5                 0.000000
hour_4                 0.000000
hour_18                0.000000
hour_19       

In [26]:
# Get the feature names (and their order) from the training data
feature_names = X_train.columns.tolist()

# Describe the point to predict. Every calendar feature below is derived from
# these values, the same way the training features were derived, so the
# one-hot flags cannot contradict the date.
# NOTE(review): the original cell hand-set DayOfWeek_Friday = 1 for this
# timestamp, but 1687996800 is Thursday 2023-06-29 00:00 UTC. If Friday was
# intended, use 1688083200 (2023-06-30) instead.
new_timestamp = 1687996800  # Unix timestamp (seconds)
new_hour = 19

new_date = pd.to_datetime(new_timestamp, unit='s')
day_col = f"DayOfWeek_{new_date.day_name()}"
hour_col = f"hour_{new_hour}"

# Fail loudly if the model was never trained on one of the needed columns;
# assigning to a missing label would silently append an extra feature.
missing = [col for col in ('date', day_col, hour_col, 'IsWeekend') if col not in feature_names]
if missing:
    raise ValueError(f"Features not present in the training data: {missing}")

# Start with every feature at 0 (explicit numeric dtype), then switch on the
# ones that apply to this point.
new_data = pd.Series(0.0, index=feature_names, dtype='float64')
new_data['date'] = new_timestamp
new_data[day_col] = 1
new_data[hour_col] = 1
# Same rule as in preprocessing: Saturday, Sunday or a listed holiday.
new_data['IsWeekend'] = int(
    new_date.dayofweek >= 5 or new_date.strftime('%Y-%m-%d') in holidays
)

# Convert the Series to a one-row DataFrame with the training column order
new_data_df = new_data.to_frame().transpose()

# Make a prediction for the new data point
new_pred = model.predict(new_data_df)

print(f"Predicted population: {new_pred[0]}")


Predicted population: 15705.880404510082
