In [1]:
import pandas as pd
import numpy as np
import os
import missingno as msno
import seaborn as sns
import datetime as dt
import matplotlib.pyplot as plt
from scipy.stats import mannwhitneyu
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from itertools import product
import itertools



In [2]:
# from tensorflow.python.client import device_lib

# device_lib.list_local_devices()

In [3]:
train = pd.read_csv('/kaggle/input/prediction-of-tourist-arrivals/train_df.csv')
test = pd.read_csv('/kaggle/input/prediction-of-tourist-arrivals/test_df.csv')

In [4]:
train

Unnamed: 0,id,date,tourist_area,spot_facility,tourist_arrivals,area,city,type,category,tourism_index,info,event,weather_index
0,0,2018/8/1,1,A,454,A,A,A,13,1750.0,A,A,40.05
1,1,2018/8/1,1,B,823,A,A,A,13,1750.0,A,A,40.05
2,2,2018/8/1,1,C,149,A,A,A,13,1750.0,A,A,40.05
3,3,2018/8/1,1,D,157,A,A,A,13,1750.0,A,A,40.05
4,4,2018/8/1,1,E,14,A,A,A,13,1750.0,A,A,40.05
...,...,...,...,...,...,...,...,...,...,...,...,...,...
132187,132187,2019/5/31,9,D,149,A,A,C,6,1998.0,A,A,48.29
132188,132188,2019/5/31,9,E,56,A,A,C,6,1998.0,A,A,48.29
132189,132189,2019/5/31,9,F,353,A,A,C,6,1998.0,A,A,48.29
132190,132190,2019/5/31,9,G,35,A,A,C,6,1998.0,A,A,48.29


In [5]:
test

Unnamed: 0,id,date,tourist_area,spot_facility,area,city,type,category,tourism_index,info,event,weather_index
0,145584,2019/7/1,1,A,A,A,A,13,1362,A,A,
1,145585,2019/7/1,1,B,A,A,A,13,1362,A,A,
2,145586,2019/7/1,1,C,A,A,A,13,1362,A,A,
3,145587,2019/7/1,1,D,A,A,A,13,1362,A,A,
4,145588,2019/7/1,1,E,A,A,A,13,1362,A,A,
...,...,...,...,...,...,...,...,...,...,...,...,...
13387,158971,2019/7/31,9,D,A,A,C,6,2040,A,A,50.21
13388,158972,2019/7/31,9,E,A,A,C,6,2040,A,A,50.21
13389,158973,2019/7/31,9,F,A,A,C,6,2040,A,A,50.21
13390,158974,2019/7/31,9,G,A,A,C,6,2040,A,A,50.21


In [6]:
train_y = train["tourist_arrivals"]
train_y

0         454
1         823
2         149
3         157
4          14
         ... 
132187    149
132188     56
132189    353
132190     35
132191    377
Name: tourist_arrivals, Length: 132192, dtype: int64

In [7]:
train = train.drop("tourist_arrivals", axis = 1)

In [8]:
combined = pd.concat([train, test], axis=0, ignore_index=True)

In [9]:
combined

Unnamed: 0,id,date,tourist_area,spot_facility,area,city,type,category,tourism_index,info,event,weather_index
0,0,2018/8/1,1,A,A,A,A,13,1750.0,A,A,40.05
1,1,2018/8/1,1,B,A,A,A,13,1750.0,A,A,40.05
2,2,2018/8/1,1,C,A,A,A,13,1750.0,A,A,40.05
3,3,2018/8/1,1,D,A,A,A,13,1750.0,A,A,40.05
4,4,2018/8/1,1,E,A,A,A,13,1750.0,A,A,40.05
...,...,...,...,...,...,...,...,...,...,...,...,...
145579,158971,2019/7/31,9,D,A,A,C,6,2040.0,A,A,50.21
145580,158972,2019/7/31,9,E,A,A,C,6,2040.0,A,A,50.21
145581,158973,2019/7/31,9,F,A,A,C,6,2040.0,A,A,50.21
145582,158974,2019/7/31,9,G,A,A,C,6,2040.0,A,A,50.21


In [10]:
combined.isnull().sum()

id                   0
date                 0
tourist_area         0
spot_facility        0
area                 0
city                 0
type                 0
category             0
tourism_index     3992
info                 0
event                0
weather_index    46224
dtype: int64

In [11]:
# пока без замены Nan, попробую встроенным в CatBoost
# columns_to_fill = ["tourism_index", "weather_index"]
# combined[columns_to_fill] = combined[columns_to_fill].fillna(combined[columns_to_fill].median())
# combined.isnull().sum()

In [12]:
categorialFeatures = ["spot_facility","area", "city", "type", "info", "event"]
unique_values = combined[categorialFeatures].apply(lambda x: x.nunique())
unique_values

spot_facility     8
area             22
city             16
type              5
info              4
event             7
dtype: int64

In [13]:
combined['date'] = pd.to_datetime(combined['date'])

In [14]:
combined['year'] = combined['date'].dt.year
combined['month'] = combined['date'].dt.month
combined['day'] = combined['date'].dt.day
combined['dayofweek'] = combined['date'].dt.dayofweek

In [15]:
def get_season(month):
    if month in [1, 2, 3]:
        return 'spring'
    elif month in [4, 5, 6]:
        return 'summer'
    elif month in [7, 8, 9]:
        return 'fall'
    else:
        return 'winter'
    
def get_season_num(month):
    if month in [1, 2, 3]:
        return 0
    elif month in [4, 5, 6]:
        return 1
    elif month in [7, 8, 9]:
        return 2
    else:
        return 3

In [16]:
combined['season'] = combined['month'].apply(get_season)
combined['season_num'] = combined['month'].apply(get_season_num)

In [17]:
combined['weekday'] = combined['dayofweek'] > 5

In [18]:
combined

Unnamed: 0,id,date,tourist_area,spot_facility,area,city,type,category,tourism_index,info,event,weather_index,year,month,day,dayofweek,season,season_num,weekday
0,0,2018-08-01,1,A,A,A,A,13,1750.0,A,A,40.05,2018,8,1,2,fall,2,False
1,1,2018-08-01,1,B,A,A,A,13,1750.0,A,A,40.05,2018,8,1,2,fall,2,False
2,2,2018-08-01,1,C,A,A,A,13,1750.0,A,A,40.05,2018,8,1,2,fall,2,False
3,3,2018-08-01,1,D,A,A,A,13,1750.0,A,A,40.05,2018,8,1,2,fall,2,False
4,4,2018-08-01,1,E,A,A,A,13,1750.0,A,A,40.05,2018,8,1,2,fall,2,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145579,158971,2019-07-31,9,D,A,A,C,6,2040.0,A,A,50.21,2019,7,31,2,fall,2,False
145580,158972,2019-07-31,9,E,A,A,C,6,2040.0,A,A,50.21,2019,7,31,2,fall,2,False
145581,158973,2019-07-31,9,F,A,A,C,6,2040.0,A,A,50.21,2019,7,31,2,fall,2,False
145582,158974,2019-07-31,9,G,A,A,C,6,2040.0,A,A,50.21,2019,7,31,2,fall,2,False


In [19]:
def encode(df, col):
    df[col + '_cos'] = np.cos(2 * np.pi * df[col] / df[col].max())
    df[col + '_sin'] = np.sin(2 * np.pi * df[col] / df[col].max())
    return df

In [20]:
combined = encode(combined, 'month')
combined = encode(combined, 'day')
combined = encode(combined, 'dayofweek')
combined = encode(combined, 'season_num')

In [21]:
combined = combined.drop(['date','id', 'month', 'day', 'dayofweek', 'season_num'], axis=1)

In [22]:
pd.options.display.max_columns = None
combined

Unnamed: 0,tourist_area,spot_facility,area,city,type,category,tourism_index,info,event,weather_index,year,season,weekday,month_cos,month_sin,day_cos,day_sin,dayofweek_cos,dayofweek_sin,season_num_cos,season_num_sin
0,1,A,A,A,A,13,1750.0,A,A,40.05,2018,fall,False,-0.500000,-0.866025,0.97953,2.012985e-01,-0.5,0.866025,-0.5,-0.866025
1,1,B,A,A,A,13,1750.0,A,A,40.05,2018,fall,False,-0.500000,-0.866025,0.97953,2.012985e-01,-0.5,0.866025,-0.5,-0.866025
2,1,C,A,A,A,13,1750.0,A,A,40.05,2018,fall,False,-0.500000,-0.866025,0.97953,2.012985e-01,-0.5,0.866025,-0.5,-0.866025
3,1,D,A,A,A,13,1750.0,A,A,40.05,2018,fall,False,-0.500000,-0.866025,0.97953,2.012985e-01,-0.5,0.866025,-0.5,-0.866025
4,1,E,A,A,A,13,1750.0,A,A,40.05,2018,fall,False,-0.500000,-0.866025,0.97953,2.012985e-01,-0.5,0.866025,-0.5,-0.866025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145579,9,D,A,A,C,6,2040.0,A,A,50.21,2019,fall,False,-0.866025,-0.500000,1.00000,-2.449294e-16,-0.5,0.866025,-0.5,-0.866025
145580,9,E,A,A,C,6,2040.0,A,A,50.21,2019,fall,False,-0.866025,-0.500000,1.00000,-2.449294e-16,-0.5,0.866025,-0.5,-0.866025
145581,9,F,A,A,C,6,2040.0,A,A,50.21,2019,fall,False,-0.866025,-0.500000,1.00000,-2.449294e-16,-0.5,0.866025,-0.5,-0.866025
145582,9,G,A,A,C,6,2040.0,A,A,50.21,2019,fall,False,-0.866025,-0.500000,1.00000,-2.449294e-16,-0.5,0.866025,-0.5,-0.866025


In [23]:
scaler = StandardScaler()
columns_to_normalize = [0, 5, 6, 9, 10]
combined_norm = combined.copy()
normalized_columns = scaler.fit_transform(combined.iloc[:, columns_to_normalize])
combined_norm.iloc[:, columns_to_normalize] = normalized_columns

In [24]:
combined_norm

Unnamed: 0,tourist_area,spot_facility,area,city,type,category,tourism_index,info,event,weather_index,year,season,weekday,month_cos,month_sin,day_cos,day_sin,dayofweek_cos,dayofweek_sin,season_num_cos,season_num_sin
0,-1.700267,A,A,A,A,0.971780,0.083261,A,A,-2.646596,-1.090097,fall,False,-0.500000,-0.866025,0.97953,2.012985e-01,-0.5,0.866025,-0.5,-0.866025
1,-1.700267,B,A,A,A,0.971780,0.083261,A,A,-2.646596,-1.090097,fall,False,-0.500000,-0.866025,0.97953,2.012985e-01,-0.5,0.866025,-0.5,-0.866025
2,-1.700267,C,A,A,A,0.971780,0.083261,A,A,-2.646596,-1.090097,fall,False,-0.500000,-0.866025,0.97953,2.012985e-01,-0.5,0.866025,-0.5,-0.866025
3,-1.700267,D,A,A,A,0.971780,0.083261,A,A,-2.646596,-1.090097,fall,False,-0.500000,-0.866025,0.97953,2.012985e-01,-0.5,0.866025,-0.5,-0.866025
4,-1.700267,E,A,A,A,0.971780,0.083261,A,A,-2.646596,-1.090097,fall,False,-0.500000,-0.866025,0.97953,2.012985e-01,-0.5,0.866025,-0.5,-0.866025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145579,-1.186979,D,A,A,C,-0.533682,0.381076,A,A,0.390055,0.917349,fall,False,-0.866025,-0.500000,1.00000,-2.449294e-16,-0.5,0.866025,-0.5,-0.866025
145580,-1.186979,E,A,A,C,-0.533682,0.381076,A,A,0.390055,0.917349,fall,False,-0.866025,-0.500000,1.00000,-2.449294e-16,-0.5,0.866025,-0.5,-0.866025
145581,-1.186979,F,A,A,C,-0.533682,0.381076,A,A,0.390055,0.917349,fall,False,-0.866025,-0.500000,1.00000,-2.449294e-16,-0.5,0.866025,-0.5,-0.866025
145582,-1.186979,G,A,A,C,-0.533682,0.381076,A,A,0.390055,0.917349,fall,False,-0.866025,-0.500000,1.00000,-2.449294e-16,-0.5,0.866025,-0.5,-0.866025


In [25]:
#CatBoost

In [26]:
train_processed_cb = combined_norm.iloc[:len(train), :]
test_processed_cb = combined_norm.iloc[len(train):, :]

In [27]:
categorialFeatures = ["spot_facility","area", "city", "type", "info", "event","season","weekday"]

In [28]:
model_cb = CatBoostRegressor(iterations = 11000, 
                             learning_rate=0.07, 
                             depth = 7, 
                             task_type = 'GPU', 
                             cat_features = categorialFeatures,
                             early_stopping_rounds=100)


model_cb.fit(train_processed_cb, train_y, verbose=500)

predictions = model_cb.predict(test_processed_cb)

pd_pred = pd.DataFrame(predictions)

ids = test["id"]
answer = pd.DataFrame()
answer["id"] = ids
answer["tourist_arrivals"] = pd_pred
answer["tourist_arrivals"] = np.maximum(answer["tourist_arrivals"], 0)
answer.to_csv('Sub_CB_polu_norm_11000_7_0.07.csv', index=False)

0:	learn: 425.3536441	total: 32.3ms	remaining: 5m 55s
500:	learn: 86.1727589	total: 12.1s	remaining: 4m 12s
1000:	learn: 75.9129156	total: 24.5s	remaining: 4m 4s
1500:	learn: 70.4757667	total: 37.2s	remaining: 3m 55s
2000:	learn: 66.7479969	total: 49.6s	remaining: 3m 43s
2500:	learn: 64.0145378	total: 1m 2s	remaining: 3m 31s
3000:	learn: 61.5929686	total: 1m 15s	remaining: 3m 20s
3500:	learn: 59.3802577	total: 1m 27s	remaining: 3m 7s
4000:	learn: 57.5986287	total: 1m 40s	remaining: 2m 56s
4500:	learn: 56.1413129	total: 1m 53s	remaining: 2m 44s
5000:	learn: 54.9073624	total: 2m 6s	remaining: 2m 32s
5500:	learn: 53.6804202	total: 2m 20s	remaining: 2m 20s
6000:	learn: 52.6373272	total: 2m 33s	remaining: 2m 7s
6500:	learn: 51.6089869	total: 2m 46s	remaining: 1m 55s
7000:	learn: 50.6479464	total: 2m 59s	remaining: 1m 42s
7500:	learn: 49.8606094	total: 3m 13s	remaining: 1m 30s
8000:	learn: 49.0046189	total: 3m 26s	remaining: 1m 17s
8500:	learn: 48.2779113	total: 3m 40s	remaining: 1m 4s
9000:

In [None]:
grid = {'learning_rate': [0.05],
        'depth': [12],
        'iterations': [1200],
        'loss_function': ['RMSE'],
        'cat_features': [categorialFeatures],
        'verbose': [400],
        'task_type': ['GPU'],}



model_cb = CatBoostRegressor()


grid_search = GridSearchCV(estimator=model_cb, param_grid=grid, cv=2)


grid_search.fit(train_processed_cb, train_y)
print("Лучшие параметры:", grid_search.best_params_)
best_params = grid_search.best_params_
model_cb = CatBoostRegressor(iterations=best_params['iterations'],
                             learning_rate=best_params['learning_rate'],
                             depth=best_params['depth'],
                             loss_function=best_params['loss_function'],
                             cat_features=best_params['cat_features'],
                             verbose=best_params['verbose'],
                             task_type=best_params['task_type'])

model_cb.fit(train_processed_cb, train_y)
predictions = model_cb.predict(test_processed_cb)

pd_pred = pd.DataFrame(predictions)

ids = test["id"]
answer = pd.DataFrame()
answer["id"] = ids
answer["tourist_arrivals"] = pd_pred
answer["tourist_arrivals"] = np.maximum(answer["tourist_arrivals"], 0)
answer.to_csv('Sub_CB_new_f_i_grid.csv', index=False)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(train_processed_cb, train_y, test_size=0.2, random_state=42)

In [None]:
import warnings

# Отключение предупреждений
warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn.utils.validation")

In [None]:
from tqdm import tqdm

In [None]:
param_grid = {'learning_rate': [0.05, 0.1, 0.2],
              'depth': [10, 12, 15],
              'iterations': [2000, 2500],
              'loss_function': ['RMSE'],
              'cat_features': [categorialFeatures],
              'verbose': [False],
              'task_type': ['GPU']}

learning_rates = []
depths = []
iterations = []
rmse_values = []

min_rmse = float('inf')
best_params = None

iterator = tqdm(itertools.product(*param_grid.values()), total=len(list(itertools.product(*param_grid.values()))), desc="Hyperparameter Tuning")
# Обучение и тестирование модели с разными гиперпараметрами

for params in iterator:
    param_dict = dict(zip(param_grid.keys(), params))

    model = CatBoostRegressor(**param_dict)
    model.fit(X_train, y_train)

    # Оценка на тестовом наборе
    predictions = model.predict(X_test)
    
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)
    
    iterator.set_postfix(RMSE=rmse)
    
    if rmse < min_rmse:
        min_rmse = rmse
        best_params = param_dict
        
    learning_rates.append(param_dict['learning_rate'])
    depths.append(param_dict['depth'])
    iterations.append(param_dict['iterations'])
    rmse_values.append(rmse)

iterator.close()
    
print(f"\nBest Parameters: {best_params}")
print(f"Best Root Mean Squared Error (RMSE): {min_rmse}")

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(projection='3d')
ax.scatter(learning_rates, depths, iterations, c=rmse_values, cmap='viridis')
ax.set_xlabel('Learning Rate')
ax.set_ylabel('Depth')
ax.set_zlabel('Iterations')
ax.set_title('RMSE for Different Hyperparameters')
plt.show()


model_cb = CatBoostRegressor(iterations=best_params['iterations'],
                             learning_rate=best_params['learning_rate'],
                             depth=best_params['depth'],
                             loss_function = 'RMSE',
                             cat_features = categorialFeatures,
                             verbose= False,
                             task_type = 'GPU')

model_cb.fit(train_processed_cb, train_y)
predictions = model_cb.predict(test_processed_cb)

pd_pred = pd.DataFrame(predictions)

ids = test["id"]
answer = pd.DataFrame()
answer["id"] = ids
answer["tourist_arrivals"] = pd_pred
answer["tourist_arrivals"] = np.maximum(answer["tourist_arrivals"], 0)
answer.to_csv('Sub_CB_night.csv', index=False)

In [None]:
model_cb = CatBoostRegressor(iterations=best_params['iterations'],
                             learning_rate=best_params['learning_rate'],
                             depth=best_params['depth'],
                             loss_function = 'RMSE',
                             cat_features = categorialFeatures,
                             verbose= 500,
                             task_type = 'GPU')

model_cb.fit(train_processed_cb, train_y)
predictions = model_cb.predict(test_processed_cb)

pd_pred = pd.DataFrame(predictions)

ids = test["id"]
answer = pd.DataFrame()
answer["id"] = ids
answer["tourist_arrivals"] = pd_pred
answer["tourist_arrivals"] = np.maximum(answer["tourist_arrivals"], 0)
answer.to_csv('Sub_CB_night.csv', index=False)

In [None]:
answer.to_csv('Sub_CB_night3.csv', index=False)

In [None]:
#One-hot-encoding

In [None]:
combined_o_h = pd.get_dummies(combined, columns = categorialFeatures)

In [None]:
combined_o_h = combined_o_h.replace([False, True], [0, 1])


In [None]:
combined_o_h


In [None]:
train_processed = combined_o_h.iloc[:len(train), :]
test_processed = combined_o_h.iloc[len(train):, :]

train_processed_np = np.array(train_processed)
train_y_np = np.array(train_y).ravel()
test_processed_np = np.array(test_processed)

In [None]:
sc = StandardScaler()
train_processed_np_sc = sc.fit_transform(train_processed_np)
test_processed_np_sc = sc.transform(test_processed_np)

In [None]:
lr = LinearRegression()

In [None]:
lr.fit(train_processed_np_sc, train_y_np)

In [None]:
pred = lr.predict(test_processed_np_sc)

In [None]:
pd_pred = pd.DataFrame(predictions)

ids = test["id"]
answer = pd.DataFrame()
answer["id"] = ids
answer["tourist_arrivals"] = pd_pred
answer["tourist_arrivals"] = np.maximum(answer["tourist_arrivals"], 0)
answer.to_csv('Sub_CB_v4_grid.csv', index=False)

In [None]:
(answer["tourist_arrivals"]< 0).sum()

In [None]:
answer.to_csv('/kaggle/working/Sub_CB_night.csv', index=False)