# Libraries and functions that I'll be using

In [None]:
! pip install sklearn

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

from scipy.stats import zscore
from sklearn.metrics import mean_squared_error
import xgboost as xgb


plt.style.use('fivethirtyeight')

ModuleNotFoundError: No module named 'sklearn'

# 1. Let's take a look at the data 

In [None]:
# Read the raw CSV; 'date' is still an ordinary column at this point
data = pd.read_csv('./train.csv')

In [None]:
# Index the rows by date.
# NOTE: not re-runnable as-is — after the first run 'date' is no longer a column.
data  = data.set_index('date')

In [None]:
# Parse the index into datetimes; create_features() reads DatetimeIndex attributes
# (dayofweek, quarter, ...) from it.
data.index = pd.to_datetime(data.index)
data.index

In [None]:
data.head()

In [None]:
data.describe()

In [None]:
data.plot(kind='hist',bins=500)

In [None]:
data.plot(style='.',
          figsize=(15,5),
          title='Sleeping hours')

# 2. Removing outliers

In [None]:
data.describe()

In [None]:
# def remove_outlier(data,threshold=2.5):
#     # create a new colum representing the z-score for each entery
#     if 'sleep_hours_zscore' not in data.columns.values[0]:
#         data['sleep_hours_zscore'] = zscore(data['sleep_hours'])
#         # threshold = 3, so we will remove any entry if its z-score greater than 3
#         z_threshold = 2.5
#         data = data[data['sleep_hours_zscore'].abs() < z_threshold]
#         return data
#     print('sleep_hours_zscore already exists')
#     return data

# The values recorded from 2018 to 2019 appear to have been doubled, so halve them.
# Rows and column are addressed in ONE .iloc call: the chained form
# data["sleep_hours"].iloc[...] /= 2 writes to an intermediate Series and may
# leave `data` unchanged under pandas copy-on-write.
# NOTE(review): 833:1090 are row positions, so this assumes the row order of
# train.csv; re-running the cell halves the same rows again.
sleep_col = data.columns.get_loc("sleep_hours")
data.iloc[833:1090, sleep_col] /= 2

In [None]:
data.plot(style='.',
          figsize=(15,5),
          title='Sleeping hours')

In [None]:
data.head()

In [None]:
data.describe()

In [None]:
data['sleep_hours'].plot(style='.',
          figsize=(15,5),
          title='Sleeping hours')

In [None]:
# What a single month looks like (20 Feb – 20 Mar 2015, both end dates excluded)
data.loc[
    (data.index > '2015-02-20') & (data.index < '2015-03-20'),
    'sleep_hours',
].plot(title="One month", figsize=(15, 5))

In [None]:
data

# 3. Create features

In [None]:
def create_features(data):
    '''
    Derive calendar features from the frame's DatetimeIndex.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose index is a DatetimeIndex. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with the columns ``week_days``, ``quarter``,
        ``month``, ``year_days``, ``dayofmonth`` and ``weekofyear``
        added (or overwritten if already present).
    '''
    data = data.copy()
    index = data.index
    data['week_days'] = index.dayofweek
    data['quarter'] = index.quarter
    data['month'] = index.month
    data['year_days'] = index.dayofyear
    data['dayofmonth'] = index.day
    # isocalendar() returns the nullable UInt32 dtype; cast here so that every
    # call (not just the first one) yields a plain integer column.
    data['weekofyear'] = index.isocalendar().week.astype('int64').to_numpy()
    return data
data = create_features(data)

In [None]:
data.head()

In [None]:
# isocalendar().week is stored as a nullable unsigned-integer column; convert it to a plain int
data['weekofyear'] = data['weekofyear'].astype(int)

In [None]:
data.info()

# 4. Cross validation

In [None]:
from sklearn.model_selection import TimeSeriesSplit

# 4 folds, each validated on a window of 250 rows
tss = TimeSeriesSplit(n_splits=4,test_size=250)
# the splitter works on row positions, so the rows must be in chronological order
data = data.sort_index()

In [None]:
# One panel per fold. The panel count is taken from the splitter so the two
# cannot drift apart (a hard-coded 5 left an empty panel for a 4-fold splitter).
n_folds = tss.get_n_splits()
fig, axs = plt.subplots(n_folds, 1, figsize=(15, 3 * n_folds), sharex=True)

for fold, (train_idx, val_idx) in enumerate(tss.split(data)):
    train = data.iloc[train_idx]
    test = data.iloc[val_idx]
    ax = axs[fold]
    train['sleep_hours'].plot(ax=ax,
                              label='Training Set',
                              title=f'Data Train/Test Split Fold {fold}')
    test['sleep_hours'].plot(ax=ax,
                             label='Test Set')
    # dashed line marks where the validation window begins
    ax.axvline(test.index.min(), color='black', ls='--')
    # the labels above are only shown once a legend is drawn
    ax.legend()
plt.show()

# 5. Lag Features


In [None]:
def adding_lags(data, lag_days=(182, 91, 45, 13, 7, 1)):
    '''
    Add lagged copies of ``sleep_hours`` as columns ``lag_1`` ... ``lag_N``.

    For every row, ``lag_k`` holds the ``sleep_hours`` value found at the
    row's date minus ``lag_days[k - 1]`` days, or NaN when that date is not
    present in the index.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a DatetimeIndex and a ``sleep_hours`` column. It is not
        modified (consistent with ``create_features``).
    lag_days : sequence of int, optional
        Lag lengths in days, in column order. The default reproduces the
        original six lags (182, 91, 45, 13, 7 and 1 days).

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with one ``lag_k`` column per entry of ``lag_days``.
    '''
    # work on a copy so the caller's frame (possibly a slice) is left untouched
    data = data.copy()
    target_by_date = data['sleep_hours'].to_dict()

    for position, days in enumerate(lag_days, start=1):
        shifted_dates = data.index - pd.Timedelta(days=days)
        # dates missing from the mapping become NaN
        data[f'lag_{position}'] = shifted_dates.map(target_by_date)

    return data

    

In [None]:
data = adding_lags(data)

In [None]:
data.columns


In [None]:
data.info()

# Time to train using cross validation

In [None]:
tss = TimeSeriesSplit(n_splits=5)
data = data.sort_index()

# Loop-invariant configuration, defined once instead of on every fold.
# NOTE(review): lag_5 and lag_6 are produced by adding_lags() but are not used as features.
features = ['week_days', 'quarter', 'month', 'year_days',
       'dayofmonth', 'lag_1', 'lag_2', 'lag_3', 'lag_4']
target = 'sleep_hours'

predicts = []  # per-fold prediction arrays, in fold order
scores = []    # per-fold RMSE, in fold order

for train_index, test_index in tss.split(data):
    # `data` already carries the calendar features added earlier, so
    # create_features() is not re-applied to the slices.
    train = data.iloc[train_index]
    test = data.iloc[test_index]

    X_train, y_train = train[features], train[target]
    X_test, y_test = test[features], test[target]

    reg_model = xgb.XGBRegressor(base_score=0.5,
                                 booster='gbtree',
                                 n_estimators=7000,
                                 early_stopping_rounds=50,
                                 learning_rate=0.001,
                                 objective='reg:squarederror',
                                 max_depth=4)
    # NOTE(review): early stopping watches the last eval_set entry, i.e. the
    # held-out fold, so the fold RMSE below is somewhat optimistic.
    reg_model.fit(X_train, y_train,
                  eval_set=[(X_train, y_train), (X_test, y_test)],
                  verbose=1000)

    y_predict = reg_model.predict(X_test)
    predicts.append(y_predict)
    score = np.sqrt(mean_squared_error(y_test, y_predict))
    scores.append(score)

In [None]:
print(f'Score across folds {np.mean(scores):0.4f}')
print(f'Fold scores:{scores}')

# Time to retrain on all data

In [None]:
# Find the best hyper-parameters for the model
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from xgboost import XGBRegressor

# Define the parameter grid (3**6 = 729 combinations)
param_grid = {
    'learning_rate': [0.1, 0.01, 0.001],
    'max_depth': [3, 5, 7],
    'n_estimators': [50, 100, 200],
    'subsample': [0.5, 0.75, 1],
    'colsample_bytree': [0.5, 0.75, 1],
    'gamma': [0, 0.1, 0.2],
}

# Create the XGBRegressor model
xgb_model = XGBRegressor(objective='reg:squarederror')

# Define the GridSearchCV.
# An integer `cv` means plain K-fold for a regressor, which would validate on
# rows that precede the training rows; a time-ordered splitter avoids that.
# Score with RMSE so the search optimises the same metric reported per fold.
grid_search = GridSearchCV(estimator=xgb_model,
                           param_grid=param_grid,
                           cv=TimeSeriesSplit(n_splits=5),
                           scoring='neg_root_mean_squared_error')

# Fit on the full training frame rather than on the X_train / y_train
# variables left behind by the last fold of the cross-validation loop.
grid_search.fit(data[features], data[target])

# Print the best hyperparameters
print(grid_search.best_params_)

In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

features = ['week_days', 'quarter', 'month', 'year_days',
       'dayofmonth', 'lag_1', 'lag_2', 'lag_3', 'lag_4']
target = 'sleep_hours'

all_X = data[features]
all_y = data[target]

# No early stopping for the final fit: the only evaluation set available is
# the training data itself, so a stopping rule based on it carries no
# information about generalisation. eval_set is kept purely for progress logs.
reg_model = xgb.XGBRegressor(base_score=0.5,
                             booster='gbtree',
                             n_estimators=8000,
                             learning_rate=0.001,
                             objective='reg:squarederror',
                             max_depth=4)
reg_model.fit(all_X, all_y,
              eval_set=[(all_X, all_y)],
              verbose=1000)





# Predicting the test data

In [None]:
# Load the test file, index it by date and discard its 'sleep_hours' column
data_test = (
    pd.read_csv('./test.csv')
    .set_index('date')
    .drop(columns='sleep_hours')
)
data_test.index = pd.to_datetime(data_test.index)

In [None]:

data_test.head()

In [None]:
# Flag each row's origin so the test rows can be separated again afterwards
data_test['isFuture'] = True
data['isFuture'] = False

# Stack both frames, then rebuild calendar and lag features on the combined
# frame so the test rows can look up lags among the training rows
data_and_test_data = adding_lags(
    create_features(pd.concat([data, data_test]))
)
data_and_test_data

In [None]:
# Keep only the rows flagged as future, as an independent copy
future_data = data_and_test_data[data_and_test_data['isFuture']].copy()
future_data

# Now let's predict the test data

In [None]:
# Fill the (empty) target column of the future rows with the model's predictions
future_data['sleep_hours'] = reg_model.predict(future_data[features])
future_data

In [None]:
# Single-column frame (date index + predicted sleep_hours) for the submission
result = future_data[['sleep_hours']].copy()
result

In [None]:
result.to_csv('submission.csv')

In [None]:
# Read back the file this notebook writes ('submission.csv') to sanity-check it;
# the previous path pointed at a file the notebook never creates.
res = pd.read_csv('submission.csv')
res