<a href="https://colab.research.google.com/github/MalikAsadAftab/Seoul-Bike-Renting-Synthetic-Data-Generation/blob/main/Seoul_Bike_Renting_Synthetic_Data_Generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<URL-encoded download URL>" entries; the
# download loop below splits each entry on ':' and URL-decodes the second part.
# NOTE(review): the URL is a signed link stamped X-Goog-Date=20240508T132054Z
# with X-Goog-Expires=259200, so it has presumably expired by now -- confirm
# and regenerate it before relying on a fresh run.
DATA_SOURCE_MAPPING = 'seoulbikedata:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4950435%2F8335660%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240508%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240508T132054Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D06ffe6fba5ef09b9f67331a0e510eb3bd1d957af73b9379ee6db18eb894ca7b591963c749cbe2dbf4e8ab1aa6e1930f1bbad7a7b34dce2b5b523478eca7ca35a0db9e9dc63344a22f3883352918431a39846d25fb76fbd8c340b042595bd72dad3fe156b20359c278f5618fcd1f3902a052380dc2ad719f55f38c279df481254cb8dc79e503db00c33bd4a1319bf5a3789f1f70d03f9b9786dcf3e1c079ccb4239e4de6eaa29d2d230caa31d8546cc12ad3019d645f094939299e18332d1cf7a0444c0163b7ac58672a5affb7d8372bcd5026f1906f1cd3f57408d134390435cfa5df723400f0a2ac1dc8a707b9c7dabf420fe0060510d9f124904f97b544d3d'

# Directory that downloaded data sources are extracted into (one sub-directory
# per entry of DATA_SOURCE_MAPPING).
KAGGLE_INPUT_PATH='/kaggle/input'
# Working directory created next to the input directory.
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere in the visible part of this notebook.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<URL-encoded URL>" entry of DATA_SOURCE_MAPPING
# into a temporary file and unpack it under KAGGLE_INPUT_PATH/<directory>.
# A failed download is reported and skipped so the remaining sources still load.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a stray unencoded ':' in the URL part
    # cannot break the two-value unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length is optional; without it the progress bar simply
            # stays empty instead of int(None) raising a TypeError.
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / total_length) if total_length else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # The tar branch reopens the temp file by name, so buffered bytes
            # must be on disk before extraction starts.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: reusing `tarfile` here
                # would replace the imported module with a TarFile object and
                # break every later tar source (and any later use of the module).
                # NOTE(review): extractall() trusts member paths inside the
                # archive; only use this with sources you trust.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is handled before its base class OSError so an expired
        # signed URL gets the more specific message.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading seoulbikedata, 128783 bytes compressed
Downloaded and uncompressed: seoulbikedata
Data source import complete.


# **1. Importing Libraries & Data**
First we need to install the [SDV](https://docs.sdv.dev/sdv/installation) package. Then we will import some libraries that we will be using throughout our implementation.

In [2]:
# Install the SDV package through the shell; no version is pinned, so pip
# resolves whichever release is current when the cell runs.
!pip install sdv

Collecting sdv
  Downloading sdv-1.12.1-py3-none-any.whl (133 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.5/133.5 kB[0m [31m946.6 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting boto3>=1.28 (from sdv)
  Downloading boto3-1.34.100-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m886.9 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting botocore>=1.31 (from sdv)
  Downloading botocore-1.34.100-py3-none-any.whl (12.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.2/12.2 MB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
Collecting copulas>=0.11.0 (from sdv)
  Downloading copulas-0.11.0-py3-none-any.whl (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.9/51.9 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ctgan>=0.10.0 (from sdv)
  Downloading ctgan-0.10.0-py3-none-any.whl (24 kB)
Collecting deepecho>=0.6.0 (from sdv)
  Downloading deepecho-0

In [3]:
# Import necessary libraries
# xgboost and lightgbm are installed first (no version pinned) because both
# are imported further down in this cell.
!pip install xgboost
!pip install lightgbm
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sdv.datasets.local import load_csvs
from sklearn.model_selection import train_test_split, GridSearchCV,  cross_val_score
from sklearn import preprocessing, linear_model
from sklearn.preprocessing import  LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score
from sklearn.linear_model import Ridge, Lasso, LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn import neighbors
from sklearn.svm import SVR
from sklearn import tree
import xgboost as xgb
from xgboost import plot_importance
from lightgbm import LGBMRegressor



import warnings

# Silence every Python warning raised from here on, including deprecation
# notices emitted by the libraries imported above.
warnings.filterwarnings('ignore')

# Never truncate columns when a DataFrame is displayed. Uses the documented
# top-level pd.set_option instead of reaching through the undocumented
# pd.pandas self-reference.
pd.set_option('display.max_columns', None)
%matplotlib inline



### *1.1 Data Loading*

We can now load the downloaded file into a pandas DataFrame to view and analyze the data.

In [4]:
# Options passed to the CSV reader for every file found in the folder.
csv_reader_options = {
    'skipinitialspace': True,
    'encoding': 'unicode_escape',
}

# Point folder_name at the directory that holds the dataset.
datasets = load_csvs(
    folder_name='/kaggle/input/seoulbikedata/',
    read_csv_parameters=csv_reader_options,
)

# The table of interest is stored under the 'SeoulBikeData' key.
data = datasets['SeoulBikeData']

### *1.2 Data Diagnosis*

In [5]:
# Summary statistics, transposed so that each described column becomes a row.
data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Rented Bike Count,8760.0,704.602055,644.997468,0.0,191.0,504.5,1065.25,3556.0
Hour,8760.0,11.5,6.922582,0.0,5.75,11.5,17.25,23.0
Temperature(°C),8760.0,12.882922,11.944825,-17.8,3.5,13.7,22.5,39.4
Humidity(%),8760.0,58.226256,20.362413,0.0,42.0,57.0,74.0,98.0
Wind speed (m/s),8760.0,1.724909,1.0363,0.0,0.9,1.5,2.3,7.4
Visibility (10m),8760.0,1436.825799,608.298712,27.0,940.0,1698.0,2000.0,2000.0
Dew point temperature(°C),8760.0,4.073813,13.060369,-30.6,-4.7,5.1,14.8,27.2
Solar Radiation (MJ/m2),8760.0,0.569111,0.868746,0.0,0.0,0.01,0.93,3.52
Rainfall(mm),8760.0,0.148687,1.128193,0.0,0.0,0.0,0.0,35.0
Snowfall (cm),8760.0,0.075068,0.436746,0.0,0.0,0.0,0.0,8.8


### *1.3 Data Duplicate Check*

In [6]:
# Report how many rows repeat an earlier row, then drop those repeats.
duplicate_mask = data.duplicated()
print("Duplicate entry in data:", len(data[duplicate_mask]))
data = data.drop_duplicates()

Duplicate entry in data: 0
