<a href="https://colab.research.google.com/github/Janani-360/OVFAcademy/blob/main/Train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Data Manipulation
# Boat price
#### Here is the script to get the data directly from kaggle through colab

!pip install -q kaggle
from google.colab import files
files.upload()
#create a kaggle folder
!mkdir ~/.kaggle

# Go on kaggle > Account > Create New API token
# Save the json file in your laptop in a dedicated folder

# copy the kaggle.json to folder created
!cp kaggle.json ~/.kaggle
#permission for the json to act
!chmod 600 ~/.kaggle/kaggle.json

# Datasets available here: 
# https://www.kaggle.com/datasets/artemkorottchenko/large-boatyacht-pricing-dataset
!kaggle datasets download -d artemkorottchenko/large-boatyacht-pricing-dataset
!unzip large-boatyacht-pricing-dataset.zip

#### End of the script to get the data directly from kaggle through colab
# Remember that when you run it, you have to click on the button Choose Files and 
# then select the kaggle.json file from your computer

Saving kaggle.json to kaggle.json
Downloading large-boatyacht-pricing-dataset.zip to /content
  0% 0.00/6.22M [00:00<?, ?B/s]
100% 6.22M/6.22M [00:00<00:00, 71.2MB/s]
Archive:  large-boatyacht-pricing-dataset.zip
  inflating: boat_dataset.csv        


In [6]:
import pandas as pd
import io
import numpy as np
# Display settings: render up to 100 columns and 50 rows of a DataFrame
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 50)
# The dataset itself is loaded in the next cell.

In [7]:
# Load the boat listings. The file is decoded as UTF-8 and errors='ignore'
# silently drops any bytes that are not valid UTF-8 instead of raising.
# newline='' leaves line endings untouched so that pandas' parser handles them,
# and the `with` block guarantees the file handle is closed after parsing.
with open('/content/boat_dataset.csv', encoding='utf-8', errors='ignore', newline='') as csv_file:
    df = pd.read_csv(csv_file, sep=",")

In [8]:
## Synthetic additions to the dataset, for EDUCATIONAL PURPOSE ONLY
# Random advertisement dates are added so that a proper temporal split is possible.
# Start by parsing the original day.month.year strings into datetimes.
df['Advertisement Date'] = pd.to_datetime(
    df['Advertisement Date'],
    format='%d.%m.%Y',
)
#Here is a function that will automatically create a feature of random dates between a defined interval
def random_datetimes_or_dates(start, end, out_format='datetime', n=10):
    """Draw ``n`` random timestamps between ``start`` and ``end``.

    ``Timestamp.value`` is the Unix time in nanoseconds. Both bounds are
    floor-divided into whole seconds (``out_format='datetime'``) or whole days
    (any other ``out_format``), integers are drawn with ``np.random.randint``
    (lower bound included, upper bound excluded), and the draws are converted
    back with ``pd.to_datetime`` using the matching unit.

    Parameters
    ----------
    start, end : pd.Timestamp
        Bounds of the interval; any object exposing a nanosecond ``.value``
        attribute works.
    out_format : str, default 'datetime'
        'datetime' for second resolution, any other value for day resolution.
    n : int, default 10
        Number of values to draw.

    Returns
    -------
    pd.DatetimeIndex
        The random values. NumPy's global random state is used, so call
        ``np.random.seed`` beforehand for reproducible output.
    """
    if out_format == 'datetime':
        ns_per_unit, unit = 10**9, 's'
    else:
        ns_per_unit, unit = 24*60*60*10**9, 'D'

    # Express both bounds as integer counts of the chosen unit
    low = start.value // ns_per_unit
    high = end.value // ns_per_unit

    draws = np.random.randint(low, high, n)
    return pd.to_datetime(draws, unit=unit)


# Seed NumPy's global generator so the random dates are reproducible
np.random.seed(893717398)
# Interval the random advertisement dates are drawn from
d_start = pd.to_datetime('2020-01-01')
d_end = pd.to_datetime('2021-12-31')

# Delete the initial Advertisement Date -> if you want to keep the initial dates, don't run it
df = df.drop(columns=['Advertisement Date'])

# Draw one random timestamp per row; re-creating the column places it last
df['Advertisement Date'] = random_datetimes_or_dates(d_start, d_end, out_format='datetime', n=len(df))

# Keep the date and remove the time (truncate to midnight, dtype stays datetime64)
df['Advertisement Date'] = df['Advertisement Date'].dt.normalize()

# Disabled alternative: impute Advertisement Date with the created one only where
# it is missing. It requires the random dates to be stored in a separate 'date2' column.
#df['Advertisement Date'] = np.where(df['Advertisement Date'].isna(), df['date2'], df['Advertisement Date'])

# Delete the random date created
#df.drop(columns=['date2'], inplace=True)

# Create duplicates: re-append 500 randomly sampled rows.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead;
# ignore_index=True renumbers the rows 0..n-1, as append + reset_index(drop=True) did.
duplicated_rows = df.sample(n=500, random_state=893717398)
df = pd.concat([df, duplicated_rows], ignore_index=True)
del duplicated_rows

# Store the dates back as plain strings
df['Advertisement Date'] = df['Advertisement Date'].astype(str)

In [9]:
# Dimensions as (rows, columns) after the duplicated rows were appended
df.shape

(10844, 38)

In [11]:
# Summary statistics of the numeric columns. In the saved output, note the
# negative minimums for 'Number of Cabins' (-1) and 'Number of beds' (-23) and
# the very large maximums (e.g. 1200 for 'Cert Number of People'): values
# worth checking during cleaning.
df.describe()

Unnamed: 0,Year Built,Cert Number of People,Number of Cabins,Number of beds,Number of Toilets,Number of Bathrooms,Number of Showers
count,10252.0,3773.0,6807.0,6726.0,2257.0,484.0,2040.0
mean,2004.831155,9.338723,2.145585,4.339578,1.756314,1.295455,1.55049
std,16.33031,23.478952,2.102946,3.902195,2.038631,0.654258,0.925578
min,1885.0,1.0,-1.0,-23.0,1.0,1.0,1.0
25%,1999.0,7.0,1.0,2.0,1.0,1.0,1.0
50%,2008.0,8.0,2.0,4.0,1.0,1.0,1.0
75%,2017.0,10.0,3.0,6.0,2.0,1.0,2.0
max,2021.0,1200.0,96.0,266.0,84.0,6.0,10.0


In [12]:
# Distinct values of 'Fuel Type'; missing entries show up as nan in the result
df['Fuel Type'].unique()

array([nan, 'Unleaded', 'Electric', 'Diesel', 'Gas', 'Hybrid', 'Propane'],
      dtype=object)

In [24]:
# Boats built in 2001 or earlier, or in 2011 or later, that have at most 2 cabins.
# Rows where either column is missing compare as False and are therefore excluded.
mask3 = (
    (df['Year Built'].le(2001) | df['Year Built'].ge(2011))
    & df['Number of Cabins'].le(2)
)
df.loc[mask3].describe()

Unnamed: 0,Year Built,Cert Number of People,Number of Cabins,Number of beds,Number of Toilets,Number of Bathrooms,Number of Showers
count,3118.0,1426.0,3118.0,2548.0,953.0,275.0,856.0
mean,2003.623477,8.425666,1.499679,3.482339,1.164743,1.134545,1.193925
std,17.807547,2.332474,0.502001,1.54226,0.371143,0.362583,0.410118
min,1889.0,1.0,-1.0,-23.0,1.0,1.0,1.0
25%,1993.0,7.0,1.0,2.0,1.0,1.0,1.0
50%,2011.0,8.0,2.0,4.0,1.0,1.0,1.0
75%,2018.0,10.0,2.0,4.0,1.0,1.0,1.0
max,2021.0,30.0,2.0,9.0,2.0,3.0,4.0
