# Lab 2 Data Prep
1. Read in dataset
2. Clean dataset
3. Visualize dataset

## 1. Read in dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# from pydotplus import graph_from_dot_data
import graphviz
from IPython.display import Image

# Apply matplotlib's "fivethirtyeight" style sheet to the plots drawn below.
plt.style.use("fivethirtyeight")
# Register pandas' formatters and converters with matplotlib.
pd.plotting.register_matplotlib_converters()

import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

%store -r STUDENT

In [None]:
def ProjectRepo(path):
    """Return the absolute location of ``path`` inside the project repo.

    The project repo is a file system mount available to all app containers.

    Args:
        path: Location relative to the repo root, e.g. ``"data/hour.csv"``.
            Leading slashes are ignored, so ``"/data/hour.csv"`` resolves to
            the same location.

    Returns:
        The repo mount point and ``path`` joined by exactly one ``/``.
    """
    repo_root = "/bd-fs-mnt/project_repo"
    # Strip leading slashes so a caller passing "/data/..." does not produce
    # a doubled separator ("project_repo//data/...").
    return repo_root + "/" + path.lstrip("/")

In [None]:
# Load the input data into a pandas DataFrame.
# The path is written relative to the repo root (no leading slash) so the
# joined path has a single separator, the same way the cleaned files are
# addressed when they are saved at the end of this notebook.
bike_sharing = pd.read_csv(ProjectRepo("data/" + STUDENT + "_BikeShare/hour.csv"))
bike_sharing

## Take a quick look at the dataset. These are the columns (features) that we will be working with; our target variable is the "cnt" column.
## For example, the first row records Jan 1st, 2011: alongside its weather columns, it shows a bike rental count of 16.

## 2. Clean dataset

In [None]:
# Columns that are cast to float64 after renaming (the target,
# "rented_bikes", is included).
float64_columns = [
    "season",
    "year",
    "month",
    "hour_of_day",
    "is_holiday",
    "weekday",
    "is_workingday",
    "weather_situation",
    "rented_bikes",
]

# Clean the raw frame in one pass: drop the unused columns, switch to more
# descriptive column names, then apply the float64 cast in a single call.
bike_sharing = (
    bike_sharing
    .drop(columns=["instant", "dteday", "registered", "casual"])
    .rename(
        columns={
            "yr": "year",
            "mnth": "month",
            "hr": "hour_of_day",
            "holiday": "is_holiday",
            "workingday": "is_workingday",
            "weathersit": "weather_situation",
            "temp": "temperature",
            "atemp": "feels_like_temperature",
            "hum": "humidity",
            "cnt": "rented_bikes",
        }
    )
    .astype({column: "float64" for column in float64_columns})
)

## 3. Visualize dataset
- We sum all the rented bikes grouped by the hour of day. You will see that the two peaks occur around 8:00 and 17:00.
- This makes sense: people tend to rent bikes at the beginning of the day and again at the end of the day, most likely for trips to and from work.

In [None]:
# Total number of rented bikes for each hour of the day.
hour_of_day_agg = (
    bike_sharing["rented_bikes"]
    .groupby(bike_sharing["hour_of_day"])
    .sum()
)

# Line chart with one x tick per hour present in the aggregate.
plot_options = dict(
    kind="line",
    title="Total rented bikes by hour of day",
    xticks=hour_of_day_agg.index,
    figsize=(15, 10),
)
hour_of_day_agg.plot(**plot_options)

In [None]:
# Split the cleaned frame into the target (y) and the remaining columns (X),
# then write each one to its own CSV file in the project repo.
y = bike_sharing["rented_bikes"]
X = bike_sharing.drop(columns=["rented_bikes"])

features_path = ProjectRepo("data/" + STUDENT + "_BikeShare/X_clean.csv")
target_path = ProjectRepo("data/" + STUDENT + "_BikeShare/y_clean.csv")

X.to_csv(features_path)
y.to_csv(target_path)

# Now that we've imported our dataset and cleaned it up, we can start modeling! Continue to Lab 3