In [3]:
# Record the Python version used for this run so results can be reproduced.
# NOTE(review): `!python` is resolved through the shell PATH, which is
# presumably (but not necessarily) the same interpreter as the kernel — confirm.
!python -V

Python 3.9.10


In [14]:
import pickle
import zipfile
import requests
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [5]:

def read_data(url, timeout=60):
    """
    Download a zipped Capital Bikeshare trip file and load it as a DataFrame.

    The archive is saved in the current working directory under the last path
    component of ``url``. The CSV inside it is expected to share the archive's
    base name (``foo.zip`` -> ``foo.csv``).

    Parameters
    ----------
    url : str
        Location of the ``.zip`` archive.
    timeout : float, optional
        Seconds to wait for the server before giving up; forwarded to
        ``requests.get``. Defaults to 60.

    Returns
    -------
    pandas.DataFrame
        The trips, with ``rideable_type`` / ``start_station_id`` /
        ``end_station_id`` cast to ``str``, ``started_at`` / ``ended_at``
        converted to datetimes, and two derived columns:

        * ``duration``  - trip length in minutes, rounded to a whole number
        * ``start_end`` - ``"<start_station_id>_<end_station_id>"``

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    """
    zip_path = url.split('/')[-1]
    file_name = zip_path.split('.')[0] + '.csv'

    req = requests.get(url, timeout=timeout)
    # Stop on an HTTP error instead of saving the error page to disk and
    # failing later with a confusing BadZipFile.
    req.raise_for_status()

    with open(zip_path, 'wb') as f_out:
        f_out.write(req.content)

    with zipfile.ZipFile(zip_path) as z:
        with z.open(file_name) as f:
            df = pd.read_csv(f, parse_dates=True)

    categorical_cols = ['rideable_type', 'start_station_id', 'end_station_id']
    date_cols = ['started_at', 'ended_at']

    # Station ids are cast to str so they can be concatenated below and are
    # treated as categories rather than numbers downstream.
    df[categorical_cols] = df[categorical_cols].astype(str)
    df[date_cols] = df[date_cols].apply(pd.to_datetime, format='%Y/%m/%d %H:%M:%S')

    # Vectorised timedelta -> minutes; replaces a per-row Python lambda and
    # rounds half-to-even exactly like the built-in round() did.
    elapsed = df['ended_at'] - df['started_at']
    df['duration'] = (elapsed.dt.total_seconds() / 60).round(0)
    df['start_end'] = df['start_station_id'] + '_' + df['end_station_id']

    return df

In [6]:
# Source archive for this experiment (the 202207 Capital Bikeshare trip file).
TRIP_DATA_URL = 'https://s3.amazonaws.com/capitalbikeshare-data/202207-capitalbikeshare-tripdata.zip'

# Download, unzip and pre-process the trips via read_data.
df = read_data(TRIP_DATA_URL)

In [7]:
# Column the model is trained to predict.
target = 'duration'
# Input columns handed to the vectorizer.
categorical = ['rideable_type', 'start_end']

# One {column: value} dict per trip, the input format DictVectorizer expects.
dicts = df[categorical].to_dict(orient='records')

# Fit the vectorizer on all trips and encode them as the feature matrix.
dv = DictVectorizer()
x = dv.fit_transform(dicts)

# Target values as a plain array, aligned row-for-row with x.
y = df[target].to_numpy()

In [8]:
# Hold out 20% of the trips for validation; the fixed seed keeps the split
# reproducible between runs.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

# Display the shape of every split as a quick sanity check.
tuple(part.shape for part in (x_train, x_val, y_train, y_val))

((314946, 59522), (78737, 59522), (314946,), (78737,))

In [19]:
# Fit a Ridge regression with default settings on the training split.
lr = Ridge()
lr.fit(x_train, y_train)

y_pred = lr.predict(x_val)

# Validation RMSE, in the same unit as the target.
# The `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6; taking the square root of the MSE gives
# the same value on every version. float() keeps the cell output a plain number.
float(np.sqrt(mean_squared_error(y_val, y_pred)))

193.70306787980553

In [18]:
# Persist the fitted vectorizer together with the model, as one (dv, lr) tuple,
# so that both can be restored from a single file.
with open('ridge.bin', 'wb') as model_file:
    pickle.dump((dv, lr), model_file)