## 0. IMPORTING

In [1]:
from fastparquet import ParquetFile


from sklearn.metrics             import mean_squared_error

from sklearn.linear_model       import LinearRegression
from sklearn.linear_model       import Lasso
from sklearn.linear_model       import Ridge

from sklearn.feature_extraction import DictVectorizer

### 0.2. Constant Variables

In [2]:
# Directory containing the FHV trip parquet files. File names are appended with plain
# string concatenation, so the trailing slash is required.
BASE_PATH = '../data/'

## 1. DATA LOADING AND DESCRIPTION

In [3]:
# Read the January and February 2021 FHV trip files into pandas DataFrames.
jan_file = BASE_PATH + 'fhv_tripdata_2021-01.parquet'
feb_file = BASE_PATH + 'fhv_tripdata_2021-02.parquet'

fhv_data_jan = ParquetFile(jan_file).to_pandas()
fhv_data_feb = ParquetFile(feb_file).to_pandas()

### 1.1. Data Dimensions

In [4]:
# Row and column counts for the January frame.
jan_rows, jan_cols = fhv_data_jan.shape
print('FHV Number of Rows (Jan): ', jan_rows)
print('FHV Number of Columns (Jan): ', jan_cols)

FHV Number of Rows (Jan):  1154112
FHV Number of Columns (Jan):  7


In [5]:
# Row and column counts for the February frame.
print('FHV Number of Rows (Feb): ', fhv_data_feb.shape[0])
print('FHV Number of Columns (Feb): ', fhv_data_feb.shape[1])

FHV Number of Rows (Feb):  1037692
FHV Number of Columns (JanuFebary):  7


### 1.2. Missing Values

Let's check whether there are missing values in the dataset.

In [6]:
# Per-column count of missing entries in the January frame.
fhv_data_jan.isnull().sum()

dispatching_base_num            0
pickup_datetime                 0
dropOff_datetime                0
PUlocationID               958267
DOlocationID               162220
SR_Flag                   1154112
Affiliated_base_number        885
dtype: int64

In [7]:
# Per-column count of missing entries in the February frame.
fhv_data_feb.isnull().sum()

dispatching_base_num            0
pickup_datetime                 0
dropOff_datetime                0
PUlocationID               884691
DOlocationID               152352
SR_Flag                   1037692
Affiliated_base_number          0
dtype: int64

## ANSWERING HOMEWORK QUESTIONS

### Q1: How many records are there for Jan?

In [8]:
# Q1: number of records in the January frame.
print('FHV Number of Rows (Jan): ', len(fhv_data_jan))

FHV Number of Rows (Jan):  1154112


### Q2: What's the average trip duration in January?

In [9]:
# Raw trip length as a timedelta: drop-off time minus pickup time.
pickup_time = fhv_data_jan['pickup_datetime']
dropoff_time = fhv_data_jan['dropOff_datetime']
fhv_data_jan['duration'] = dropoff_time - pickup_time

In [10]:
# Convert the timedelta column to minutes with the vectorised .dt accessor
# rather than calling a Python lambda once per row.
fhv_data_jan['duration'] = fhv_data_jan['duration'].dt.total_seconds() / 60

In [11]:
# Q2: mean trip duration in minutes, rounded to three decimals.
avg_duration = round(fhv_data_jan['duration'].mean(), 3)
print(f"The average trip duration in January is {avg_duration} minutes")

The average trip duration in January is 19.167 minutes


### Removing Outliers

In [12]:
# Keep trips lasting between 1 and 60 minutes (both bounds inclusive).
# .copy() yields an independent frame, so the column assignments in later cells
# do not write into a slice of the unfiltered data (SettingWithCopyWarning).
fhv_data_jan = fhv_data_jan[(fhv_data_jan.duration >= 1) & (fhv_data_jan.duration <= 60)].copy()

In [13]:
# Shape of the January frame after the duration filter.
fhv_data_jan.shape

(1109826, 8)

### Q3: Missing values

In [14]:
# Replace missing pickup / drop-off location IDs with the placeholder -1.
for location_col in ('PUlocationID', 'DOlocationID'):
    fhv_data_jan[location_col] = fhv_data_jan[location_col].fillna(-1)

In [15]:
# Share of rows whose pickup ID is the -1 placeholder. The mean of a boolean mask is
# exactly that fraction, with no need to materialise a filtered copy of the frame.
# Despite its name, `na_count` holds a fraction in [0, 1], not a count.
na_count = (fhv_data_jan['PUlocationID'] == -1).mean()

In [16]:
# Q3: report the missing-pickup share as a percentage with one decimal.
na_percent = na_count * 100
print(f'The fraction of missing values for pickup ID is {na_percent:.1f}%')

The fraction of missing values for pickup ID is 83.5%


### Q4: One-hot encoding

In [17]:
# Location IDs are treated as categories: cast them to strings so that
# DictVectorizer one-hot encodes them instead of passing them through as numbers.
categorical = ['PUlocationID', 'DOlocationID']
for column in categorical:
    fhv_data_jan[column] = fhv_data_jan[column].astype(str)

In [18]:
# Vectorizer that will turn the per-row feature dicts into a matrix.
dv = DictVectorizer()

# One {column: value} dict per January trip, restricted to the categorical features.
train_dicts = fhv_data_jan.loc[:, categorical].to_dict(orient='records')

In [19]:
# Learn the feature vocabulary from the January records and encode them.
X_train = dv.fit_transform(train_dicts)

# (number of trips, number of encoded feature columns)
X_train.shape

(1109826, 525)

In [20]:
print(f"The number of columns of the matrix is {X_train.shape[1]}.")

The number of columns of the matrix is 525.


### Q5: What's the RMSE on train?

In [21]:
# Regression target: trip duration in minutes, as a NumPy array.
target = 'duration'
y_train = fhv_data_jan[target].to_numpy()

#### Fitting the model

In [22]:
# Fit an ordinary least-squares model on the January features (fit returns the estimator).
lr = LinearRegression().fit(X_train, y_train)

# In-sample predictions, used for the training RMSE.
y_pred = lr.predict(X_train)

In [23]:
# RMSE is the square root of the MSE. The `squared=False` shortcut was deprecated in
# scikit-learn 1.4 and removed in 1.6, so take the root explicitly.
rmse_train = mean_squared_error(y_train, y_pred) ** 0.5
print(f"The RMSE on train id {rmse_train}")

The RMSE on train id 10.528519388409808


### Q6: What's the RMSE on validation?

### Preparing the February data

In [25]:
#duration
# duration in minutes (vectorised .dt accessor instead of a per-row Python lambda)
fhv_data_feb['duration'] = (
    fhv_data_feb['dropOff_datetime'] - fhv_data_feb['pickup_datetime']
).dt.total_seconds() / 60

# removing outliers: keep trips of 1 to 60 minutes, bounds inclusive.
# .copy() makes the filtered frame independent, so the column assignments below
# do not write into a slice of the unfiltered data (SettingWithCopyWarning).
fhv_data_feb = fhv_data_feb[(fhv_data_feb.duration >= 1) & (fhv_data_feb.duration <= 60)].copy()

# filling missing values with the same -1 placeholder used for January
fhv_data_feb['PUlocationID'] = fhv_data_feb['PUlocationID'].fillna(-1)
fhv_data_feb['DOlocationID'] = fhv_data_feb['DOlocationID'].fillna(-1)

# one-hot encoding: cast IDs to strings so they are treated as categories
categorical = ['PUlocationID', 'DOlocationID']
fhv_data_feb[categorical] = fhv_data_feb[categorical].astype(str)

test_dicts = fhv_data_feb[categorical].to_dict(orient='records')

# transform only (no fit): reuse the vocabulary learned from the January data
X_val = dv.transform(test_dicts)

# target variable
y_val = fhv_data_feb[target].values

In [26]:
# Predict February durations with the model fitted on January.
# Note: this rebinds `y_pred`, replacing the training-set predictions.
y_pred = lr.predict(X_val)

In [27]:
# RMSE is the square root of the MSE. The `squared=False` shortcut was deprecated in
# scikit-learn 1.4 and removed in 1.6, so take the root explicitly.
rmse_val = mean_squared_error(y_val, y_pred) ** 0.5
print(f"The RMSE on validation id {rmse_val}")

The RMSE on validation id 11.014287519486222
