In [6]:
# Show which scikit-learn version is installed in this environment.
!pip freeze | grep scikit-learn

scikit-learn==1.5.1


In [7]:
# Show the version of the Python interpreter found on PATH.
!python -V

Python 3.9.19


In [37]:
import pickle
import pandas as pd
import os
import warnings
# NOTE(review): with no category given, this filter silences every warning for
# the rest of the session; consider narrowing it to the specific warning
# that motivated it.
warnings.filterwarnings('ignore')

In [9]:
# Load the pickled (dv, model) pair from model.bin; later cells call
# dv.transform(...) and model.predict(...) on these objects.
# NOTE(review): pickle.load can execute arbitrary code while deserializing,
# so only load model files that come from a trusted source.
with open('model.bin', 'rb') as f_in:
    dv, model = pickle.load(f_in)

In [10]:
categorical = ['PULocationID', 'DOLocationID']

def read_data(filename):
    """Load a trip parquet file and prepare it for prediction.

    Steps:
      1. Read the parquet file at ``filename`` (path or URL).
      2. Add a ``duration`` column: drop-off minus pick-up time, in minutes.
      3. Keep only rows whose duration lies in the closed range [1, 60].
      4. Convert the ``categorical`` columns to strings, mapping missing
         values to ``'-1'``.

    The original index labels are kept (the index is not reset), and the
    returned frame is a copy of the filtered rows.
    """
    trips = pd.read_parquet(filename)

    elapsed = trips['tpep_dropoff_datetime'] - trips['tpep_pickup_datetime']
    trips['duration'] = elapsed.dt.total_seconds() / 60

    minutes = trips['duration']
    within_bounds = (minutes >= 1) & (minutes <= 60)
    trips = trips[within_bounds].copy()

    # Go through int first so that float ids such as 1.0 become '1', not '1.0'.
    as_text = trips[categorical].fillna(-1).astype('int').astype('str')
    trips[categorical] = as_text

    return trips

In [12]:
# Source file for the trips to score; the file name indicates yellow-taxi data for 2023-03.
trip_data_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet'
df = read_data(trip_data_url)

In [13]:
# One dict per row, containing only the categorical columns.
dicts = df[categorical].to_dict(orient='records')
# Encode the dicts with dv (presumably a fitted DictVectorizer — confirm) and
# predict with the loaded model.
X_val = dv.transform(dicts)
y_pred = model.predict(X_val)

## What's the standard deviation of the predicted duration for this dataset?

In [36]:
# Standard deviation of the predictions (presumably in minutes, like `duration` — confirm).
predicted_std = y_pred.std()
print("The predicted standard deviation is : ", predicted_std)


SyntaxError: unexpected EOF while parsing (874362886.py, line 2)

## Q2. Preparing the output

In [16]:
# Year and month used to build the ride ids here and the output file name in a later cell.
year=2023
month=3
# ride_id has the form 'YYYY/MM_<index label>'; the labels are whatever index
# df carries at this point (read_data filters rows without resetting the index).
df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str')


In [21]:
# Collect one row per ride: its id and the model's predicted duration.
df_result = pd.DataFrame()
df_result['ride_id'] = df['ride_id']
df_result['predicted_duration'] = y_pred
output_file = f'output/yellow_tripdata_{year:04d}-{month:02d}.parquet'

# to_parquet does not create missing parent directories, so make sure the
# output folder exists; otherwise this cell fails on a fresh checkout.
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Written uncompressed and without the index column.
df_result.to_parquet(
    output_file,
    engine='pyarrow',
    compression=None,
    index=False
)

In [22]:
# Report the size of the written parquet file using the standard library.
import os

BYTES_PER_MEGABYTE = 1024 * 1024

file_stats = os.stat(output_file)
size_in_bytes = file_stats.st_size

print(file_stats)
print(f'File Size in Bytes is {size_in_bytes}')
print(f'File Size in MegaBytes is {size_in_bytes / BYTES_PER_MEGABYTE}')


os.stat_result(st_mode=33188, st_ino=79492024, st_dev=16777221, st_nlink=1, st_uid=502, st_gid=20, st_size=68641880, st_atime=1724072109, st_mtime=1724072108, st_ctime=1724072108)
File Size in Bytes is 68641880
File Size in MegaBytes is 65.46199798583984


In [24]:
# Alternatively, check the file size from the command line.
!ls -lh output


total 134072
-rw-r--r--@ 1 epam  staff    65M Aug 19 15:55 yellow_tripdata_2023-03.parquet


## Creating the scoring script

In [25]:
# Export the notebook starter.ipynb as a plain Python script.
!jupyter nbconvert --to script starter.ipynb


[NbConvertApp] Converting notebook starter.ipynb to script
[NbConvertApp] Writing 1896 bytes to starter.py


## Q4. Virtual environment

In [26]:
# Install pipenv, which the next cell uses to add dependencies to the Pipfile.
# NOTE(review): the pipenv version is not pinned here.
!pip install pipenv

Looking in indexes: https://pypi.org/simple, https://aws:****@camo-149671618541.d.codeartifact.us-east-1.amazonaws.com/pypi/camo-platform-sdk/simple/


In [31]:
 # Add the runtime dependencies to the Pipfile; only scikit-learn is pinned (1.5.1).
 !pipenv install scikit-learn==1.5.1 pandas pyarrow

Installing scikit-learn==1.5.1...
Resolving scikit-learn==1.5.1...
Added scikit-learn to Pipfile's [packages] ...
✔ Installation Succeeded
Installing pandas...
Resolving pandas...
Added pandas to Pipfile's [packages] ...
✔ Installation Succeeded
Installing pyarrow...
Resolving pyarrow...
Added pyarrow to Pipfile's [packages] ...
✔ Installation Succeeded
Pipfile.lock (dc00bf) out…

##### What's the first hash for the Scikit-Learn dependency?



`sha256:0828673c5b520e879f2af6a9e99eee0eefea69a2188be1ca68a6121b809055c1`


## Q5. Parametrize the script


In [40]:
# Run the parametrized script; the two arguments are presumably year and month — confirm against python_script.py.
!python python_script.py 2023 04

The predicted standard deviation is :  6.353996941249663
The predicted mean duration is:  14.292282936862449
File Size in MegaBytes is 63.14436435699463


Running the script in Docker with the arguments `2023 05`:

`docker run -it --rm -p 9696:9696 hw4-service:v3 2023 05`





The predicted mean duration is:  0.19174419265916945


In [None]:
import boto3

# Credentials file layout, one value per line:
#   line 1: AWS secret access key
#   line 2: AWS access key id
#   line 3: bucket name
# NOTE(review): prefer environment variables or boto3's default credential
# chain over a plain-text file, and make sure aws_cred.txt is never committed.
with open("aws_cred.txt", "r") as f:
    lines = f.readlines()
ACCESS_SECRET_KEY = lines[0].strip()
ACCESS_KEY_ID = lines[1].strip()
BUCKET_NAME = lines[2].strip()
# Only the bucket name is echoed: printing the keys would store them in the
# notebook's saved output.
print(BUCKET_NAME)

# S3 Connect
s3 = boto3.resource('s3',
                    aws_access_key_id=ACCESS_KEY_ID,
                    aws_secret_access_key=ACCESS_SECRET_KEY)

# Upload the actual model file; without Body, put_object creates an empty object.
# NOTE(review): ACL='public-read' makes the model world-readable — confirm this is intended.
with open('model.bin', 'rb') as f_model:
    s3.Bucket(BUCKET_NAME).put_object(Key="model.bin", Body=f_model, ACL='public-read')

