In [1]:
import requests

# Column-oriented example payload: each key holds a list with one entry per
# vehicle (three entries here), sent as the JSON body of the request.
vehicle_data = {
    'Cylinders': [4, 6, 8],
    'Displacement': [155.0, 160.0, 165.5],
    'Horsepower': [93.0, 130.0, 98.0],
    'Weight': [2500.0, 3150.0, 2600.0],
    'Acceleration': [15.0, 14.0, 16.0],
    'Model Year': [81, 80, 78],
    'Origin': [3, 2, 1]
}

url = 'https://fuel-usage-prediction.herokuapp.com/predict'
# requests has no default timeout, so without one this cell would block
# forever if the remote app never answers. 30 s is a generous upper bound.
r = requests.post(url, json = vehicle_data, timeout=30)

# Raw response body (whitespace-trimmed) as the cell's displayed value.
r.text.strip()

'{"y_pred":[33.54333333333334,17.64333333333333,21.293333333333333]}'

In [11]:
import pandas as pd

from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# Transforms the numerical values in the Origin column to strings
def map_origin_col(input_df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``input_df`` with the ``Origin`` codes replaced by names.

    Args:
        input_df (pd.DataFrame): Frame with an ``Origin`` column holding the
            codes 1, 2 and 3. The frame itself is not modified.

    Returns:
        pd.DataFrame: A new frame in which ``Origin`` is mapped as
            1 -> "India", 2 -> "USA", 3 -> "Germany". Values outside this
            mapping become NaN (``Series.map`` semantics).
    """
    origin_names = {1: "India", 2: "USA", 3: "Germany"}
    # assign() works on a copy, so the caller's frame keeps its numeric codes.
    return input_df.assign(Origin=input_df["Origin"].map(origin_names))


def num_preproc_pipeline() -> Pipeline:
    """Build the (unfitted) preprocessing pipeline for numerical columns.

    Returns:
        Pipeline: Two steps run in order: "imputer" (``SimpleImputer`` using
            the median strategy) followed by "scaler" (``StandardScaler``).
            Created with ``verbose=True``, so each step logs when it runs.
    """
    steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
    return Pipeline(steps, verbose=True)

def cat_preproc_pipeline() -> Pipeline:
    """Build the (unfitted) preprocessing pipeline for categorical columns.

    Returns:
        Pipeline: A single "one_hot_encoder" step holding a ``OneHotEncoder``
            with its default settings. Created with ``verbose=True``, so the
            step logs when it runs.
    """
    return Pipeline([("one_hot_encoder", OneHotEncoder())], verbose=True)

def full_preproc_ct(X_input: pd.DataFrame) -> tuple[pd.DataFrame, ColumnTransformer]:
    """Fit the combined preprocessing transformer on ``X_input`` and apply it.

    Columns are routed purely by dtype: ``object`` columns go through the
    categorical pipeline, ``float`` / ``int64`` columns through the numerical
    one. The categorical transformer is listed first in the ColumnTransformer.

    Args:
        X_input (pd.DataFrame): Feature frame to fit on and transform.

    Returns:
        tuple[pd.DataFrame, ColumnTransformer]: The transformed data and the
            fitted transformer.
            NOTE(review): the notebook output shows the first element is a
            NumPy array rather than a DataFrame; confirm and adjust the
            annotation if callers rely on it.
    """
    numeric_steps = num_preproc_pipeline()
    categorical_steps = cat_preproc_pipeline()

    # Column selection depends only on dtype, so integer-coded categories
    # (e.g. an unmapped Origin) are treated as numerical.
    numeric_cols = X_input.select_dtypes(include=["float", "int64"]).columns
    categorical_cols = X_input.select_dtypes(include=["object"]).columns
    print(categorical_cols)

    transformers = [
        ("cat", categorical_steps, categorical_cols),
        ("num", numeric_steps, numeric_cols),
    ]
    column_transformer = ColumnTransformer(transformers, verbose=True)

    return column_transformer.fit_transform(X_input), column_transformer


In [47]:
# Hand-written single-vehicle example used to exercise the preprocessing code.
# NOTE(review): these keys are lower-case / snake_case ("cylinders",
# "model_year"), while the dataset columns named in this notebook are
# title-case ("Cylinders", "Model Year") — confirm which naming a fitted
# transformer is expected to receive.
input_data = {
  "cylinders": 4,
  "displacement": 155.0,
  "horsepower": 93.0,
  "weight": 2500.0,
  "acceleration": 15.0,
  "model_year": 81,
  "Origin": "India"
}

# Wrapping the dict in a list yields a one-row frame with one column per key.
df = pd.DataFrame([input_data])
df.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,Origin
0,4,155.0,93.0,2500.0,15.0,81,India


In [15]:
# Fit the preprocessing transformer on the example frame `df` and keep both
# the transformed values and the fitted transformer.
test, pipeline = full_preproc_ct(df)

Index(['Origin'], dtype='object')
[Pipeline] ... (step 1 of 1) Processing one_hot_encoder, total=   0.0s
[ColumnTransformer] ........... (1 of 2) Processing cat, total=   0.0s
[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing scaler, total=   0.0s
[ColumnTransformer] ........... (2 of 2) Processing num, total=   0.0s


In [38]:
# Column names assigned to the raw file via `names=` (the file is read as
# having no header row of its own).
cols = ['MPG','Cylinders','Displacement','Horsepower','Weight', 'Acceleration', 'Model Year', 'Origin']

# read_csv options:
#   na_values='?'          -> "?" entries are read as NaN
#   comment='\t'           -> everything from the first tab on a line is ignored
#                             (presumably drops a trailing tab-separated field —
#                             confirm against the data file)
#   sep=' ' + skipinitialspace=True -> space-delimited, spaces after a
#                             delimiter are skipped
df_real = pd.read_csv("../data/auto-mpg.data", na_values='?', names=cols, comment='\t', sep=' ', skipinitialspace=True)

In [48]:
# Show column dtypes and non-null counts of the example frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   cylinders     1 non-null      int64  
 1   displacement  1 non-null      float64
 2   horsepower    1 non-null      float64
 3   weight        1 non-null      float64
 4   acceleration  1 non-null      float64
 5   model_year    1 non-null      int64  
 6   Origin        1 non-null      object 
dtypes: float64(4), int64(2), object(1)
memory usage: 184.0+ bytes


In [50]:
# Display the example frame.
df

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,Origin
0,4,155.0,93.0,2500.0,15.0,81,India


In [52]:
# First dataset row with the "MPG" column removed, for comparison with the
# hand-written example frame.
df_real.iloc[0:1, :].drop("MPG", axis=1)

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
0,8,307.0,130.0,3504.0,12.0,70,1


In [55]:
# Fit the transformer on the first two dataset rows (without "MPG").
# NOTE(review): "Origin" is still an integer code here, so no column has
# object dtype and only the numerical pipeline runs (the printed Index is
# empty). If one-hot encoding of Origin is intended, the frame presumably
# needs to go through map_origin_col first — confirm.
sample_data, sample_pipeline = full_preproc_ct(df_real.iloc[0:2, :].drop("MPG", axis=1))

Index([], dtype='object')
[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing scaler, total=   0.0s
[ColumnTransformer] ........... (1 of 1) Processing num, total=   0.0s


In [56]:
# Inspect the transformed two-row sample.
sample_data

array([[ 0., -1., -1., -1.,  1.,  0.,  0.],
       [ 0.,  1.,  1.,  1., -1.,  0.,  0.]])

In [42]:
# Fit the transformer on the full dataset (without "MPG").
# NOTE(review): "Origin" is still an integer code here, so it is scaled as a
# numerical feature and the categorical pipeline is skipped (the printed
# Index is empty). Apply map_origin_col first if one-hot encoding is intended
# — confirm.
real_data, real_pipeline = full_preproc_ct(df_real.drop("MPG", axis=1))

Index([], dtype='object')
[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing scaler, total=   0.0s
[ColumnTransformer] ........... (1 of 1) Processing num, total=   0.0s


In [44]:
# Shape (rows, columns) of the transformed full dataset.
real_data.shape

(398, 7)

In [11]:
import io
import os

from minio import Minio

FILE_PATH = "data/auto-mpg.data"
MODEL_PATH = "artifacts/model.joblib"
MINIO_API_HOST = "127.0.0.1:9000"

# Credentials are read from the environment so they never appear in the
# notebook. os.environ is a mapping (indexed, not called); a missing variable
# raises KeyError here instead of failing later inside the client.
# NOTE(review): the variable names below mirror the constants' names — confirm
# they match how the local MinIO instance is configured.
MINIO_TEST_USER = os.environ["MINIO_TEST_USER"]
MINIO_TEST_PASSWORD = os.environ["MINIO_TEST_PASSWORD"]

# secure=False: connect without TLS, as in the original cell.
minio_client = Minio(
    endpoint=MINIO_API_HOST,
    access_key=MINIO_TEST_USER,
    secret_key=MINIO_TEST_PASSWORD,
    secure=False,
)


# Upload the local file "auto-mpg.names" to "test-bucket" under the given
# object name; the returned ObjectWriteResult is the cell's displayed value.
minio_client.fput_object("test-bucket", "blabba/sey/auto-mpg.names", "auto-mpg.names")

# Kept for reference: uploading an in-memory object instead of a file.
# `bytes_model_pipeline` is not defined in this cell.
# minio_client.put_object(
#         bucket_name="test-bucket",
#         object_name="model/test",
#         data=io.BytesIO(bytes_model_pipeline),
#         length=len(bytes_model_pipeline)
#     )

<minio.helpers.ObjectWriteResult at 0x223bfc219a0>