Predicting the price of Pumpking

In [6]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib

In [7]:
#Loading my data
df = pd.read_csv(r'C:\Users\hp\Desktop\Locked\02. Data Science\02. Machine Learning\6. Regression\Data\US-pumpkins.csv')
print(df.head())

   City Name Type       Package      Variety Sub Variety  Grade     Date  \
0  BALTIMORE  NaN  24 inch bins          NaN         NaN    NaN  4/29/17   
1  BALTIMORE  NaN  24 inch bins          NaN         NaN    NaN   5/6/17   
2  BALTIMORE  NaN  24 inch bins  HOWDEN TYPE         NaN    NaN  9/24/16   
3  BALTIMORE  NaN  24 inch bins  HOWDEN TYPE         NaN    NaN  9/24/16   
4  BALTIMORE  NaN  24 inch bins  HOWDEN TYPE         NaN    NaN  11/5/16   

   Low Price  High Price  Mostly Low  ...  Unit of Sale Quality Condition  \
0      270.0       280.0       270.0  ...           NaN     NaN       NaN   
1      270.0       280.0       270.0  ...           NaN     NaN       NaN   
2      160.0       160.0       160.0  ...           NaN     NaN       NaN   
3      160.0       160.0       160.0  ...           NaN     NaN       NaN   
4       90.0       100.0        90.0  ...           NaN     NaN       NaN   

  Appearance Storage  Crop Repack  Trans Mode  Unnamed: 24  Unnamed: 25  
0     

In [8]:
df = df.drop(columns=['Type', 'City Name', 'Sub Variety', 'Grade', 'Date', 'Condition', 'Origin', 'Origin District', 'Mostly Low', 'Mostly High', 'Environment', 'Quality', 'Unit of Sale', 'Unnamed: 24', 'Unnamed: 25', 'Trans Mode', 'Repack', 'Crop', 'Storage', 'Appearance', ])
df.head()

Unnamed: 0,Package,Variety,Low Price,High Price,Item Size,Color
0,24 inch bins,,270.0,280.0,lge,
1,24 inch bins,,270.0,280.0,lge,
2,24 inch bins,HOWDEN TYPE,160.0,160.0,med,ORANGE
3,24 inch bins,HOWDEN TYPE,160.0,160.0,med,ORANGE
4,24 inch bins,HOWDEN TYPE,90.0,100.0,lge,ORANGE


In [9]:
df.tail()

Unnamed: 0,Package,Variety,Low Price,High Price,Item Size,Color
1752,22 lb cartons,MINIATURE,14.75,14.75,,WHITE
1753,36 inch bins,MINIATURE,275.0,275.0,sml,
1754,36 inch bins,MINIATURE,275.0,275.0,sml,
1755,36 inch bins,MINIATURE,275.0,275.0,sml,
1756,36 inch bins,MINIATURE,275.0,275.0,sml,


In [17]:
#Now i will use the low price and high price to get the price column
df['Price'] = (df['Low Price']) + (df['High Price']) / 2
df['Price'] = (df['Price']) / 2
print(df)

            Package      Variety  Low Price  High Price Item Size   Color  \
0      24 inch bins          NaN     270.00      280.00       lge     NaN   
1      24 inch bins          NaN     270.00      280.00       lge     NaN   
2      24 inch bins  HOWDEN TYPE     160.00      160.00       med  ORANGE   
3      24 inch bins  HOWDEN TYPE     160.00      160.00       med  ORANGE   
4      24 inch bins  HOWDEN TYPE      90.00      100.00       lge  ORANGE   
...             ...          ...        ...         ...       ...     ...   
1752  22 lb cartons    MINIATURE      14.75       14.75       NaN   WHITE   
1753   36 inch bins    MINIATURE     275.00      275.00       sml     NaN   
1754   36 inch bins    MINIATURE     275.00      275.00       sml     NaN   
1755   36 inch bins    MINIATURE     275.00      275.00       sml     NaN   
1756   36 inch bins    MINIATURE     275.00      275.00       sml     NaN   

         Price  
0     205.0000  
1     205.0000  
2     120.0000  
3     1

In [39]:
# Separate features and target
X = df.drop('Price', axis=1)
y = df['Price'] 

In [40]:
# We can Define categorical and numerical features
categorical_features = ['Variety', 'Color', 'Package', 'Item Size']
numerical_features = ['Low Price', 'High Price']

# Lets Create preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Now i split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [41]:
# We now create pipeline with preprocessing and model
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])
#Let Train the model
model.fit(X_train, y_train)
#And Evaluate
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(f"Training R²: {train_score:.3f}")
print(f"Testing R²: {test_score:.3f}")

Training R²: 1.000
Testing R²: 1.000


In [42]:
# Now lets Save the entire pipeline (including preprocessing)
joblib.dump(model, 'pumpkins_price_predictor.pkl')

#i start with saving just the model coefficients for inspection:
coefficients = pd.DataFrame({
    'feature': numerical_features + 
               list(model.named_steps['preprocessor']
                   .named_transformers_['cat']
                   .get_feature_names_out(categorical_features)),
    'coefficient': model.named_steps['regressor'].coef_
})
coefficients.to_csv('model_coefficients.csv', index=False)

In [43]:
#Now if i am to use the model later
import joblib
import pandas as pd

In [44]:
# we Load the saved model First
model = joblib.load('pumpkins_price_predictor.pkl')

In [45]:
def predict_price(input_data):
    """Predict Pumpkins price from input dictionary"""
    # Convert to DataFrame
    input_df = pd.DataFrame([input_data])
    
    # Make prediction
    price = model.predict(input_df)[0]
    return price

In [46]:
# Lets try
if __name__ == "__main__":
    # lets try some inputs
    New_Pumpkin = {
        'Package': '24 inch bins',
        'Low Price': 260,
        'High Price': 287,
        'Variety': 'MINIATURE',
        'Item Size': 'sml', 
        'Color': 'White'
    }
    
    predicted_price = predict_price(New_Pumpkin)
    print(f"Predicted Price: ${predicted_price:,.2f}")

Predicted Price: $201.75
