In [None]:
# Logistic Regression on US-pumpkins dataset
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Load your dataset
df = pd.read_csv("US-pumpkins.csv")
print(df.head(5))

# lets check for missing values
df.isnull().sum()

so its obvious from the above, we have only 8 columns with less than 5% missing-data; all the rest have a lot of missing values. so we can drop the rest using dropna()

In [None]:
# dropping unnecessary columns: we first create a list of useful columns; 'useful_cols'

useful_cols = ['City Name', 'Package', 'Date', 'Low Price', 'High Price', 'Repack']
for c in df.columns:
    if c not in useful_cols:
        df = df.drop(columns=c)
print(df)

df.isnull().sum()


So we have no missing data in our entire DataFrame. Now let's convert Date to datetime

In [None]:
# lets Convert Date to datetime and extract features
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day

df['Date'].head()


In [None]:
# Now lets encode our target variable. We first instantiate LabelEncoder() and then fit_transform() on our target column
label_enc = LabelEncoder()
df['Repack'] = label_enc.fit_transform(df['Repack'])

# One-hot encode categorical variables (excluding target)
categorical_cols = ['City Name', 'Package', 'Date', 'Low Price', 'High Price',]
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
print(df_encoded.head())


In [21]:
# Split features (X) and target (y)
X = df_encoded.drop(columns=['Repack'])
y = df_encoded['Repack']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

We have now succesfully trained our dataset. Lets now perform logistic regression and evaluate some metric scores

In [23]:
# Fit logistic regression model
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Evaluate
y_pred = log_reg.predict(X_test)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Confusion Matrix:
 [[  0   1]
 [  0 351]]

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       1.00      1.00      1.00       351

    accuracy                           1.00       352
   macro avg       0.50      0.50      0.50       352
weighted avg       0.99      1.00      1.00       352



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [24]:
df = pd.read_csv("cleaned_cuisines.csv")
df.shape

(3995, 382)