In [21]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression   # ✅ this line is needed


In [17]:
# Load data
df = pd.read_csv("C:\\Users\\93810\\Downloads\\regression test.csv")

# Define X (independent) and y (dependent)
X = df[["hatchery", "hatcher", "setter", "DriverName", "VehicleNumber", "source_of_eggs", "CustomerType"]]
y = df["FirstWeekMortality"]

# Preprocess categorical columns (OneHotEncoding)
categorical_cols = X.columns.tolist()
preprocessor = ColumnTransformer(
    transformers=[("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)]
)

# Regression model
model = Pipeline(steps=[("preprocessor", preprocessor),
                        ("regressor", LinearRegression())])

# Fit on entire dataset (8 weeks)
model.fit(X, y)

# Extract coefficients
encoded_features = model.named_steps["preprocessor"].named_transformers_["cat"].get_feature_names_out(categorical_cols)
coef_df = pd.DataFrame({"feature": encoded_features, "coefficient": model.named_steps["regressor"].coef_})

# Sort by impact
coef_df["abs_coef"] = coef_df["coefficient"].abs()
coef_df = coef_df.sort_values(by="abs_coef", ascending=False)
print(coef_df.head(10))  # top drivers

# For last week only
last_week = df[df["week"] == df["week"].max()]
X_last = last_week[categorical_cols]
y_last = last_week["first_week_mortality"]

model.fit(X_last, y_last)
last_coef = pd.DataFrame({"feature": encoded_features, "coefficient": model.named_steps["regressor"].coef_})
last_coef["abs_coef"] = last_coef["coefficient"].abs()
print(last_coef.sort_values(by="abs_coef", ascending=False).head(10))


KeyError: 'FirstWeekMortality'

In [28]:


# Load data
df = pd.read_csv("C:\\Users\\93810\\Downloads\\regression test.csv")

# Clean dependent variable: remove % and convert to float
df["first_week_mortality"] = df["first_week_mortality"].str.replace("%", "").astype(float)
# Drop rows where either X or y is missing
df = df.dropna(subset=["hatcher", "first_week_mortality"])

print(df["hatcher"].unique())
print(df["hatcher"].nunique())

# Select X and y
X = df[["hatcher"]]   # independent
y = df["first_week_mortality"]  # dependent
print(X.head())          # original hatchery column
print(X_encoded.head())  # after dummy encoding
print(y.head())          # dependent variable
print(X_encoded.shape, y.shape)


# Convert categorical (hatchery) into dummy variables
X_encoded = pd.get_dummies(X, drop_first=True)

model = LinearRegression()
model.fit(X_encoded, y)

coefficients = pd.DataFrame({
    "Feature": X_encoded.columns,
    "Coefficient": model.coef_
})
print("Intercept:", model.intercept_)
print(coefficients)


['H9' 'H5' 'H7' 'H6' 'H4' 'H2' 'H8']
7
  hatcher
0      H9
1      H5
2      H7
3      H9
4      H5
Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4]
0    34.19
1    15.69
2    11.03
3     9.80
4     9.80
Name: first_week_mortality, dtype: float64
(492, 0) (492,)
Intercept: 0.9779687500000003
      Feature  Coefficient
0  hatcher_H4    -0.058814
1  hatcher_H5     0.986342
2  hatcher_H6     0.533936
3  hatcher_H7     0.268698
4  hatcher_H8     0.697746
5  hatcher_H9     0.407856
