In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("goat_dataset.csv")

## Exploratory Data Analysis

In [3]:
df.drop(["owner_name" , "adhaar_number" , "goat_id"] , axis = 1 , inplace = True)

In [4]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

In [5]:
numerical_columns = ["milk_in_litres" , "age" , "height" , "weight" , "hay_grass_intake"]
categorical_columns = ["pregnancy" , "behavior" , "gender"]

In [6]:
numerical_pipeline = Pipeline(
    steps = [
        ("imputer" , SimpleImputer(strategy = "median")),
        ("scaler" , StandardScaler())
    ]
)

categorical_pipeline = Pipeline(
    steps=[
        ("imputer" , SimpleImputer(strategy = "most_frequent")),
        ("one_hot_encoder" , OneHotEncoder()),
        ("scaler" , StandardScaler(with_mean = False))
    ]
)

preprocessor = ColumnTransformer(
                [
                    ("numerical_pipeline",numerical_pipeline,numerical_columns),
                    ("categorical_pipeline",categorical_pipeline,categorical_columns)
                ]
)

## meat_quality_of_the_milk

In [7]:
x = df.drop(["meat_quality_of_the_goat" , "milk_quality_of_the_goat"] , axis = 1 )
y = df[["meat_quality_of_the_goat"]]

In [8]:
from sklearn.model_selection import train_test_split

# Assuming X and y are your features and labels respectively
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

print(f"Training set size: {x_train.shape[0]}")
print(f"Test set size: {x_test.shape[0]}")


Training set size: 8
Test set size: 2


In [9]:
x_train = preprocessor.fit_transform(x_train)
x_test = preprocessor.transform(x_test)

## Model Building

### Giving every model a chance along with grid search cv implementation

In [10]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [11]:
X_train = x_train
X_test = x_test

In [12]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor

models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(), 
    "AdaBoost Regressor": AdaBoostRegressor()
}
model_list = []
r2_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    print(y_train_pred)
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    
    print('='*35)
    print('\n')

[[5.]
 [4.]
 [4.]
 [5.]
 [2.]
 [4.]
 [2.]
 [3.]]
Linear Regression
Model performance for Training set
- Root Mean Squared Error: 0.0000
- Mean Absolute Error: 0.0000
- R2 Score: 1.0000
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.5024
- Mean Absolute Error: 0.4202
- R2 Score: 0.7476


[3.625 3.625 3.625 3.625 3.625 3.625 3.625 3.625]
Lasso
Model performance for Training set
- Root Mean Squared Error: 1.1110
- Mean Absolute Error: 0.9688
- R2 Score: 0.0000
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 1.0680
- Mean Absolute Error: 1.0000
- R2 Score: -0.1406


[[5.06111679]
 [3.71920894]
 [3.88464735]
 [4.84073131]
 [2.23584611]
 [4.07576491]
 [2.11322981]
 [3.06945478]]
Ridge
Model performance for Training set
- Root Mean Squared Error: 0.1582
- Mean Absolute Error: 0.1389
- R2 Score: 0.9797
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.3954
-

  model.fit(X_train, y_train) # Train model


[4.999411  3.9988391 4.000277  4.999411  2.000416  4.000034  2.000416
 3.0011985]
XGBRegressor
Model performance for Training set
- Root Mean Squared Error: 0.0007
- Mean Absolute Error: 0.0006
- R2 Score: 1.0000
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.5300
- Mean Absolute Error: 0.5010
- R2 Score: 0.7191


[5. 4. 4. 5. 2. 4. 2. 3.]
AdaBoost Regressor
Model performance for Training set
- Root Mean Squared Error: 0.0000
- Mean Absolute Error: 0.0000
- R2 Score: 1.0000
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.7071
- Mean Absolute Error: 0.5000
- R2 Score: 0.5000




  y = column_or_1d(y, warn=True)


In [13]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)

Unnamed: 0,Model Name,R2_Score
2,Ridge,0.843636
0,Linear Regression,0.747552
6,XGBRegressor,0.719119
4,Decision Tree,0.5
7,AdaBoost Regressor,0.5
5,Random Forest Regressor,0.38175
3,K-Neighbors Regressor,0.18
1,Lasso,-0.140625


## KNN Performs the Best

In [14]:
    model = KNeighborsRegressor(n_neighbors=5)
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    print(y_train_pred)
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

[[4. ]
 [3. ]
 [3.4]
 [4. ]
 [3.4]
 [4.2]
 [3.4]
 [3.6]]
AdaBoost Regressor
Model performance for Training set
- Root Mean Squared Error: 0.9798
- Mean Absolute Error: 0.9000
- R2 Score: 0.2223


In [15]:
import pickle

In [16]:
with open('meat_quality_model.pkl', 'wb') as file:
    pickle.dump(model, file)