In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s3e5/sample_submission.csv
/kaggle/input/playground-series-s3e5/train.csv
/kaggle/input/playground-series-s3e5/test.csv


In [31]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import lightgbm as lgb

In [4]:
# Read the competition's train and test CSV files, then preview the first
# five training rows.
traindata = pd.read_csv("/kaggle/input/playground-series-s3e5/train.csv")
testdata = pd.read_csv("/kaggle/input/playground-series-s3e5/test.csv")
traindata.head()

Unnamed: 0,Id,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,0,8.0,0.5,0.39,2.2,0.073,30.0,39.0,0.99572,3.33,0.77,12.1,6
1,1,9.3,0.3,0.73,2.3,0.092,30.0,67.0,0.99854,3.32,0.67,12.8,6
2,2,7.1,0.51,0.03,2.1,0.059,3.0,12.0,0.9966,3.52,0.73,11.3,7
3,3,8.1,0.87,0.22,2.6,0.084,11.0,65.0,0.9973,3.2,0.53,9.8,5
4,4,8.5,0.36,0.3,2.3,0.079,10.0,45.0,0.99444,3.2,1.36,9.5,6


In [5]:
# Count the missing values in each column of the training data.
traindata.isna().sum()

Id                      0
fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [34]:
# Features: every training column except the row id, the target, and 'citric acid'.
X = traindata.drop(columns=['Id', 'quality', 'citric acid'])
# Target: the 'quality' column.
y = traindata['quality']

In [35]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [36]:
# Fit the scaler on the training split only, then apply that same fitted
# transformation to both the training and the hold-out split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
def evaluate_model(y_true, y_pred, model_name):
    """Print and return regression metrics for one set of predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values to compare against ``y_true``.
        model_name: Label printed as the heading of the report.

    Returns:
        dict: The computed scores under the keys ``'MSE'``, ``'MAE'`` and ``'R2'``.
    """
    # Compute the three metrics in the order they are reported.
    mse, mae, r2 = (
        metric(y_true, y_pred)
        for metric in (mean_squared_error, mean_absolute_error, r2_score)
    )

    # One print call with a newline separator emits the same four lines.
    print(
        f"\n{model_name}:",
        f"Mean Squared Error: {mse:.4f}",
        f"Mean Absolute Error: {mae:.4f}",
        f"R2 Score: {r2:.4f}",
        sep="\n",
    )

    return {'MSE': mse, 'MAE': mae, 'R2': r2}

In [24]:
# Random Forest: fit on the scaled training split, score on the hold-out split.
RandomForestModel = RandomForestRegressor(random_state=42).fit(X_train_scaled, y_train)
RF_y_pred = RandomForestModel.predict(X_test_scaled)
rf_results = evaluate_model(y_true=y_test, y_pred=RF_y_pred, model_name="Random Forest")


Random Forest:
Mean Squared Error: 0.5159
Mean Absolute Error: 0.5425
R2 Score: 0.2479


In [25]:
# Gradient Boosting: fit on the scaled training split, score on the hold-out split.
GradientBoostingModel = GradientBoostingRegressor(random_state=42).fit(X_train_scaled, y_train)
GB_y_pred = GradientBoostingModel.predict(X_test_scaled)
gb_results = evaluate_model(y_true=y_test, y_pred=GB_y_pred, model_name="Gradient Boosting")


Gradient Boosting:
Mean Squared Error: 0.4935
Mean Absolute Error: 0.5250
R2 Score: 0.2805


In [13]:
# Decision Tree: fit on the scaled training split, score on the hold-out split.
DecisionTreeModel = DecisionTreeRegressor(random_state=42).fit(X_train_scaled, y_train)
DT_y_pred = DecisionTreeModel.predict(X_test_scaled)
dt_results = evaluate_model(y_true=y_test, y_pred=DT_y_pred, model_name="Decision Tree")



Decision Tree:
Mean Squared Error: 0.9830
Mean Absolute Error: 0.6966
R2 Score: -0.4330


In [14]:
# K-Nearest Neighbors with default settings: fit on the scaled training split,
# score on the hold-out split.
KNNModel = KNeighborsRegressor().fit(X_train_scaled, y_train)
KNN_y_pred = KNNModel.predict(X_test_scaled)
knn_results = evaluate_model(y_true=y_test, y_pred=KNN_y_pred, model_name="KNN")


KNN:
Mean Squared Error: 0.6230
Mean Absolute Error: 0.6053
R2 Score: 0.0918


In [15]:
# XGBoost: seeded regressor fitted on the scaled training split.
XGBoostModel = XGBRegressor(random_state=42)
XGBoostModel.fit(X_train_scaled, y_train)

# Score the fitted model on the hold-out split.
XGB_y_pred = XGBoostModel.predict(X_test_scaled)
xgb_results = evaluate_model(y_true=y_test, y_pred=XGB_y_pred, model_name="XGBoost")


XGBoost:
Mean Squared Error: 0.5700
Mean Absolute Error: 0.5711
R2 Score: 0.1691


In [16]:
# CatBoost: seeded regressor with training logs silenced (verbose=0),
# fitted on the scaled training split.
CatBoostModel = CatBoostRegressor(random_state=42, verbose=0)
CatBoostModel.fit(X_train_scaled, y_train)

# Score the fitted model on the hold-out split.
CatBoost_y_pred = CatBoostModel.predict(X_test_scaled)
catboost_results = evaluate_model(y_true=y_test, y_pred=CatBoost_y_pred, model_name="CatBoost")


CatBoost:
Mean Squared Error: 0.5241
Mean Absolute Error: 0.5441
R2 Score: 0.2360


In [37]:
# LightGBM Regressor
# Seeded with the same random_state as the other models in this notebook so
# the fit stays reproducible even if subsampling options are enabled later.
lgbm_model = lgb.LGBMRegressor(random_state=42)
lgbm_model.fit(X_train_scaled, y_train)

# Score the fitted model on the hold-out split
lgbm_y_pred = lgbm_model.predict(X_test_scaled)
lgbm_results = evaluate_model(y_test, lgbm_y_pred, "lgbm_model")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000328 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 821
[LightGBM] [Info] Number of data points in the train set: 1644, number of used features: 10
[LightGBM] [Info] Start training from score 5.717762

lgbm_model:
Mean Squared Error: 0.5585
Mean Absolute Error: 0.5613
R2 Score: 0.1858


In [17]:
# Voting Regressor: combines the six base regressors listed below.
VotingModel = VotingRegressor(
    estimators=[
        ('rf', RandomForestModel),
        ('gb', GradientBoostingModel),
        ('dt', DecisionTreeModel),
        ('knn', KNNModel),
        ('xgb', XGBoostModel),
        ('catboost', CatBoostModel),
    ]
)
VotingModel.fit(X_train_scaled, y_train)

# Score the ensemble on the hold-out split.
Voting_y_pred = VotingModel.predict(X_test_scaled)
voting_results = evaluate_model(y_true=y_test, y_pred=Voting_y_pred, model_name="Voting Regressor")


Voting Regressor:
Mean Squared Error: 0.5113
Mean Absolute Error: 0.5470
R2 Score: 0.2547


In [39]:
# Build the test feature matrix the same way the training features were built:
# drop the same non-feature columns, then apply the scaler fitted on the
# training split. lgbm_model was trained on scaled inputs, so predicting on
# raw values would evaluate every tree split on the wrong scale.
test_features = testdata.drop(['Id', 'citric acid'], axis=1)
test_features_scaled = scaler.transform(test_features)
test_predictions = lgbm_model.predict(test_features_scaled)

# Round predictions to the nearest integer and convert to int
test_predictions_rounded = test_predictions.round().astype(int)

# Create submission file with rounded predictions
submission = pd.DataFrame({'Id': testdata['Id'], 'quality': test_predictions_rounded})
submission.to_csv('submission.csv', index=False)

print("\nSubmission file created successfully with rounded predictions!")



Submission file created successfully with rounded predictions!
