<h2> Importing the Libraries

In [2]:
import pandas as pd
import numpy as np

<h2> Loading and Inspecting the Data

In [3]:
# Load the dataset
data = pd.read_csv("train.csv")
data.head()

Unnamed: 0,index,beer/ABV,beer/beerId,beer/brewerId,beer/name,beer/style,review/appearance,review/aroma,review/overall,review/palate,review/taste,review/text,review/timeStruct,review/timeUnix,user/ageInSeconds,user/birthdayRaw,user/birthdayUnix,user/gender,user/profileName
0,40163,5.0,46634,14338,Chiostro,Herbed / Spiced Beer,4.0,4.0,4.0,4.0,4.0,Pours a clouded gold with a thin white head. N...,"{'min': 38, 'hour': 3, 'mday': 16, 'sec': 10, ...",1229398690,,,,,RblWthACoz
1,8135,11.0,3003,395,Bearded Pat's Barleywine,American Barleywine,4.0,3.5,3.5,3.5,3.0,12oz bottle into 8oz snifter.\t\tDeep ruby red...,"{'min': 38, 'hour': 23, 'mday': 8, 'sec': 58, ...",1218238738,,,,,BeerSox
2,10529,4.7,961,365,Naughty Nellie's Ale,American Pale Ale (APA),3.5,4.0,3.5,3.5,3.5,First enjoyed at the brewpub about 2 years ago...,"{'min': 7, 'hour': 18, 'mday': 26, 'sec': 2, '...",1101492422,,,,Male,mschofield
3,44610,4.4,429,1,Pilsner Urquell,Czech Pilsener,3.0,3.0,2.5,3.0,3.0,First thing I noticed after pouring from green...,"{'min': 7, 'hour': 1, 'mday': 20, 'sec': 5, 'y...",1308532025,1209827000.0,"Aug 10, 1976",208508400.0,Male,molegar76
4,37062,4.4,4904,1417,Black Sheep Ale (Special),English Pale Ale,4.0,3.0,3.0,3.5,2.5,A: pours an amber with a one finger head but o...,"{'min': 51, 'hour': 6, 'mday': 12, 'sec': 48, ...",1299912708,,,,,Brewbro000


In [4]:
# Display basic information
print(data.info())
print(data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37500 entries, 0 to 37499
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   index              37500 non-null  int64  
 1   beer/ABV           37500 non-null  float64
 2   beer/beerId        37500 non-null  int64  
 3   beer/brewerId      37500 non-null  int64  
 4   beer/name          37500 non-null  object 
 5   beer/style         37500 non-null  object 
 6   review/appearance  37500 non-null  float64
 7   review/aroma       37500 non-null  float64
 8   review/overall     37500 non-null  float64
 9   review/palate      37500 non-null  float64
 10  review/taste       37500 non-null  float64
 11  review/text        37490 non-null  object 
 12  review/timeStruct  37500 non-null  object 
 13  review/timeUnix    37500 non-null  int64  
 14  user/ageInSeconds  7856 non-null   float64
 15  user/birthdayRaw   7856 non-null   object 
 16  user/birthdayUnix  785

<h2> Preprocessing the Data

In [5]:
# Drop rows with missing target or critical features
data = data.dropna(subset=['review/overall', 'review/text'])

In [6]:
# Fill missing values
data['beer/ABV'].fillna(data['beer/ABV'].mean(), inplace=True)
data['beer/style'].fillna('Unknown', inplace=True)
data['review/text'].fillna('', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['beer/ABV'].fillna(data['beer/ABV'].mean(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['beer/ABV'].fillna(data['beer/ABV'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or 

<h2> Feature Engineering

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# Select features and target
X = data[['beer/ABV', 'beer/style', 'review/aroma', 'review/palate', 'review/taste', 'review/text']]
y = data['review/overall']

# ColumnTransformer to handle mixed data types
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['beer/ABV', 'review/aroma', 'review/palate', 'review/taste']),
        ('cat', OneHotEncoder(), ['beer/style']),
        ('text', TfidfVectorizer(max_features=500), 'review/text')
    ]
)

In [8]:
# Create a Pipeline
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR

# Pipelines for different models
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor())])

lr_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LinearRegression())])

svr_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', SVR())])

<h2> Train-Test Split

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [10]:
# Train the Models :-

# Random Forest
rf_pipeline.fit(X_train, y_train)

In [13]:
# Linear Regression
lr_pipeline.fit(X_train, y_train)

In [15]:
# Support Vector Machine
svr_pipeline.fit(X_train, y_train)

In [16]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Random Forest
rf_pred = rf_pipeline.predict(X_test)
rf_mse = mean_squared_error(y_test, rf_pred)
rf_mae = mean_absolute_error(y_test, rf_pred)
rf_r2 = r2_score(y_test, rf_pred)

# Linear Regression
lr_pred = lr_pipeline.predict(X_test)
lr_mse = mean_squared_error(y_test, lr_pred)
lr_mae = mean_absolute_error(y_test, lr_pred)
lr_r2 = r2_score(y_test, lr_pred)

# Support Vector Regressor
svr_pred = svr_pipeline.predict(X_test)
svr_mse = mean_squared_error(y_test, svr_pred)
svr_mae = mean_absolute_error(y_test, svr_pred)
svr_r2 = r2_score(y_test, svr_pred)

# Print results
print(f"Random Forest MSE: {rf_mse}")
print(f"Random Forest MAE: {rf_mae}")
print(f"Random Forest R²: {rf_r2}")

print(f"Linear Regression MSE: {lr_mse}")
print(f"Linear Regression MAE: {lr_mae}")
print(f"Linear Regression R²: {lr_r2}")

print(f"SVR MSE: {svr_mse}")
print(f"SVR MAE: {svr_mae}")
print(f"SVR R²: {svr_r2}")


Random Forest MSE: 0.16294632902107228
Random Forest MAE: 0.3078534275806882
Random Forest R²: 0.6823147885386154
Linear Regression MSE: 0.15667385999962138
Linear Regression MAE: 0.3044093203873547
Linear Regression R²: 0.6945437884764227
SVR MSE: 0.15424988731815623
SVR MAE: 0.2991233084282617
SVR R²: 0.69926964071571
Random Forest MSE: 0.16294632902107228
Random Forest MAE: 0.3078534275806882
Random Forest R²: 0.6823147885386154
Linear Regression MSE: 0.15667385999962138
Linear Regression MAE: 0.3044093203873547
Linear Regression R²: 0.6945437884764227
SVR MSE: 0.15424988731815623
SVR MAE: 0.2991233084282617
SVR R²: 0.69926964071571
