In [1]:
import pandas as pd

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV

from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor


from sklearn.metrics import mean_squared_error, r2_score, make_scorer

In [5]:
df = pd.read_csv("X_train.csv")

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4960 entries, 0 to 4959
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   carID         4960 non-null   int64  
 1   brand         4960 non-null   object 
 2   model         4960 non-null   object 
 3   year          4960 non-null   int64  
 4   transmission  4960 non-null   object 
 5   mileage       4960 non-null   int64  
 6   fuelType      4960 non-null   object 
 7   tax           4960 non-null   float64
 8   mpg           4960 non-null   float64
 9   engineSize    4960 non-null   float64
dtypes: float64(3), int64(3), object(4)
memory usage: 387.6+ KB


In [7]:
df.head()


Unnamed: 0,carID,brand,model,year,transmission,mileage,fuelType,tax,mpg,engineSize
0,13207,hyundi,Santa Fe,2019,Semi-Auto,4223,Diesel,145.0,39.8,2.2
1,17314,vauxhall,GTC,2015,Manual,47870,Diesel,125.0,60.1,2.0
2,12342,audi,RS4,2019,Automatic,5151,Petrol,145.0,29.1,2.9
3,13426,vw,Scirocco,2016,Automatic,20423,Diesel,30.0,57.6,2.0
4,16004,skoda,Scala,2020,Semi-Auto,3569,Petrol,145.0,47.1,1.0


In [8]:
df['mpg'].value_counts().nunique()


66

In [9]:
for col in df.select_dtypes(include='object').columns:
  cardinality = df[col].nunique()
  le = LabelEncoder()
  if cardinality >= 5:
    df[col] = le.fit_transform(df[col])
  else:
    df = pd.get_dummies(df, columns=[col], dtype=int, drop_first=True)

In [10]:
num_col = df.select_dtypes(include='number').columns
scaler =  StandardScaler()
df[num_col] = scaler.fit_transform(df[num_col])

In [11]:
df.head()


Unnamed: 0,carID,brand,model,year,mileage,fuelType,tax,mpg,engineSize,transmission_Manual,transmission_Other,transmission_Semi-Auto
0,-1.189872,-0.385569,0.962707,0.78443,-0.848304,-0.79557,-0.088993,-0.295744,0.095814,-0.804205,-0.0142,1.634428
1,0.671451,1.19527,-0.526434,-0.602655,0.937516,-0.79557,-0.331725,0.2722,-0.157579,1.243463,-0.0142,-0.611835
2,-1.581897,-1.571198,0.542693,0.78443,-0.810335,1.316299,-0.088993,-0.595105,0.982689,-0.804205,-0.0142,-0.611835
3,-1.09062,1.59048,1.039074,-0.255884,-0.18548,-0.79557,-1.4847,0.202256,-0.157579,-0.804205,-0.0142,-0.611835
4,0.077749,0.404851,1.00089,1.131202,-0.875062,1.316299,-0.088993,-0.091508,-1.424542,-0.804205,-0.0142,1.634428


In [12]:
x = df.drop(columns=['mpg'])
y = df['mpg']

x_train, x_temp, y_train, y_temp = train_test_split(x,y, test_size=0.2, random_state=42)
x_test, x_val, y_test, y_val = train_test_split(x_temp, y_temp, test_size=0.5, random_state=42)

In [13]:
model = LGBMRegressor()

In [14]:
param_distribution={
  'max_depth':[None, 5,10,20,20],
  'min_samples_split':[2,5,8],
  'min_samples_leaf':[1,3,5],
  'max_features':[None,'sqrt','log2']
}

In [15]:
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
r2_scorer = make_scorer(r2_score)

In [16]:
random_search = RandomizedSearchCV(
  estimator=model,
  param_distributions=param_distribution,
  n_iter=20,
  cv=5,
  n_jobs=-1,
  refit='r2',
  random_state=42
)

In [17]:
lgb_model = random_search.fit(x_train, y_train)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000536 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 705
[LightGBM] [Info] Number of data points in the train set: 3968, number of used features: 10
[LightGBM] [Info] Start training from score 0.009324


In [18]:
y_pred = lgb_model.predict(x_test)



In [19]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [20]:
print(f"Mse: {mse}, r2: {r2}")

Mse: 0.04124654442478441, r2: 0.9598801226840008
