In [1]:
# Regression Models
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.svm import SVR
from matplotlib import pyplot as plt
from sklearn.pipeline import Pipeline
from lightgbm import LGBMRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.compose import ColumnTransformer
from sklearn.datasets import make_classification
from xgboost import XGBRegressor,XGBRFRegressor
from sklearn.neighbors import KNeighborsRegressor,KNeighborsTransformer
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.feature_selection import SelectKBest, f_regression, RFE, RFECV
from sklearn.preprocessing import LabelEncoder, StandardScaler ,OneHotEncoder, MinMaxScaler, PolynomialFeatures

from sklearn.model_selection import (train_test_split, KFold, cross_val_score, GridSearchCV, StratifiedKFold, learning_curve,
cross_val_predict, cross_validate,permutation_test_score,validation_curve,StratifiedKFold,RandomizedSearchCV)

from sklearn.metrics import (accuracy_score, cohen_kappa_score, confusion_matrix, mean_squared_error, r2_score, 
root_mean_squared_error, recall_score, roc_auc_score, roc_curve, mean_absolute_error, auc)

from sklearn.ensemble import (AdaBoostRegressor, BaggingRegressor, GradientBoostingRegressor, RandomForestRegressor,
VotingRegressor, HistGradientBoostingRegressor, ExtraTreesRegressor, StackingRegressor)

In [2]:
df = pd.read_csv('airquality.csv')
df

Unnamed: 0,Ozone,Solar.R,Wind,Temp,Month,Day
0,41.0,190.0,7.4,67,5,1
1,36.0,118.0,8.0,72,5,2
2,12.0,149.0,12.6,74,5,3
3,18.0,313.0,11.5,62,5,4
4,,,14.3,56,5,5
...,...,...,...,...,...,...
148,30.0,193.0,6.9,70,9,26
149,,145.0,13.2,77,9,27
150,14.0,191.0,14.3,75,9,28
151,18.0,131.0,8.0,76,9,29


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153 entries, 0 to 152
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Ozone    116 non-null    float64
 1   Solar.R  146 non-null    float64
 2   Wind     153 non-null    float64
 3   Temp     153 non-null    int64  
 4   Month    153 non-null    int64  
 5   Day      153 non-null    int64  
dtypes: float64(3), int64(3)
memory usage: 7.3 KB


In [4]:
df.isna().sum()

Ozone      37
Solar.R     7
Wind        0
Temp        0
Month       0
Day         0
dtype: int64

In [27]:
#df['Ozone'].median()

In [5]:
df['Ozone'] = round(df['Ozone'].fillna(df['Ozone'].median()),2)
df['Solar.R'] = df['Solar.R'].fillna(df['Solar.R'].median())

In [6]:
df.head()

Unnamed: 0,Ozone,Solar.R,Wind,Temp,Month,Day
0,41.0,190.0,7.4,67,5,1
1,36.0,118.0,8.0,72,5,2
2,12.0,149.0,12.6,74,5,3
3,18.0,313.0,11.5,62,5,4
4,31.5,205.0,14.3,56,5,5


## LGBMRegressor

In [7]:
X = df.drop('Temp', axis=1)
y = df['Temp']
X_train_lgb, X_test_lgb, y_train_lgb, y_test_lgb = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
clf = LGBMRegressor()
clf.fit(X_train_lgb, y_train_lgb)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000077 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 127
[LightGBM] [Info] Number of data points in the train set: 122, number of used features: 5
[LightGBM] [Info] Start training from score 78.049180


In [9]:
X_test_lgb.head()

Unnamed: 0,Ozone,Solar.R,Wind,Month,Day
84,80.0,294.0,8.6,7,24
86,20.0,81.0,8.6,7,26
97,66.0,205.0,4.6,8,6
115,45.0,212.0,9.7,8,24
29,115.0,223.0,5.7,5,30


In [10]:
pred = clf.predict(X_test_lgb)
pred

array([86.21549076, 70.79150499, 86.90922178, 78.78296083, 74.51240475,
       77.21381912, 81.23481604, 74.15201924, 65.86081422, 66.3401776 ,
       62.23808683, 68.96767721, 71.39596194, 83.73049659, 79.42443416,
       89.04837967, 75.82492356, 75.84361719, 85.93804958, 86.16969711,
       72.43459108, 90.19787326, 81.58056308, 84.62737002, 76.32923471,
       71.68625639, 71.49529571, 77.03094924, 82.08561657, 79.08686986,
       71.16962511])

In [11]:
clf.score(X_test_lgb, y_test_lgb)

0.7119649369720817

In [14]:
predictions = pd.DataFrame(X_test_lgb)
predictions['Predicted_Temperature'] = pred.round(2)
predictions.head()

Unnamed: 0,Ozone,Solar.R,Wind,Month,Day,Predicted_Temperature
84,80.0,294.0,8.6,7,24,86.22
86,20.0,81.0,8.6,7,26,70.79
97,66.0,205.0,4.6,8,6,86.91
115,45.0,212.0,9.7,8,24,78.78
29,115.0,223.0,5.7,5,30,74.51


## XGBoostRegressor

In [15]:
X = df.drop('Temp', axis=1)
y = df['Temp']
X_train_xgb, X_test_xgb, y_train_xgb, y_test_xgb = train_test_split(X, y, test_size=0.2, random_state=42)
xgboost = XGBRegressor()
xgboost.fit(X_train_xgb, y_train_xgb)

In [16]:
xgboost.score(X_test_xgb, y_test_xgb)

0.6473846435546875

In [17]:
pred_xgb = xgboost.predict(X_test_xgb)
pred_xgb

array([87.18902 , 70.31855 , 81.83512 , 79.3342  , 80.12228 , 82.42776 ,
       81.28243 , 72.97087 , 69.909096, 72.96817 , 58.689884, 66.95644 ,
       65.677475, 85.466705, 77.71264 , 91.265465, 76.88148 , 78.3368  ,
       82.8886  , 85.65537 , 72.87533 , 90.88009 , 75.45803 , 87.44893 ,
       78.2009  , 72.00343 , 70.51311 , 76.44167 , 79.5062  , 85.22546 ,
       69.17322 ], dtype=float32)

In [23]:
xgb = pd.DataFrame(X_test_xgb)
xgb['Predicted_Temperature'] = np.round(pred_xgb, 2)
xgb.head()

Unnamed: 0,Ozone,Solar.R,Wind,Month,Day,Predicted_Temperature
84,80.0,294.0,8.6,7,24,87.190002
86,20.0,81.0,8.6,7,26,70.32
97,66.0,205.0,4.6,8,6,81.839996
115,45.0,212.0,9.7,8,24,79.330002
29,115.0,223.0,5.7,5,30,80.120003


## AdaBoostRegressor

In [24]:
X = df.drop('Temp', axis=1)
y = df['Temp']
X_train_ada, X_test_ada, y_train_ada, y_test_ada = train_test_split(X, y, test_size=0.2, random_state=42)
adaboost = AdaBoostRegressor()
adaboost.fit(X_train_ada, y_train_ada)

In [25]:
adaboost.score(X_test_ada, y_test_ada)

0.7062763904159084

In [26]:
ada_pred = adaboost.predict(X_test_ada)
ada_pred

array([86.63076923, 72.42857143, 86.09836066, 85.01818182, 81.        ,
       80.953125  , 83.76923077, 73.43333333, 66.30769231, 68.27272727,
       65.56666667, 69.64516129, 68.15384615, 83.4       , 81.        ,
       87.46575342, 75.61538462, 75.        , 85.51851852, 85.        ,
       71.71428571, 87.83636364, 81.75      , 85.57575758, 80.953125  ,
       75.92156863, 68.33333333, 77.07142857, 79.4       , 83.4       ,
       69.94117647])

In [28]:
adab_df = pd.DataFrame(X_test_ada)
adab_df['Predicted_Temperature'] = ada_pred.round(2) 
adab_df.head()

Unnamed: 0,Ozone,Solar.R,Wind,Month,Day,Predicted_Temperature
84,80.0,294.0,8.6,7,24,86.63
86,20.0,81.0,8.6,7,26,72.43
97,66.0,205.0,4.6,8,6,86.1
115,45.0,212.0,9.7,8,24,85.02
29,115.0,223.0,5.7,5,30,81.0


## RandomForestRegressor

In [51]:
X = df.drop('Temp', axis=1)
y = df['Temp']
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(X, y, test_size=0.2, random_state=0)
rf_reg = RandomForestRegressor(n_estimators=20)
rf_reg.fit(X_train_rf, y_train_rf)

In [52]:
rf_reg.score(X_test_rf, y_test_rf)

0.7004076929394132

In [53]:
rf_pred = rf_reg.predict(X_test_rf)
rf_pred

array([65.95, 80.35, 81.  , 85.3 , 62.65, 60.85, 85.4 , 84.05, 73.  ,
       79.05, 70.65, 80.4 , 80.55, 75.45, 81.05, 81.35, 80.65, 80.55,
       90.25, 87.35, 75.95, 61.4 , 77.8 , 83.85, 81.35, 89.6 , 77.75,
       77.25, 83.35, 79.15, 66.9 ])

In [54]:
rf_model = pd.DataFrame(X_test_rf)
rf_model['Predicted_Temperature'] = rf_pred.round(2)
rf_model.head()

Unnamed: 0,Ozone,Solar.R,Wind,Month,Day,Predicted_Temperature
26,31.5,205.0,8.0,5,27,65.95
135,28.0,238.0,6.3,9,13,80.35
63,32.0,236.0,9.2,7,3,81.0
105,65.0,157.0,9.7,8,14,85.3
24,31.5,66.0,16.6,5,25,62.65


## The best model is Lightgbm

In [55]:
# Check the data types and unique values
print(f"y_test_light type: {type(y_test_lgb)}")
print(f"light_pred type: {type(pred)}\n")

# Look at some samples
print("First few values of y_test_rf:", y_test_lgb[:5])
print("First few values of rf_pred:", pred[:5], '\n')

# Check for any continuous values
print("Are there any float values in y_test_rf?", any(isinstance(x, float) and not x.is_integer() for x in y_test_lgb))
print("Are there any float values in rf_pred?", any(isinstance(x, float) and not x.is_integer() for x in pred),'\n')

# If needed, convert to integer class labels
#y_test_rf_fixed = np.round(y_test_rf).astype(int)
#rf_pred_fixed = np.round(rf_pred).astype(int)

# Try the classification report again
#print(classification_report(y_test_rf_fixed, rf_pred_fixed))

y_test_light type: <class 'pandas.core.series.Series'>
light_pred type: <class 'numpy.ndarray'>

First few values of y_test_rf: 84     86
86     82
97     87
115    79
29     79
Name: Temp, dtype: int64
First few values of rf_pred: [86.21549076 70.79150499 86.90922178 78.78296083 74.51240475] 

Are there any float values in y_test_rf? False
Are there any float values in rf_pred? True 



In [56]:
print(type(y_test_rf))
print(type(rf_pred))
print(y_test_rf.dtype if hasattr(y_test_rf, 'dtype') else "N/A")
print(rf_pred.dtype if hasattr(rf_pred, 'dtype') else "N/A")

<class 'pandas.core.series.Series'>
<class 'numpy.ndarray'>
int64
float64
