In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Load the dataset
file_path = 'AirQualityUCI.xlsx'  # Update this path if needed
data = pd.read_excel(file_path)

In [3]:
# Drop columns with all NaN values
data.dropna(axis=1, how='all', inplace=True)

In [5]:
# Replace ',' with '.' in numerical columns and convert to float
for col in data.columns:
    if data[col].dtype in ['int64', 'float64']:  # Check for numeric columns
        data[col] = data[col].apply(lambda x: float(str(x).replace(',', '.')) if isinstance(x, str) else x)
    # Optionally, you could also handle other numeric types like 'int32' or additional types as needed.


In [6]:
# Drop rows with any NaN values
data.dropna(inplace=True)


In [7]:
# Select features and target
# For example, let's predict CO(GT) based on other sensor readings
features = ['PT08.S1(CO)', 'PT08.S2(NMHC)', 'PT08.S3(NOx)', 'PT08.S4(NO2)', 'PT08.S5(O3)']
target = 'CO(GT)'

In [8]:
X = data[features]
y = data[target]

In [9]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

In [10]:
# Make predictions
y_pred = model.predict(X_test)

In [11]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [12]:
print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

Mean Squared Error: 5578.584283324195
R^2 Score: 0.055776880387623406


In [13]:
# Display the coefficients
coefficients = pd.DataFrame(model.coef_, X.columns, columns=['Coefficient'])
print(coefficients)

               Coefficient
PT08.S1(CO)       0.050902
PT08.S2(NMHC)     0.023331
PT08.S3(NOx)     -0.008762
PT08.S4(NO2)     -0.065336
PT08.S5(O3)       0.011765
