In [None]:
from google.colab import files
import pandas as pd

uploaded = files.upload()
# Load training data
train = pd.read_csv("train.csv")

# Preview
train.head()

# Check info and missing values
train.info()
train.isna().sum()

Saving train.csv to train.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   1000 non-null   int64  
 1   Building Type        1000 non-null   object 
 2   Square Footage       1000 non-null   int64  
 3   Number of Occupants  1000 non-null   int64  
 4   Appliances Used      1000 non-null   int64  
 5   Average Temperature  1000 non-null   float64
 6   Day of Week          1000 non-null   object 
 7   Energy Consumption   1000 non-null   float64
dtypes: float64(2), int64(4), object(2)
memory usage: 62.6+ KB


Unnamed: 0,0
ID,0
Building Type,0
Square Footage,0
Number of Occupants,0
Appliances Used,0
Average Temperature,0
Day of Week,0
Energy Consumption,0


In [None]:
# Step 2. Preprocessing
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Separate features and target
X = train.drop(columns=["Energy Consumption"])
y = train["Energy Consumption"]

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include='object').columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# One-hot encode categorical columns
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_cat = pd.DataFrame(ohe.fit_transform(X[categorical_cols]), columns=ohe.get_feature_names_out(categorical_cols))

# Scale numerical columns
scaler = StandardScaler()
X_num = pd.DataFrame(scaler.fit_transform(X[numerical_cols]), columns=numerical_cols)

# Combine all processed columns
X_processed = pd.concat([X_num, X_cat], axis=1)

# Display processed features
X_processed.head()

Unnamed: 0,ID,Square Footage,Number of Occupants,Appliances Used,Average Temperature,Building Type_Commercial,Building Type_Industrial,Building Type_Residential,Day of Week_Weekday,Day of Week_Weekend
0,-1.73032,-1.287805,0.951134,-1.106957,1.012925,0.0,0.0,1.0,1.0,0.0
1,-1.726856,1.323516,0.606869,1.375645,-0.825544,1.0,0.0,0.0,1.0,0.0
2,-1.723391,-0.434466,-0.391498,-0.610436,-1.164652,0.0,1.0,0.0,0.0,1.0
3,-1.719927,-0.853716,-1.183306,1.091919,1.430504,0.0,0.0,1.0,1.0,0.0
4,-1.716463,-0.846017,-0.770188,-0.539505,-1.498155,1.0,0.0,0.0,1.0,0.0


In [None]:
# Step 3. Train-Test Split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)
print("Train:", X_train.shape, "Test:", X_test.shape)

Train: (800, 10) Test: (200, 10)


In [None]:
# Step 4. Model Training
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
# Step 5. Model Evaluation
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

y_pred = model.predict(X_test)

def evaluate_model(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    results = pd.DataFrame([[mae, mse, rmse, r2]], columns=["MAE", "MSE", "RMSE", "R2"])
    return results

evaluate_model(y_test, y_pred)


Unnamed: 0,MAE,MSE,RMSE,R2
0,0.011425,0.000188,0.013728,1.0


In [None]:
# Step 6. Prediction
from google.colab import files
uploaded = files.upload()
# Load and preprocess test data
test = pd.read_csv('test.csv')

# Preprocess test features (same as training)
X_test_cat = pd.DataFrame(ohe.transform(test[categorical_cols]), columns=ohe.get_feature_names_out(categorical_cols))
X_test_num = pd.DataFrame(scaler.transform(test[numerical_cols]), columns=numerical_cols)
X_test_final = pd.concat([X_test_num, X_test_cat], axis=1)

# Align columns (ensure same order/number as training)
for col in X_processed.columns:
    if col not in X_test_final.columns:
        X_test_final[col] = 0
X_test_final = X_test_final[X_processed.columns]

y_test_pred = model.predict(X_test_final)

Saving test.csv to test.csv


In [None]:
# Step 7. Submission
submission = pd.DataFrame({
    "ID": test["ID"] if "ID" in test.columns else range(len(y_test_pred)),
    "Energy Consumption": y_test_pred
})

submission.to_csv("submission.csv", index=False)
submission.head()

Unnamed: 0,ID,Energy Consumption
0,0,2865.550031
1,1,4283.800222
2,2,5067.849921
3,3,4624.299166
4,4,4820.601901


In [None]:
from google.colab import files
files.download("submission.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>