In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [2]:
train = pd.read_csv('dataset/train1000.csv')

In [3]:
building_meta = pd.read_csv('dataset/building_metadata1000.csv')

In [4]:
weather_train = pd.read_csv('dataset/weather_train1000.csv')

In [5]:
weather_test = pd.read_csv('dataset/weather_test1000.csv')

In [6]:
test = pd.read_csv('dataset/test1000.csv')

In [7]:
train = train.merge(building_meta, on='building_id', how='left')
train = train.merge(weather_train, on=['site_id', 'timestamp'], how='left')

In [8]:
train = train.drop(columns=['precip_depth_1_hr', 'wind_direction', 'wind_speed', 'timestamp'])
train = train.loc[train['meter'] == 0]

# Fill missing values in numeric columns with the mean
numeric_cols = train.select_dtypes(include=['float64', 'int64']).columns

for col in numeric_cols:
    train[col] = train[col].fillna(train[col].mean())


In [9]:
df = pd.get_dummies(train)
df.replace({True: 1, False: 0}, inplace=True)
print(df)

     building_id  meter  meter_reading  site_id  square_feet  year_built  \
0              0      0           0.00        0         7432      2008.0   
1              1      0           0.00        0         2720      2004.0   
2              2      0           0.00        0         5376      1991.0   
3              3      0           0.00        0        23685      2002.0   
4              4      0           0.00        0       116607      1975.0   
..           ...    ...            ...      ...          ...         ...   
986          799      0           0.00        7       527431      1976.0   
989          800      0         113.96        7        64583      1955.0   
992          801      0        3987.86        7       484376      1952.0   
995          802      0           0.00        7       290625      1995.0   
998          803      0        4791.84        7       182986      1962.0   

     floor_count  air_temperature  cloud_coverage  dew_temperature  ...  \
0       4.83

  df.replace({True: 1, False: 0}, inplace=True)


In [18]:
# Split the data into input features (X) and the target variable (y)
y = df["meter_reading"]
X = df.drop(columns=["meter_reading"])

print(X.columns)

Index(['building_id', 'meter', 'site_id', 'square_feet', 'year_built',
       'floor_count', 'air_temperature', 'cloud_coverage', 'dew_temperature',
       'sea_level_pressure', 'primary_use_Education',
       'primary_use_Entertainment/public assembly',
       'primary_use_Food sales and service', 'primary_use_Healthcare',
       'primary_use_Lodging/residential',
       'primary_use_Manufacturing/industrial', 'primary_use_Office',
       'primary_use_Other', 'primary_use_Parking',
       'primary_use_Public services', 'primary_use_Religious worship',
       'primary_use_Retail', 'primary_use_Technology/science',
       'primary_use_Utility', 'primary_use_Warehouse/storage'],
      dtype='object')


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
model = LinearRegression()

# Fit the model to the training data
model.fit(X_train, y_train)

In [13]:
# Predict the target variable for the test data
y_pred = model.predict(X_test)

# Evaluate the performance of the model
from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

Mean Squared Error: 61131.2673967468
R-squared: 0.3852612680659119


In [16]:
import pickle
model_out = open("model.pkl","wb")
pickle.dump(model, model_out)
model_out.close()