In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sqlalchemy import create_engine
from urllib.parse import quote_plus

In [33]:
# SQLAlchemy connectable
# DEFAULT engine = create_engine("postgresql://USERNAME:%s@HOST/mydatabase" % quote_plus("Password"))
engine = create_engine("postgresql://postgres:%s@localhost/project4" % quote_plus("*******"))
engine.connect()

<sqlalchemy.engine.base.Connection at 0x26a66c9b8b0>

In [34]:
# Read in SQL files
regional_df = pd.read_sql_table('regional',engine)
regional_df.head()

Unnamed: 0,doeid,regionc,division,state_postal,ba_climate,totalbtu,totaldol
0,100001,WEST,Mountain South,NM,Mixed-Dry,144647.71,2656.89
1,100002,SOUTH,West South Central,AR,Mixed-Humid,28034.61,975.0
2,100003,WEST,Mountain South,NM,Mixed-Dry,30749.71,522.65
3,100004,SOUTH,South Atlantic,SC,Mixed-Humid,86765.19,2061.77
4,100005,NORTHEAST,Middle Atlantic,NJ,Mixed-Humid,59126.93,1463.04


In [35]:
# Create DataFrame with regional information
regional_df = regional_df.drop(columns=['doeid', 'totaldol'])
regional_df.head()

Unnamed: 0,regionc,division,state_postal,ba_climate,totalbtu
0,WEST,Mountain South,NM,Mixed-Dry,144647.71
1,SOUTH,West South Central,AR,Mixed-Humid,28034.61
2,WEST,Mountain South,NM,Mixed-Dry,30749.71
3,SOUTH,South Atlantic,SC,Mixed-Humid,86765.19
4,NORTHEAST,Middle Atlantic,NJ,Mixed-Humid,59126.93


In [36]:
#Checking nulls
regional_df.isnull().sum()

regionc         0
division        0
state_postal    0
ba_climate      0
totalbtu        0
dtype: int64

In [37]:
# Define a function to perform binning on TOTALBTU column
def bin_total_btu(total_btu):
    if total_btu < 55000:
        return 'Low'
    elif total_btu >= 55000 and total_btu < 95000:
        return 'Medium'
    else:
        return 'High'

In [39]:
# Apply binning function to create a new column 'BTU_Bin'
regional_df['BTU_Bin'] = regional_df['totalbtu'].apply(bin_total_btu)

In [40]:
# Check the balance of bins
bin_counts = regional_df['BTU_Bin'].value_counts()

In [41]:
# Display the counts of samples in each bin
print("Counts of samples in each bin:")
print(bin_counts)

Counts of samples in each bin:
BTU_Bin
High      6240
Low       6194
Medium    6062
Name: count, dtype: int64


In [43]:
# Perform one-hot encoding on the 'BTU_Bin' column
X_encoded = pd.get_dummies(regional_df.drop(['totalbtu'], axis=1), drop_first=True)
y = regional_df['totalbtu']

In [44]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.05, random_state=42)

In [45]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [46]:
# Initialize the Linear Regression model
lr_model = LinearRegression()

# Train the model
lr_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = lr_model.predict(X_test_scaled)

# Evaluate model performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2): {r2}")

Mean Squared Error (MSE): 567474693.1939898
R-squared (R2): 0.7529492375445945


In [47]:
print(f"The predicted value is: {y_pred[0]}")

The predicted value is: 33859.61484126168
