In [None]:
# Notebook-wide imports: pandas for the data frames, matplotlib for the figures.
# NOTE(review): glob, numpy and seaborn are not referenced in the cells visible in
# this review — confirm whether later cells need them.
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the cell output.
%matplotlib inline
import warnings
# NOTE(review): this silences every warning for the rest of the session, including
# any deprecation warnings later cells might trigger — consider a narrower filter.
warnings.filterwarnings('ignore')

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive so files stored there can be opened by path.
drive.mount('/content/drive')
# Make IPython display the value of every bare expression in a cell, not only the
# last one. This affects how all later cells render their output.
get_ipython().ast_node_interactivity = 'all'

Mounted at /content/drive


In [None]:
# Path of the household power consumption CSV on the mounted Drive
file_path = '/content/drive/My Drive/household_power_consumption.csv'

# Read the CSV file into a DataFrame using pandas' default type inference.
# NOTE(review): a later dtypes listing in this notebook shows the measurement columns
# as `object`, so some cells presumably hold non-numeric text — confirm, and consider
# parsing them as numbers at load time.
df = pd.read_csv(file_path)

# Preview the first five rows
df.head()


Unnamed: 0,Date,Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
0,16/12/2006,17:24:00,4.216,0.418,234.84,18.4,0,1,17.0
1,16/12/2006,17:25:00,5.36,0.436,233.63,23.0,0,1,16.0
2,16/12/2006,17:26:00,5.374,0.498,233.29,23.0,0,2,17.0
3,16/12/2006,17:27:00,5.388,0.502,233.74,23.0,0,1,17.0
4,16/12/2006,17:28:00,3.666,0.528,235.68,15.8,0,1,17.0


In [None]:
# Inspect the column labels exactly as they were read from the CSV
print(df.columns)

Index(['Date ', 'Time', 'Global_active_power', 'Global_reactive_power',
       'Voltage', 'Global_intensity', 'Sub_metering_1', 'Sub_metering_2',
       'Sub_metering_3'],
      dtype='object')


In [None]:
# Trim surrounding whitespace from every header (the raw labels include 'Date ').
df.columns = [label.strip() for label in df.columns]
df.columns

Index(['Date', 'Time', 'Global_active_power', 'Global_reactive_power',
       'Voltage', 'Global_intensity', 'Sub_metering_1', 'Sub_metering_2',
       'Sub_metering_3'],
      dtype='object')

In [None]:
# Step 3: Reshape to the 'ds' / 'y' layout used by the forecasting cell:
#   - 'ds' is the Date and Time text joined with a single space
#   - 'y'  is Global_active_power under a new name
# The separate Date and Time columns are dropped once 'ds' exists.
df = (
    df.assign(ds=df['Date'] + ' ' + df['Time'])
      .drop(columns=['Date', 'Time'])
      .rename(columns={'Global_active_power': 'y'})
)

# Show the first rows of the reshaped frame
print(df.head())

# Step 4: Write the reshaped frame to a new CSV file, leaving out the index
df.to_csv('modified_data.csv', index=False)


       y Global_reactive_power Voltage Global_intensity Sub_metering_1  \
0  4.216                 0.418  234.84             18.4              0   
1   5.36                 0.436  233.63               23              0   
2  5.374                 0.498  233.29               23              0   
3  5.388                 0.502  233.74               23              0   
4  3.666                 0.528  235.68             15.8              0   

  Sub_metering_2  Sub_metering_3                   ds  
0              1            17.0  16/12/2006 17:24:00  
1              1            16.0  16/12/2006 17:25:00  
2              2            17.0  16/12/2006 17:26:00  
3              1            17.0  16/12/2006 17:27:00  
4              1            17.0  16/12/2006 17:28:00  


In [None]:
# Install the `prophet` package
!pip install prophet

# Verify the installation
import importlib.util
package_name = 'prophet'
spec = importlib.util.find_spec(package_name)
if spec is None:
    print(f"{package_name} is not installed")
else:
    print(f"{package_name} is installed")


prophet is installed


In [None]:
# Daily forecasting pipeline: parse timestamps, make the measurements numeric,
# aggregate to daily sums, hold out the last TEST_DAYS days, fit Prophet on the rest,
# then plot the forecast and report the mean squared error on the held-out days.

# Prophet is imported in this cell because the package is installed by an earlier
# cell of the notebook, after the top import cell has already run. The original
# code called `Prophet()` without importing it.
from prophet import Prophet

TEST_DAYS = 300  # number of trailing daily rows held out for evaluation

# Verify the loaded data
print("Loaded Data:")
print(df.head())
print("\nData Types:")
print(df.dtypes)

# Step 4: Convert 'ds' column to datetime
df['ds'] = pd.to_datetime(df['ds'], format='%d/%m/%Y %H:%M:%S')

# The measurement columns arrive as `object` (see the dtypes printed above), so they
# have to become numeric BEFORE aggregation: summing text concatenates it, and
# coercing that result afterwards turns every day into NaN, which leaves no rows to
# train on. Entries that cannot be parsed as numbers become NaN.
value_cols = df.columns.difference(['ds'])
df[value_cols] = df[value_cols].apply(pd.to_numeric, errors='coerce')

# Verify conversion to datetime / numeric
print("\nData after converting 'ds' to datetime:")
print(df.head())
print("\nData Types after conversion:")
print(df.dtypes)

# Step 5: Resample data to daily sum
# min_count=1 keeps a day without a single valid reading as NaN instead of a
# misleading 0, so the forward-fill in step 6 has something to fill.
df_daily = df.resample('D', on='ds').sum(min_count=1).reset_index()

# Check the columns and data after resampling
print("\nData after resampling:")
print(df_daily.head())
print("\nColumns in resampled DataFrame:")
print(df_daily.columns)

# Verify the 'y' column (daily sum of Global_active_power)
print("\nData in 'y' after resampling:")
print(df_daily['y'])

# Step 6: Fill missing values
# Assign the result back instead of `fillna(method='ffill', inplace=True)`: the
# `method` argument is deprecated, and an in-place fill on a column selection is not
# guaranteed to reach the parent frame.
df_daily['y'] = df_daily['y'].ffill()

# Verify if there are any missing values after filling
print("\nCheck for NaN values in 'y' after filling:")
print(df_daily['y'].isna().sum())

# Step 7: Prepare data for Prophet
# Only rows that still lack a value are dropped (forward-fill cannot fill leading
# gaps). dropna returns a new frame, so df_daily is left untouched.
df_prophet = df_daily[['ds', 'y']].dropna(subset=['y'])
assert pd.api.types.is_numeric_dtype(df_prophet['y']), "'y' must be numeric"

# Verify the DataFrame after conversion
print("\nData prepared for Prophet after conversion:")
print(df_prophet.head())

# Step 8: Split the data into training and test sets (last TEST_DAYS rows = test)
if len(df_prophet) > TEST_DAYS:
    train = df_prophet.iloc[:-TEST_DAYS]
    test = df_prophet.iloc[-TEST_DAYS:]
else:
    raise ValueError("Not enough data to split into train and test sets")

# Step 9: Initialize and fit the Prophet model
model = Prophet()
model.fit(train)

# Step 10: Make future predictions covering exactly the held-out horizon
future = model.make_future_dataframe(periods=len(test))
forecast = model.predict(future)

# Step 11: Plot forecast
fig1 = model.plot(forecast)
plt.title("Forecast of Global Active Power using Prophet")
plt.xlabel("Date")
plt.ylabel("Global Active Power (daily sum)")
plt.show()

# Step 12: Plot components
fig2 = model.plot_components(forecast)
plt.show()

# Step 13: Evaluate performance (Optional)
# Left join on 'ds': forecast dates with no matching test row get NaN in 'y', and
# mean() skips those, so the error is computed over the test dates only.
forecast_test = forecast.set_index('ds').join(test.set_index('ds'), lsuffix='_pred')
mse = ((forecast_test['yhat'] - forecast_test['y']) ** 2).mean()
print(f"Mean Squared Error (MSE) on Test Set: {mse}")




Loaded Data:
       y Global_reactive_power Voltage Global_intensity Sub_metering_1  \
0  4.216                 0.418  234.84             18.4              0   
1   5.36                 0.436  233.63               23              0   
2  5.374                 0.498  233.29               23              0   
3  5.388                 0.502  233.74               23              0   
4  3.666                 0.528  235.68             15.8              0   

  Sub_metering_2  Sub_metering_3                  ds  
0              1            17.0 2006-12-16 17:24:00  
1              1            16.0 2006-12-16 17:25:00  
2              2            17.0 2006-12-16 17:26:00  
3              1            17.0 2006-12-16 17:27:00  
4              1            17.0 2006-12-16 17:28:00  

Data Types:
y                                object
Global_reactive_power            object
Voltage                          object
Global_intensity                 object
Sub_metering_1                   object

ValueError: Not enough data to split into train and test sets