# Linear Regression with Time Series Data

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import pytz           #pythone time zone liberary
from pymongo import MongoClient
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Prepare Data
## Import

<b>Task 3.2.1: Complete the code to create a client that connects to the MongoDB server, assign the "air-quality" database to db, and assign the "nairobi" collection to nairobi.</b>

In [None]:
# Connection settings for the MongoDB server, kept in one place so they are easy to change.
MONGO_HOST = "localhost"
MONGO_PORT = 27017

client = MongoClient(host=MONGO_HOST, port=MONGO_PORT)

# Drill down from the server to the database, then to the collection of readings.
db = client["air-quality"]
nairobi = db["nairobi"]

<b>Task 3.2.2: Complete the wrangle function below so that the results from the database query are read into the DataFrame df. Be sure that the index of df is the "timestamp" from the results.</b>

In [None]:
def wrangle(collection, resample_rule="1h", max_p2=500, tz="Africa/Nairobi"):
    """Load PM2.5 ("P2") readings from ``collection`` into a modelling-ready frame.

    The function queries the readings for site 29, converts their timestamps
    to local time, removes outliers, aggregates to a regular time grid and
    adds a one-step lag feature.

    Parameters
    ----------
    collection
        Object exposing ``find(filter, projection=...)`` (for example a
        pymongo collection) that yields documents with ``"P2"`` and
        ``"timestamp"`` fields.
    resample_rule : str, default "1h"
        Resampling frequency passed to ``Series.resample``.
    max_p2 : float, default 500
        Only readings strictly below this value are kept.
    tz : str, default "Africa/Nairobi"
        Time zone the index is converted to.

    Returns
    -------
    pandas.DataFrame
        Indexed by the resampled, tz-aware timestamp, with columns ``"P2"``
        (mean reading per bin, gaps forward-filled) and ``"P2.L1"`` (the
        ``"P2"`` value of the previous bin). The first bin is dropped because
        it has no lag value.
    """
    results = collection.find(
        {"metadata.site": 29, "metadata.measurement": "P2"},
        projection={"P2": 1, "timestamp": 1, "_id": 0},
    )

    readings = pd.DataFrame(results).set_index("timestamp")

    # The stored timestamps are treated as UTC: tag them as such, then convert.
    # NOTE(review): assumes the driver returns naive datetimes; tz_localize
    # raises on an index that is already tz-aware.
    readings.index = readings.index.tz_localize("UTC").tz_convert(tz)

    # Remove outliers.
    readings = readings[readings["P2"] < max_p2]

    # Mean reading per bin; bins without readings take the previous bin's value.
    # `.ffill()` replaces the deprecated `fillna(method="ffill")`, and the
    # lowercase "h" alias replaces the deprecated "H".
    binned = readings["P2"].resample(resample_rule).mean().ffill().to_frame()

    # Lag feature: the previous bin's reading. Build a new frame instead of
    # mutating in place, then drop the first row, whose lag is undefined.
    return binned.assign(**{"P2.L1": binned["P2"].shift(1)}).dropna()

<b>Task 3.2.3: Use your wrangle function to read the data from the nairobi collection into the DataFrame df.</b>

In [2]:
# Pull the cleaned readings from the Nairobi collection.
df = wrangle(collection=nairobi)

# Quick sanity check: dimensions first, then the first five rows.
print(df.shape)
df.head(5)

NameError: name 'wrangle' is not defined

<b>Task 3.2.4: Add to your wrangle function so that the DatetimeIndex for df is localized to the correct timezone, "Africa/Nairobi". Don't forget to re-run all the cells above after you change the function.</b>

MongoDB stores the timestamps in UTC (Coordinated Universal Time), so we have to convert them to the Nairobi time zone.

In [None]:
# `wrangle` already returns a timezone-aware index, and `tz_localize` raises a
# TypeError on an index that is already aware. Recover the naive UTC timestamps
# first so this demonstration also works after "Restart & Run All".
naive_utc_index = (
    df.index
    if df.index.tz is None
    else df.index.tz_convert("UTC").tz_localize(None)
)

# Localizing only labels the timestamps as UTC; it does not yet convert them
# to the Nairobi time zone.
naive_utc_index.tz_localize("UTC")[:5]

In [None]:
# `wrangle` already returns a timezone-aware index, and `tz_localize` raises a
# TypeError on an index that is already aware. Recover the naive UTC timestamps
# first so this demonstration also works after "Restart & Run All".
naive_utc_index = (
    df.index
    if df.index.tz is None
    else df.index.tz_convert("UTC").tz_localize(None)
)

# Label the timestamps as UTC, then convert them to the Nairobi time zone.
# Nairobi is UTC+3, so the converted times read three hours ahead of UTC.
naive_utc_index.tz_localize("UTC").tz_convert("Africa/Nairobi")[:5]

# Explore

<b>Task 3.2.5: Create a boxplot of the "P2" readings in df.</b>

In [None]:
# Horizontal boxplot to expose the spread of the readings and any outliers.
fig, ax = plt.subplots(figsize=(15, 6))
df["P2"].plot.box(
    vert=False,
    title="Distribution of PM2.5 Readings",
    ax=ax,
)

<b>Task 3.2.6: Add to your wrangle function so that all "P2" readings above 500 are dropped from the dataset. Don't forget to re-run all the cells above after you change the function.</b>

In [None]:
# Keep only the readings strictly below 500.
is_below_cutoff = df["P2"] < 500
df = df[is_below_cutoff]  # add to wrangle()

<b>Task 3.2.7: Create a time series plot of the "P2" readings in df.</b>

In [None]:
fig, ax = plt.subplots(figsize=(15, 6))

# A plain line plot is the default choice for time series data.
df["P2"].plot.line(
    ax=ax,
    title="PM2.5 time series",
    xlabel="Time",
    ylabel="PM2.5",
)

<b>Task 3.2.8: Add to your wrangle function to resample df to provide the mean "P2" reading for each hour. Use a forward fill to impute any missing values. Don't forget to re-run all the cells above after you change the function.</b>

### Question: at what interval do we want to predict PM2.5 — every second, every minute, or every hour? Here we want one prediction per hour.

In [None]:
# Hourly mean with gaps forward-filled. `.ffill()` replaces the deprecated
# `fillna(method="ffill")`, and the lowercase "h" alias replaces the deprecated "H".
hourly_p2 = df["P2"].resample("1h").mean().ffill().to_frame()

# Count the missing values that remain after imputation.
hourly_p2.isnull().sum()

<b>Task 3.2.9: Plot the rolling average of the "P2" readings in df. Use a window size of 168 (the number of hours in a week).</b>

# Rolling Averages
A rolling average is the mean value of multiple subsets of numbers in a dataset. For example, I might have data relating to the daily income for a shop I own, and as long as the shop stays open, I can calculate a rolling average. On Friday, I might calculate the average income from Monday-Thursday. The next Monday, I might calculate the average income from Tuesday-Friday, and the next day, I might calculate the average income from Wednesday to Monday, and so on. These averages roll, giving me a sense for how the data is changing in relation to any kind of static construct. In this case, and in many data science applications, that construct is time. Calculating rolling averages is helpful for making accurate forecasts about the ways data will change in the future.

In [None]:
# 168 hourly observations make up one week of readings.
weekly_rolling_mean = df["P2"].rolling(window=168).mean()

fig, ax = plt.subplots(figsize=(15, 6))
weekly_rolling_mean.plot(ax=ax, ylabel="PM2.5", title="weekly rolling object");

<b>Task 3.2.10: Add to your wrangle function to create a column called "P2.L1" that contains the mean "P2" reading from the previous hour. Since this new feature will create NaN values in your DataFrame, be sure to also drop null rows from df.</b>

In [None]:
#add lag features
df["P2.L1"] = df["P2"].shift(1)
df.dropna(inplace=True)

<b>Task 3.2.11: Create a correlation matrix for df.</b>

In [None]:
# Pairwise correlation between the columns of df (the reading and its lag).
df.corr()

<b>Task 3.2.12: Create a scatter plot that shows the PM 2.5 mean reading for each hour as a function of the mean reading from the previous hour. In other words, "P2.L1" should be on the x-axis, and "P2" should be on the y-axis. Don't forget to label your axes!</b>

In [None]:
fig, ax = plt.subplots(figsize=(6, 6))

# Each point pairs a reading (y) with the reading from the previous hour (x).
ax.scatter(x=df["P2.L1"], y=df["P2"])

# Dashed y = x reference line: points on it are unchanged from the previous hour.
ax.plot([0, 120], [0, 120], linestyle="--", color="orange")

ax.set_xlabel("P2.L1")
ax.set_ylabel("P2")
ax.set_title("PM2.5 Autocorrelation")

# Split
## vertical split

<b>Task 3.2.13: Split the DataFrame df into the feature matrix X and the target vector y. Your target is "P2".</b>

In [None]:
# Separate the column to predict from the remaining feature column(s).
target = "P2"
X = df.drop(columns=[target])
y = df[target]

<b>Task 3.2.14: Split X and y into training and test sets. The first 80% of the data should be in your training set. The remaining 20% should be in the test set.</b>

In [None]:
# Chronological split: the first 80% of the rows train the model and the
# remaining 20% are held out for testing.
cutoff = int(0.8 * len(X))

X_train, X_test = X.iloc[:cutoff], X.iloc[cutoff:]
y_train, y_test = y.iloc[:cutoff], y.iloc[cutoff:]

In [None]:
# The two partitions together must account for every row of X.
len(X) == len(X_train) + len(X_test)

# Build Model
## Baseline

<b>Task 3.2.15: Calculate the baseline mean absolute error for your model.</b>

In [None]:
# Mean of the training target; the baseline below predicts this constant for every row.
y_mean = y_train.mean()

In [None]:
# Naive baseline: predict the training mean for every training observation.
n_train = len(y_train)
y_pred_baseline = [y_mean] * n_train
mae_baseline = mean_absolute_error(y_train, y_pred_baseline)

print("Mean P2 Reading:", round(y_mean, 2))
print("Baseline MAE:", round(mae_baseline, 2))

# Iterate
<b>Task 3.2.16: Instantiate a LinearRegression model named model, and fit it to your training data.</b>

In [None]:
# Linear regression estimator with default settings; it is fitted in the next cell.
model = LinearRegression()

In [None]:
# Fit the model on the training portion only, so the test set stays unseen.
model.fit(X_train,y_train)

# Evaluate

<b>Task 3.2.17: Calculate the training and test mean absolute error for your model.</b>

In [None]:
# Predict on both partitions, then score each against its true values.
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

training_mae = mean_absolute_error(y_train, y_pred_train)
test_mae = mean_absolute_error(y_test, y_pred_test)

print("Training MAE:", round(training_mae, 2))
print("Test MAE:", round(test_mae, 2))

# Communicate Results

<b>Task 3.2.18: Extract the intercept and coefficient from your model.</b>

In [None]:
intercept = model.intercept_

# `coef_` is an array with one entry per feature. The model was fitted on the
# single feature "P2.L1", so take that scalar; otherwise the equation prints
# the coefficient with array brackets.
coefficient = model.coef_[0]

# Round only for display, matching the two-decimal MAE printouts.
print(f"P2 = {round(intercept, 2)} + ({round(coefficient, 2)} * P2.L1)")

<b>Task 3.2.19: Create a DataFrame df_pred_test that has two columns: "y_test" and "y_pred". The first should contain the true values for your test set, and the second should contain your model's predictions. Be sure the index of df_pred_test matches the index of y_test.</b>

In [None]:
# Put the true test values and the model's predictions side by side, indexed
# by the test set's timestamps.
test_predictions_array = model.predict(X_test)
df_pred_test = pd.DataFrame(
    {"y_test": y_test, "y_pred": test_predictions_array},
    index=y_test.index,
)
df_pred_test.head()

<b>Task 3.2.20: Create a time series line plot for the values in df_pred_test using plotly express. Be sure that the y-axis is properly labeled as "P2".</b>

In [None]:
# One line per column (true values and predictions); relabel the y-axis as "P2".
axis_labels = {"value": "P2"}
fig = px.line(data_frame=df_pred_test, labels=axis_labels)
fig.show()