In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import statsmodels.formula.api as smf

In [None]:
# Load the raw baseflow table and derive calendar columns in one chained step.
# NOTE(review): 693963 combined with origin='1900-01-01' presumably converts a
# "days since year 0" serial number into a calendar date -- confirm the exact
# convention (and possible one-day offset) against the data source.
df = (
    pd.read_csv("RRCA_baseflow.csv")
    .assign(
        # Shift the serial day number so it is relative to the origin used below.
        Date=lambda frame: frame['Date'] - 693963,
        # Interpret the shifted value as a number of days after 1900-01-01.
        Date_YMD=lambda frame: pd.to_datetime(frame['Date'], origin='1900-01-01', unit='D'),
        # Calendar month (1-12), used later for the monthly plots.
        Month=lambda frame: frame['Date_YMD'].dt.month,
    )
)
display(df)
# Optional export of the augmented table:
# df.to_csv('RRCA_baseflow_YMD.csv')


In [None]:
# Count missing values per column.
print(df.isna().sum())

In [None]:
# Distinct segment identifiers, in order of first appearance in the table.
unique_segment_ids = pd.unique(df['Segment_id'])
print(unique_segment_ids)

In [None]:
# Show the table dimensions as (number of rows, number of columns).
df.shape

In [None]:
# Exploratory plots over the whole table (all segments pooled together).
predictors = ['Evapotranspiration', 'Precipitation', 'Irrigation_pumping']
predictors_with_observed = ['Evapotranspiration', 'Precipitation', 'Irrigation_pumping', 'Observed']

# Figure 1: each predictor plotted against the observed baseflow.
scatter_fig = plt.figure(figsize=(16, 12))
for slot, col in enumerate(predictors, start=1):
    ax = scatter_fig.add_subplot(2, 3, slot)
    sns.scatterplot(data=df, x=col, y='Observed', ax=ax)
    ax.set_title(f'{col} vs Observed Baseflow')
scatter_fig.tight_layout()
plt.show()

# Figure 2: histogram with a KDE overlay for every variable.
hist_fig = plt.figure(figsize=(16, 10))
for slot, col in enumerate(predictors_with_observed, start=1):
    ax = hist_fig.add_subplot(2, 3, slot)
    sns.histplot(df[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
hist_fig.tight_layout()
plt.show()

# Figure 3: mean of every variable per calendar month (no error band).
monthly_fig = plt.figure(figsize=(30, 8))
for slot, col in enumerate(predictors_with_observed, start=1):
    ax = monthly_fig.add_subplot(1, 4, slot)
    sns.lineplot(data=df, x='Month', y=col, estimator='mean', errorbar=None, color='blue', ax=ax)
    ax.set_title(f'Monthly Average of {col}')
    ax.set_xlabel('Month')
    ax.set_ylabel(col)
    ax.grid(True)
monthly_fig.tight_layout()
plt.show()

In [None]:
# NOTE: disabled single-segment prototype (segment 144). Everything in this cell
# is commented out and does nothing when run. The active loop over
# `unique_segment_ids` further down draws the same scatter, over-time and
# distribution plots for every segment.
# specific_segment_id = 144 
# df_specific_segment = df[df['Segment_id'] == specific_segment_id]
# print(f"Data for Segment id {specific_segment_id}:")
# display(df_specific_segment.head())
# # df_specific_segment.to_csv(f'{specific_segment_id}segment.csv')

# plt.figure(figsize=(16, 12))
# predictors = ['Evapotranspiration', 'Precipitation', 'Irrigation_pumping']

# for i, col in enumerate(predictors, 1):
#     plt.subplot(2, 3, i)
#     sns.scatterplot(data=df_specific_segment, x=col, y='Observed')
#     plt.title(f'{col} vs Observed Baseflow')
# plt.tight_layout()
# plt.show()

# plt.figure(figsize=(30, 8))
# predictors_with_observed = ['Evapotranspiration', 'Precipitation', 'Irrigation_pumping','Observed']
# for i, col in enumerate(predictors_with_observed, 1):
#     plt.subplot(1, 4, i)
#     sns.scatterplot(data=df_specific_segment, x='Date_YMD', y=col, label=col, color='blue', alpha=0.6)
#     plt.title(f'{col} Over Time')
#     plt.xlabel('Date')
#     plt.ylabel(col)
#     plt.xticks(rotation=45)
#     plt.grid(True)
# plt.tight_layout()
# plt.show()

# plt.figure(figsize=(16, 10))
# for i, col in enumerate(predictors_with_observed, 1):
#     plt.subplot(2, 3, i)
#     sns.histplot(df_specific_segment[col], kde=True)
#     plt.title(f'Distribution of {col}')

# plt.tight_layout()
# plt.show()

In [None]:
# NOTE: disabled single-segment prototype (segment 171). Everything in this cell
# is commented out and does nothing when run. The active loop over
# `unique_segment_ids` further down produces the same plots and the same
# sklearn / statsmodels regression output for every segment.
# specific_segment_id = 171 
# df_specific_segment = df[df['Segment_id'] == specific_segment_id]
# print(f"Data for Segment id {specific_segment_id}:")
# display(df_specific_segment.head())
# # df_specific_segment.to_csv(f'{specific_segment_id}segment.csv')

# predictors = ['Evapotranspiration', 'Precipitation', 'Irrigation_pumping']
# target = 'Observed'

# # plt.figure(figsize=(16, 12))
# # for i, col in enumerate(predictors, 1):
# #     plt.subplot(2, 3, i)
# #     sns.scatterplot(data=df_specific_segment, x=col, y='Observed')
# #     plt.title(f'{col} vs Observed Baseflow')
# # plt.tight_layout()
# # plt.show()

# plt.figure(figsize=(30, 8))
# predictors_with_observed = ['Evapotranspiration', 'Precipitation', 'Irrigation_pumping','Observed']

# # for i, col in enumerate(predictors_with_observed, 1):
# #     plt.subplot(1, 4, i)
# #     sns.scatterplot(data=df_specific_segment, x='Date_YMD', y=col, label=col, color='blue', alpha=0.6)
# #     plt.title(f'{col} Over Time')
# #     plt.xlabel('Date')
# #     plt.ylabel(col)
# #     plt.xticks(rotation=45)
# #     plt.grid(True)
# # plt.tight_layout()
# # plt.show()

# plt.figure(figsize=(16, 10))
# for i, col in enumerate(predictors_with_observed, 1):
#     plt.subplot(2, 3, i)
#     sns.histplot(df_specific_segment[col], kde=True)
#     plt.title(f'Distribution of {col}')
# plt.tight_layout()
# plt.show()

# plt.figure(figsize=(30, 8))
# for i, col in enumerate(predictors_with_observed, 1):
#     plt.subplot(1, 4, i)
#     sns.lineplot(data=df_specific_segment, x='Month', y=col, estimator='mean', errorbar=None, color='blue')
#     plt.title(f'Monthly Average of {col}')
#     plt.xlabel('Month')
#     plt.ylabel(col)
#     plt.grid(True)
# plt.tight_layout()
# plt.show()

# x = df_specific_segment[predictors].values
# y = df_specific_segment[target].values

# lm = LinearRegression()
# lm.fit(x, y)
# print(lm.intercept_)
# print(lm.coef_)

# formula = 'Observed ~ Evapotranspiration + Precipitation + Irrigation_pumping'
# lms = smf.ols(formula=formula, data=df_specific_segment).fit()

# print("Coefficients")
# display(lms.params)

# print("Confidence Intervals")
# display(lms.conf_int())

# print("Model Summary")
# print(lms.summary())

# print("P-values")
# display(lms.pvalues)


#### Per-segment analysis: for each segment, show its data, its distributions and trends over time, and a linear regression fit

In [9]:
# Column groups and the regression formula are identical for every segment,
# so they are defined once instead of being rebuilt on each loop iteration.
predictors = ['Evapotranspiration', 'Precipitation', 'Irrigation_pumping']
target = ['Observed']
predictors_with_observed = ['Evapotranspiration', 'Precipitation', 'Irrigation_pumping','Observed']
formula = 'Observed ~ Evapotranspiration + Precipitation + Irrigation_pumping'


def _show_panels(columns, figsize, grid, draw_panel):
    """Draw one subplot per column on a new figure, show it, then close it.

    Parameters
    ----------
    columns : list of str
        Column names; one panel is drawn for each, in order.
    figsize : tuple
        Figure size passed to ``plt.figure``.
    grid : tuple
        ``(n_rows, n_cols)`` of the subplot grid.
    draw_panel : callable
        Called as ``draw_panel(ax, col)`` to fill a single panel.
    """
    fig = plt.figure(figsize=figsize)
    for position, col in enumerate(columns, 1):
        ax = fig.add_subplot(grid[0], grid[1], position)
        draw_panel(ax, col)
    fig.tight_layout()
    plt.show()
    # Several figures are created per segment; close each one explicitly so
    # they do not pile up in backends that keep figures alive after show().
    plt.close(fig)


def _plot_segment_overview(segment_df):
    """Draw the five exploratory figures for the rows of a single segment.

    Parameters
    ----------
    segment_df : pandas.DataFrame
        Rows of ``df`` belonging to one segment. Must contain the predictor
        columns plus 'Observed', 'Date_YMD' and 'Month'.
    """

    def predictor_vs_observed(ax, col):
        sns.scatterplot(data=segment_df, x=col, y='Observed', ax=ax)
        ax.set_title(f'{col} vs Observed Baseflow')

    def value_over_time(ax, col):
        sns.scatterplot(data=segment_df, x='Date_YMD', y=col, label=col, color='blue', alpha=0.6, ax=ax)
        ax.set_title(f'{col} Over Time')
        ax.set_xlabel('Date')
        ax.set_ylabel(col)
        ax.tick_params(axis='x', labelrotation=45)
        ax.grid(True)

    def distribution(ax, col):
        sns.histplot(segment_df[col], kde=True, ax=ax)
        ax.set_title(f'Distribution of {col}')

    def monthly_average(ax, col):
        sns.lineplot(data=segment_df, x='Month', y=col, estimator='mean', color='blue', ax=ax)
        ax.set_title(f'Monthly Average of {col}')
        ax.set_xlabel('Month')
        ax.set_ylabel(col)
        ax.grid(True)

    def value_by_month(ax, col):
        sns.scatterplot(data=segment_df, x='Month', y=col, color='blue', alpha=0.6, ax=ax)
        ax.set_title(f'{col} by Month')
        ax.set_xlabel('Month')
        ax.set_ylabel(col)
        ax.grid(True)

    _show_panels(predictors, (16, 12), (2, 3), predictor_vs_observed)
    _show_panels(predictors_with_observed, (30, 8), (1, 4), value_over_time)
    _show_panels(predictors_with_observed, (16, 10), (2, 3), distribution)
    _show_panels(predictors_with_observed, (30, 8), (1, 4), monthly_average)
    _show_panels(predictors_with_observed, (30, 8), (1, 4), value_by_month)


# For every segment: preview its rows, draw the exploratory figures, then fit
# the same linear model with scikit-learn and with statsmodels.
for specific_segment_id in unique_segment_ids:
    df_specific_segment = df[df['Segment_id'] == specific_segment_id]
    print(f"Data for Segment id {specific_segment_id}:")
    display(df_specific_segment.head())
    # df_specific_segment.to_csv(f'{specific_segment_id}segment.csv')

    _plot_segment_overview(df_specific_segment)

    # The statsmodels formula API drops rows with missing values, whereas
    # scikit-learn raises on NaN input. Restrict both fits to the same
    # complete rows so they are comparable and the sklearn fit cannot crash.
    model_rows = df_specific_segment.dropna(subset=predictors + target)
    if model_rows.empty:
        print("No complete rows available for regression; skipping model fit.")
        continue

    x = model_rows[predictors].values
    y = model_rows[target].values.ravel()

    print("Sklearn")
    lm = LinearRegression()
    lm.fit(x, y)
    print(lm.intercept_)
    print(lm.coef_)

    print("Statsmodel")
    lms = smf.ols(formula=formula, data=model_rows).fit()

    print("Coefficients")
    display(lms.params)

    print("Confidence Intervals")
    display(lms.conf_int())

    print("Model Summary")
    print(lms.summary())

    print("P-values")
    display(lms.pvalues)