### Combine tables into one table

In [37]:
import pandas as pd


df1 = pd.read_csv("data manipulation/Plant_2_Generation_Data.csv")
df2 = pd.read_csv("data manipulation/Plant_2_Weather_Sensor_Data.csv")

merged_plant2 = pd.merge(df1, df2, on="DATE_TIME", how="inner")

merged_plant2.to_csv("data manipulation/merged_plant2.csv", index=False)

df = pd.read_csv("data manipulation/merged_plant2.csv")
print(df.head())
print(df.info())

             DATE_TIME  PLANT_ID_x     SOURCE_KEY_x  DC_POWER  AC_POWER  \
0  2020-05-15 00:00:00     4136001  4UPUqMRk7TRMgml       0.0       0.0   
1  2020-05-15 00:00:00     4136001  81aHJ1q11NBPMrL       0.0       0.0   
2  2020-05-15 00:00:00     4136001  9kRcWv60rDACzjR       0.0       0.0   
3  2020-05-15 00:00:00     4136001  Et9kgGMDl729KT4       0.0       0.0   
4  2020-05-15 00:00:00     4136001  IQ2d7wF4YD8zU1Q       0.0       0.0   

   DAILY_YIELD   TOTAL_YIELD  PLANT_ID_y     SOURCE_KEY_y  \
0  9425.000000  2.429011e+06     4136001  iq8k7ZNt4Mwm3w0   
1     0.000000  1.215279e+09     4136001  iq8k7ZNt4Mwm3w0   
2  3075.333333  2.247720e+09     4136001  iq8k7ZNt4Mwm3w0   
3   269.933333  1.704250e+06     4136001  iq8k7ZNt4Mwm3w0   
4  3177.000000  1.994153e+07     4136001  iq8k7ZNt4Mwm3w0   

   AMBIENT_TEMPERATURE  MODULE_TEMPERATURE  IRRADIATION  
0            27.004764           25.060789          0.0  
1            27.004764           25.060789          0.0  
2       

### Remove rows where IRRADIATION = 0.0

In [38]:
target_column = "IRRADIATION"

df_cleaned = df[df[target_column] != 0.0]

print(df_cleaned.head())
print("Data in df:", len(df))
print("Data in cleaned_df:", len(df_cleaned))

               DATE_TIME  PLANT_ID_x     SOURCE_KEY_x  DC_POWER  AC_POWER  \
506  2020-05-15 05:45:00     4136001  4UPUqMRk7TRMgml       0.0       0.0   
507  2020-05-15 05:45:00     4136001  81aHJ1q11NBPMrL       0.0       0.0   
508  2020-05-15 05:45:00     4136001  9kRcWv60rDACzjR       0.0       0.0   
509  2020-05-15 05:45:00     4136001  Et9kgGMDl729KT4       0.0       0.0   
510  2020-05-15 05:45:00     4136001  IQ2d7wF4YD8zU1Q       0.0       0.0   

     DAILY_YIELD   TOTAL_YIELD  PLANT_ID_y     SOURCE_KEY_y  \
506          0.0  2.429011e+06     4136001  iq8k7ZNt4Mwm3w0   
507          0.0  1.215279e+09     4136001  iq8k7ZNt4Mwm3w0   
508          0.0  2.247720e+09     4136001  iq8k7ZNt4Mwm3w0   
509          0.0  1.704250e+06     4136001  iq8k7ZNt4Mwm3w0   
510          0.0  1.994153e+07     4136001  iq8k7ZNt4Mwm3w0   

     AMBIENT_TEMPERATURE  MODULE_TEMPERATURE  IRRADIATION  
506            24.741274           23.786662     0.002838  
507            24.741274           23.

### Change to intervals of 30 min instead of 15 min

In [39]:
agg_rules = {
    # Keep IDs constant
    'PLANT_ID_x': 'first',
    'SOURCE_KEY_x': 'first',
    'PLANT_ID_y': 'first',
    'SOURCE_KEY_y': 'first',

    # Find mean of each column
    'DC_POWER': 'mean',
    'AC_POWER': 'mean',
    'AMBIENT_TEMPERATURE': 'mean',
    'MODULE_TEMPERATURE': 'mean',
    'IRRADIATION': 'mean',

    # Get total (last)
    'DAILY_YIELD': 'last',
    'TOTAL_YIELD': 'last'
}

# Convert to datetime
df_cleaned['DATE_TIME'] = pd.to_datetime(df_cleaned['DATE_TIME'])
# Use as main index for resampling
df_cleaned = df_cleaned.set_index('DATE_TIME')

# Apply the resampling and the aggregation rules
df_30min = df_cleaned.resample('30Min').agg(agg_rules)

print(df_30min.head())
print("Data in df_30min:", len(df_30min))

                     PLANT_ID_x     SOURCE_KEY_x  PLANT_ID_y     SOURCE_KEY_y  \
DATE_TIME                                                                       
2020-05-15 05:30:00   4136001.0  4UPUqMRk7TRMgml   4136001.0  iq8k7ZNt4Mwm3w0   
2020-05-15 06:00:00   4136001.0  4UPUqMRk7TRMgml   4136001.0  iq8k7ZNt4Mwm3w0   
2020-05-15 06:30:00   4136001.0  4UPUqMRk7TRMgml   4136001.0  iq8k7ZNt4Mwm3w0   
2020-05-15 07:00:00   4136001.0  4UPUqMRk7TRMgml   4136001.0  iq8k7ZNt4Mwm3w0   
2020-05-15 07:30:00   4136001.0  4UPUqMRk7TRMgml   4136001.0  iq8k7ZNt4Mwm3w0   

                       DC_POWER    AC_POWER  AMBIENT_TEMPERATURE  \
DATE_TIME                                                          
2020-05-15 05:30:00    0.000000    0.000000            24.741274   
2020-05-15 06:00:00   20.890292   20.168431            24.753349   
2020-05-15 06:30:00  107.615974  104.936699            24.917284   
2020-05-15 07:00:00  343.474848  337.473333            25.875010   
2020-05-15 07:30:00  503

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['DATE_TIME'] = pd.to_datetime(df_cleaned['DATE_TIME'])
