In [35]:
import pandas as pd
from datetime import datetime
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.tsa.arima.model import ARIMA
from sklearn.preprocessing import MinMaxScaler
import xgboost as xg

In [8]:
# Load data
url1 = "/kaggle/input/moistureminds22/user1_data.csv"
url2 = "/kaggle/input/moistureminds22/user2_data.csv"

df1 = pd.read_csv(url1)
df2 = pd.read_csv(url2)

In [9]:

# Handle missing values
df1 = df1.interpolate(method='linear', limit_direction='both')
df2 = df2.interpolate(method='linear', limit_direction='both')
df1.fillna(method='bfill', inplace=True)
df2.fillna(method='bfill', inplace=True)

df1.info()
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19341 entries, 0 to 19340
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ttime   19341 non-null  object 
 1   pm1     19341 non-null  float64
 2   pm2     19341 non-null  float64
 3   pm3     19341 non-null  float64
 4   am      19341 non-null  float64
 5   sm      19341 non-null  float64
 6   st      19341 non-null  float64
 7   lum     19341 non-null  float64
dtypes: float64(7), object(1)
memory usage: 1.2+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20166 entries, 0 to 20165
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ttime   20166 non-null  object 
 1   pm1     20166 non-null  float64
 2   pm2     20166 non-null  float64
 3   pm3     20166 non-null  float64
 4   am      20166 non-null  float64
 5   sm      20166 non-null  float64
 6   lum     20166 non-null  float64
 7   temp    20166 non-null 

In [11]:
# Convert the "ttime" column to datetime
df1["ttime"] = pd.to_datetime(df1["ttime"], errors='coerce')

# Convert the datetime values to Unix timestamp (number of seconds since January 1, 1970)
df1["ttime"] = df1["ttime"].apply(lambda x: x.timestamp())

# Convert the "ttime" column to float
df1["ttime"] = pd.to_numeric(df1["ttime"], errors='coerce').astype(float)

# Convert the "ttime" column to datetime
df2["ttime"] = pd.to_datetime(df2["ttime"], errors='coerce')

# Convert the datetime values to Unix timestamp (number of seconds since January 1, 1970)
df2["ttime"] = df2["ttime"].apply(lambda x: x.timestamp())

# Convert the "ttime" column to float
df2["ttime"] = pd.to_numeric(df2["ttime"], errors='coerce').astype(float)

In [19]:
df2['ttime']

0        1.658159e+09
1        1.658159e+09
2        1.658159e+09
3        1.658160e+09
4        1.658160e+09
             ...     
20161    1.678439e+09
20162    1.678440e+09
20163    1.678442e+09
20164    1.678444e+09
20165    1.678445e+09
Name: ttime, Length: 20166, dtype: float64

In [20]:
df1['ttime']

0        1.658159e+09
1        1.658159e+09
2        1.658160e+09
3        1.658160e+09
4        1.658160e+09
             ...     
19336    1.678438e+09
19337    1.678440e+09
19338    1.678441e+09
19339    1.678443e+09
19340    1.678446e+09
Name: ttime, Length: 19341, dtype: float64

In [12]:
# Normalize the data
scaler1 = StandardScaler()
scaler2 = StandardScaler()

#df1[df1.columns[1:]] = scaler1.fit_transform(df1[df1.columns[1:]])
#df2[df2.columns[1:]] = scaler2.fit_transform(df2[df2.columns[1:]])

scaler = StandardScaler(with_mean=True, with_std=True)
scaler.n_features_in_ = 9

In [21]:
# Merge the datasets
#df = pd.merge(df1, df2, on=['latitude', 'longitude', 'date'], how='inner')
df = pd.concat([df1, df2], axis=0)
df[df.columns[1:]] = scaler1.fit_transform(df[df.columns[1:]])
# scaler1.fit(X_train)
# X.shape[1]
# print("ffddggs")
df.head(10)

Unnamed: 0,ttime,pm1,pm2,pm3,am,sm,st,lum,temp,humd,pres
0,1658159000.0,1.931896,-2.019029,-2.115044,0.857986,1.168879,-1.0335,2.86584,,,
1,1658159000.0,2.464284,-2.019029,-2.115044,1.603569,1.128451,-1.024907,2.598431,,,
2,1658160000.0,2.987165,-2.019029,-2.115044,1.230777,1.168879,-1.016314,2.580808,,,
3,1658160000.0,3.519553,-2.019029,-2.115044,1.603569,1.168879,-1.007721,2.720621,,,
4,1658160000.0,-0.996237,2.085582,-1.927497,0.857986,1.168879,-0.999128,2.776547,,,
5,1658160000.0,-0.463849,2.085582,-1.927497,1.230777,1.168879,-0.981942,2.579398,,,
6,1658161000.0,0.068539,2.085582,-1.927497,0.112403,1.168879,-0.973349,2.320684,,,
7,1658161000.0,0.600927,2.085582,-1.927497,2.349152,1.168879,-0.964755,2.213768,,,
8,1658161000.0,1.133314,2.085582,-1.927497,0.112403,1.168879,-0.956162,1.810305,,,
9,1658162000.0,1.665702,2.085582,-1.927497,0.857986,1.128451,-0.947569,1.700334,,,


In [23]:
# Fill NaN values in the 'ttime' column with an empty string
df1['ttime'] = df1['ttime'].fillna('')

# Convert the 'ttime' column to string type
df1['ttime'] = df1['ttime'].astype(str)

# Loop over each value in the 'ttime' column
for value in df1['ttime']:
    # Parse the string into a datetime object
    dt_obj = datetime.fromtimestamp(float(value))
    
    # Extract the Unix timestamp
    ttimep = dt_obj.timestamp()
    
    # Print the timestamp
    #print(ttimep)
    
# Fill NaN values in the 'ttime' column with an empty string
df2['ttime'] = df2['ttime'].fillna('')

# Convert the 'ttime' column to string type
df2['ttime'] = df2['ttime'].astype(str)

# Loop over each value in the 'ttime' column
for value in df2['ttime']:
    # Parse the string into a datetime object
    dt_obj = datetime.fromtimestamp(float(value))

    # Extract the Unix timestamp
    ttimep = dt_obj.timestamp()
    #print(ttimep)

In [25]:
df['ttime'] = df['ttime'].apply(lambda x: pd.Timestamp(x).timestamp())
    
# Split the data into training and testing sets
#X = df.drop(['sm'], axis=1)
X = df.drop(['temp','pres'], axis=1)
X = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1)
y = df['sm']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

print("Before X creation:", df.shape)
X = pd.DataFrame(X, columns=df.columns[:-1])
print("After X creation:", X.shape)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)



X_test = pd.DataFrame(X_test, columns=df.columns[:-1])
X_train = pd.DataFrame(X_train, columns=df.columns[:-1])
print("shitt")
print(X_test.columns)
print(X_train.columns)

print("Number of columns in X_train_scaled_df:", X_train_scaled.shape[1])
print("Number of columns in X_test_scaled_df:", X_test_scaled.shape[1])

# convert X_test_scaled into a dataframe
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns)




# scale X_test_scaled_df using the same scaler used for X_train_scaled
X_test_scaled_df = scaler.transform(X_test_scaled_df)

# convert X_test_scaled_df back to a numpy array
X_test_scaled = X_test_scaled_df

#X_test[X_test.columns[1:]] = scaler2.transform(X_test[X_test.columns[1:]])


Before X creation: (39507, 11)
After X creation: (39507, 10)
Shape of X_train: (27654, 10)
Shape of X_test: (11853, 10)
shitt
Index(['ttime', 'pm1', 'pm2', 'pm3', 'am', 'sm', 'st', 'lum', 'temp', 'humd'], dtype='object')
Index(['ttime', 'pm1', 'pm2', 'pm3', 'am', 'sm', 'st', 'lum', 'temp', 'humd'], dtype='object')
Number of columns in X_train_scaled_df: 10
Number of columns in X_test_scaled_df: 10


  f"X has feature names, but {self.__class__.__name__} was fitted without"


In [26]:

X_train = X_train.astype('float64')
X_test = X_test.astype('float64')
y_train = y_train.astype('float64')
y_test = y_test.astype('float64')


#dfghjklktrdxc cfgb 
# Check for missing values
X_train.isna().sum()

# Replace missing values with mean of column
X_train.fillna(X_train.mean(), inplace=True)

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fill NaN values with mean of column
df1 = df1.fillna(df1.mean())
# Fill NaN values with mean of column
df2 = df2.fillna(df2.mean())

df1 = df1.astype('float64')
df2 = df2.astype('float64')

scaler = StandardScaler()
X_test_scaled = scaler.fit_transform(X_test.iloc[:, 1:])

X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns[1:])

X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns[1:])
df.dropna(inplace=True)
print(X_test_scaled_df.isna().sum())
X_test_scaled_df = X_test_scaled_df.fillna(X_test_scaled_df.mean())

print(X_test_scaled_df.isna().sum())

print(X_train.shape)
print(X_test.shape)

scaler_train = StandardScaler()
X_train_scaled = scaler_train.fit_transform(X_train.iloc[:, 1:])

X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns[1:])

X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns[1:])

pm1        0
pm2        0
pm3        0
am         0
sm         0
st         0
lum     6030
temp       0
humd    5823
dtype: int64
pm1     0
pm2     0
pm3     0
am      0
sm      0
st      0
lum     0
temp    0
humd    0
dtype: int64
(27654, 10)
(11853, 10)




In [41]:
import lightgbm as lb
from lightgbm import LGBMRegressor

In [45]:


# Linear regression model
lr = LinearRegression()
lr_model=lr.fit(X_train_scaled_df, y_train)
lr_pred = lr.predict(X_test_scaled_df)

# Decision tree regressor model
dt = DecisionTreeRegressor(random_state=0)
dt_model=dt.fit(X_train_scaled_df, y_train)
dt_pred = dt.predict(X_test_scaled_df)

# Random forest regressor model
rf = RandomForestRegressor(n_estimators=100, random_state=0)
rf_model=rf.fit(X_train_scaled_df, y_train)
rf_pred = rf.predict(X_test_scaled_df)

xgb_r = xg.XGBRegressor(objective ='reg:linear',
                  n_estimators = 100, seed = 123)
xgb_r.fit(X_train_scaled_df, y_train)
x_pred = xgb_r.predict(X_test_scaled_df)

lbgm = LGBMRegressor(n_samples=1000, n_features=20, n_informative=15, noise=0.1, random_state=7)
lbgm.fit(X_train_scaled_df, y_train)
lb_pred = lbgm.predict(X_test_scaled_df)



In [46]:

# Ensemble model
ensemble_pred = (lr_pred + dt_pred + rf_pred +x_pred + lb_pred) / 5

# Calculate and print the performance metrics
ensemble_rmse = np.sqrt(mean_squared_error(y_test, ensemble_pred))
ensemble_r2 = r2_score(y_test, ensemble_pred)

print("Ensemble Model Performance Metrics:")
print("RMSE:", ensemble_rmse)
print("R-squared:", ensemble_r2)

# ,7813.0,,15001.0
# 'sm':[2.65]
lr_pred = lr_model.predict(X_test_scaled_df)
dt_pred = dt_model.predict(X_test_scaled_df)
rf_pred = rf_model.predict(X_test_scaled_df)
df = pd.DataFrame({'ttime': [1658193598.0], 'pm1': [3.63], 'pm2': [0.0], 'pm3':[0.0], 'am':[7463.0], 'temp':[23.4], 'lum':[2.11],'humd':[90.57], 'pres':[92849.25]})
predictions = (lr_model.predict(df)+dt_model.predict(df)+rf_model.predict(df) + xgb_r.predict(df)+lbgm.predict(df))/5
print("Predicted soil moisture:", predictions)

Ensemble Model Performance Metrics:
RMSE: 0.007414493688587446
R-squared: 0.9999450891840868
Predicted soil moisture: [5.64629967]


Feature names unseen at fit time:
- pres
- ttime
Feature names seen at fit time, yet now missing:
- sm
- st

Feature names unseen at fit time:
- pres
- ttime
Feature names seen at fit time, yet now missing:
- sm
- st

Feature names unseen at fit time:
- pres
- ttime
Feature names seen at fit time, yet now missing:
- sm
- st

