In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the raw dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="TARP.csv")

In [3]:
# Quick look at the raw frame: first rows, column labels, then the row count.
for preview in (df.head(), df.columns):
    print(preview)
print("Length of df is {}".format(len(df)))

   Soil Moisture  Temperature   Soil Humidity  Time  Air temperature (C)  \
0             54           22              70    21                19.52   
1             12           20              40   104                19.49   
2             34           26              35    62                19.47   
3              7           44              44    93                19.54   
4             50           38              23    92                19.61   

   Wind speed (Km/h)  Air humidity (%)  Wind gust (Km/h)  Pressure (KPa)  \
0               2.13             55.04              6.30          101.50   
1               2.01             55.17             10.46          101.50   
2               1.90             55.30             14.63          101.51   
3               2.28             54.20             16.08          101.51   
4               2.66             53.09             17.52          101.51   

         ph    rainfall     N     P     K Status  
0  6.502985  202.935536  90.0  42.0

In [4]:
# Summary statistics of the numeric columns, shown as the cell's output.
summary_stats = df.describe()
summary_stats

Unnamed: 0,Soil Moisture,Temperature,Soil Humidity,Time,Air temperature (C),Wind speed (Km/h),Air humidity (%),Wind gust (Km/h),Pressure (KPa),ph,rainfall,N,P,K
count,100000.0,100000.0,100000.0,100000.0,23995.0,23995.0,23995.0,23995.0,23995.0,2200.0,2200.0,2200.0,2200.0,2200.0
mean,45.48407,22.5361,45.01679,55.25365,24.26378,9.896898,58.521052,41.744905,101.131418,6.46948,103.463655,50.551818,53.362727,48.149091
std,25.993998,13.251352,14.726676,32.093033,6.756751,4.325666,30.073448,24.168987,0.218448,0.773938,54.958389,36.917334,32.985883,50.647931
min,1.0,0.0,20.0,0.0,11.22,0.0,0.59,0.0,100.5,3.504752,20.211267,0.0,5.0,5.0
25%,23.0,11.0,32.0,28.0,18.58,6.83,33.98,21.82,100.98,5.971693,64.551686,21.0,28.0,20.0
50%,45.0,23.0,45.0,55.0,22.15,9.53,61.31,37.24,101.12,6.425045,94.867624,37.0,51.0,32.0
75%,68.0,34.0,58.0,83.0,29.59,12.46,86.36,59.275,101.26,6.923643,124.267508,84.25,68.0,49.0
max,90.0,45.0,70.0,110.0,45.56,31.36,96.0,133.33,101.86,9.935091,298.560117,140.0,145.0,205.0


In [5]:
# Count the null entries in every column and print the tally under a header.
missing_values = df.isna().sum()
print("\n=== Missing Values per Column ===", missing_values, sep="\n")


=== Missing Values per Column ===
Soil Moisture              0
Temperature                0
 Soil Humidity             0
Time                       0
Air temperature (C)    76005
Wind speed (Km/h)      76005
Air humidity (%)       76005
Wind gust (Km/h)       76005
Pressure (KPa)         76005
ph                     97800
rainfall               97800
N                      97800
P                      97800
K                      97800
Status                     0
dtype: int64


In [6]:
def _normalise_status(raw):
    """Return 'ON' if `raw`, as stripped and upper-cased text, equals 'ON'; otherwise 'OFF'."""
    if str(raw).strip().upper() == 'ON':
        return 'ON'
    return 'OFF'


# Model inputs. Labels are spelled exactly as in the CSV header, including the
# leading space in ' Soil Humidity'.
features = [
    'Soil Moisture',
    'Temperature',
    ' Soil Humidity',
    'Air temperature (C)',
    'Air humidity (%)',
]

# Collapse every status value onto the two labels 'ON' / 'OFF'.
df['Status'] = df['Status'].map(_normalise_status)

X = df[features]
y = df['Status']

In [7]:
# Labels of the numeric-typed columns of X.
numeric_columns = X.select_dtypes(include='number').columns

In [8]:
from sklearn.impute import KNNImputer

In [9]:
# Fill the missing feature values with a k-nearest-neighbours imputer (k = 5).
# The result is rebuilt with X's own index rather than a fresh RangeIndex, so
# the rows stay label-aligned with `y`, which still carries df's index.
# NOTE(review): the imputer is fitted on every row, before the train/val/test
# split further down, so held-out rows influence the imputed training values.
# Consider fitting it on the training split only and transforming the others.
imputer = KNNImputer(n_neighbors=5)
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)

In [None]:
import lightgbm as lgb
from sklearn.base import clone
from sklearn.metrics import f1_score, make_scorer, classification_report, accuracy_score
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve, auc
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, ParameterGrid

In [None]:
# Label counts while the target still holds the 'ON' / 'OFF' strings.
print("Overall class distribution:", y.value_counts(), sep="\n")

In [None]:
# Encode the target as integers: 'ON' -> 1, anything else -> 0.
# The encoding is derived from df['Status'] instead of from `y` itself so the
# cell is idempotent: re-running `y = y.apply(...)` on the already-encoded
# series would compare integers with "ON" and turn every label into 0.
y = df['Status'].eq('ON').astype('int64')

In [None]:
# Label counts again, now on the integer-encoded target.
print("Overall class distribution:", y.value_counts(), sep="\n")

In [None]:
# Hold out 10% of the rows as the test set, stratified on the label.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Split 10% of the remaining rows off as the validation set, again stratified.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    stratify=y_train_val,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Report the (rows, columns) shape of each feature split.
split_shapes = (X_train.shape, X_val.shape, X_test.shape)
print("Train shape: {}, Validation shape: {}, Test shape: {}".format(*split_shapes))

In [None]:
# Class counts for the train, validation and test labels, in that order.
for split_labels in (y_train, y_val, y_test):
    print(split_labels.value_counts())

In [None]:
# Hyper-parameter values explored by the grid search: every combination of the
# lists below is a candidate.
param_grid = dict(
    num_leaves=[31, 63, 127],
    max_depth=[5, 8, -1],
    learning_rate=[0.01, 0.05],
    n_estimators=[100, 200, 300],
    min_child_samples=[1, 5],
    reg_alpha=[0.0, 0.1],
    reg_lambda=[0.0, 0.1],
)

In [None]:
# Base LightGBM classifier handed to the grid search; only the seed is fixed here.
base_params = {'random_state': 69}
lgbm_clf = lgb.LGBMClassifier(**base_params)

In [None]:
# Five shuffled, stratified folds with a fixed seed for the cross-validation.
cv = StratifiedKFold(5, shuffle=True, random_state=69)

In [None]:
# Exhaustive search over `param_grid`, scored by accuracy on the `cv` folds and
# run on all available cores.
search_options = dict(scoring='accuracy', cv=cv, verbose=1, n_jobs=-1)
grid_search = GridSearchCV(lgbm_clf, param_grid, **search_options)

In [None]:
# Install tqdm_joblib into the kernel's environment; the next cells import it
# to show a progress bar while the grid search runs.
# NOTE(review): the version is not pinned — consider pinning it for reproducibility.
%pip install tqdm_joblib

In [None]:
from tqdm import tqdm
from tqdm_joblib import tqdm_joblib

In [None]:
# Progress-bar length: one fit per (parameter combination, CV fold) pair.
n_candidates = len(ParameterGrid(param_grid))
n_total = n_candidates * cv.get_n_splits()

In [None]:
# Run the grid search on the training split with a progress bar attached.
progress_bar = tqdm(total=n_total, desc="Grid Search")
with tqdm_joblib(progress_bar):
    grid_search.fit(X_train, y_train)

In [None]:
# Report the winning parameter combination and its cross-validation score.
for caption, value in (
    ("Best Hyperparameters:", grid_search.best_params_),
    ("Best Cross-Validation Score:", grid_search.best_score_),
):
    print(caption, value)

In [None]:
# Score the best estimator found by the grid search on the validation split.
best_model = grid_search.best_estimator_
y_val_pred = best_model.predict(X_val)

val_accuracy = accuracy_score(y_val, y_val_pred)
val_report = classification_report(y_val, y_val_pred)

print("\nValidation Set Evaluation:")
print("Accuracy:", val_accuracy)
print(val_report)

In [None]:
# Score the same best estimator on the held-out test split.
y_test_pred = best_model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_test_pred)
test_report = classification_report(y_test, y_test_pred)

print("\nTest Set Evaluation:")
print("Accuracy:", test_accuracy)
print(test_report)

In [None]:
# Retrain the winning configuration on train + validation rows.
X_final_train = pd.concat([X_train, X_val])
y_final_train = pd.concat([y_train, y_val])

# Fit a fresh clone (same hyper-parameters, unfitted) rather than calling
# best_model.fit(...): fit() refits the estimator in place and returns it, so
# best_model / grid_search.best_estimator_ would silently become the model that
# has seen the validation rows, and re-running the validation cell would then
# report optimistic scores.
final_model = clone(best_model).fit(X_final_train, y_final_train)

In [None]:
# Score the retrained final model on the held-out test split.
y_test_pred = final_model.predict(X_test)

final_accuracy = accuracy_score(y_test, y_test_pred)
final_report = classification_report(y_test, y_test_pred)

print("\nTest Set Evaluation:")
print("Accuracy:", final_accuracy)
print(final_report)

In [None]:
# ROC AUC on the test split, computed from column 1 of predict_proba.
test_probabilities = final_model.predict_proba(X_test)
y_test_prob = test_probabilities[:, 1]
roc_auc = roc_auc_score(y_test, y_test_prob)
print("ROC AUC Score: {:.4f}".format(roc_auc))

In [None]:
# Curve points for the ROC and precision-recall plots, plus the area under the
# precision-recall curve (recall on the x axis, precision on the y axis).
fpr, tpr, thresholds_roc = roc_curve(y_test, y_test_prob)
precision, recall, thresholds_pr = precision_recall_curve(y_test, y_test_prob)
pr_auc = auc(x=recall, y=precision)
print("Precision-Recall AUC: {:.4f}".format(pr_auc))

In [None]:
# Draw the ROC and precision-recall curves side by side.
# Both panels are built in ONE cell: the inline backend renders and closes the
# current figure at the end of every cell, so creating the second subplot in a
# separate cell would put it on a new, otherwise empty figure.
fig, (ax_roc, ax_pr) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: ROC curve with the chance diagonal for reference.
ax_roc.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.4f})')
ax_roc.plot([0, 1], [0, 1], 'k--', label='Random Guess')
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('Receiver Operating Characteristic (ROC)')
ax_roc.legend(loc='lower right')

# Right panel: precision-recall curve.
ax_pr.plot(recall, precision, label=f'PR Curve (AUC = {pr_auc:.4f})')
ax_pr.set_xlabel('Recall')
ax_pr.set_ylabel('Precision')
ax_pr.set_title('Precision-Recall Curve')
ax_pr.legend(loc='upper right')

fig.tight_layout()
plt.show()

In [None]:
import joblib
import os

In [None]:
# Persist the final model to disk; the cell output shows joblib.dump's return value.
model_filename = "final_model.pkl"
joblib.dump(value=final_model, filename=model_filename)

In [None]:
# Report the size of the saved model file in bytes and in KB (bytes / 1024).
model_size_bytes = os.stat(model_filename).st_size
model_size_kb = model_size_bytes / 1024
print("Model size: {} bytes ({:.2f} KB)".format(model_size_bytes, model_size_kb))