# Data Collection

In [None]:
import pandas as pd
# Load the dataset from an Excel workbook into a DataFrame.
# The path is relative, so the workbook must sit in the current working directory.
data = pd.read_excel('Data Visualization Technique in Smart Agriculture.xlsx')

# Data Exploration

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple
data.shape

In [None]:
# Column labels of the dataset
data.columns

In [None]:
# Preview the first rows (head() returns 5 rows by default)
data.head()

In [None]:
# Preview the last rows (tail() returns 5 rows by default)
data.tail()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max). By default
# describe() reports only the numeric columns, so string columns are left out.
data.describe()

In [None]:
# Per-column overview: non-null counts and dtypes, plus overall memory usage
data.info()

In [None]:
# Number of distinct values in each column (the counts, not the values themselves)
data.nunique()

In [None]:
# The distinct values of a single column can be listed separately
data['Water Required'].unique()

# Data Cleaning

In [None]:
# Count the missing (null/NaN) values in each column
data.isnull().sum()

In [None]:
# Columns treated as redundant/duplicate information; they are removed from
# the working copy while `data` itself is left untouched.
redundant_columns = ['Soil Moisture Raw Value', 'N_For_User', 'P_For_User', 'K_For_User']
clean_data = data.drop(columns=redundant_columns)

# Confirm which columns remain
clean_data.columns

# Summary Statistics

In [None]:
# Average of every numeric column. numeric_only=True is required because the
# frame still holds text columns at this point (e.g. 'Data Added' is parsed
# from strings only later), and pandas >= 2.0 raises a TypeError when asked
# to average non-numeric data.
clean_data.mean(numeric_only=True)

In [None]:
# Median (middle value) of every numeric column. numeric_only=True skips the
# text columns, which would otherwise make pandas >= 2.0 raise a TypeError.
clean_data.median(numeric_only=True)

In [None]:
# Most frequent value of each column. mode() returns a DataFrame because a
# column can have several equally frequent values (one row per tie).
clean_data.mode()

# Data Visualization

In [None]:
# Histogram
import matplotlib.pyplot as plt
import pandas as pd

# Parse the 'Data Added' strings (day-month abbreviation-year, 12-hour clock
# with AM/PM) into datetime values and store them back on the frame.
added_at = pd.to_datetime(clean_data['Data Added'], format='%d-%b-%Y %I:%M %p')
clean_data['Data Added'] = added_at

# Monthly period of each reading; used as the grouping key for aggregation
clean_data['YearMonth'] = added_at.dt.to_period('M')

# Bar chart of a numeric feature averaged per month (one bar per 'YearMonth')
def plot_histogram_with_date(data, feature, title, xlabel, ylabel='Frequency'):
    """Plot the monthly mean of a numeric column as a bar chart.

    Args:
        data: DataFrame containing a 'YearMonth' column and `feature`.
        feature: Name of the numeric column to average within each month.
        title: Chart title.
        xlabel: Label for the x axis.
        ylabel: Label for the y axis (defaults to 'Frequency').

    The figure is displayed with plt.show(); nothing is returned.
    """
    monthly_means = data.groupby(['YearMonth'])[feature].mean()
    monthly_means.plot(kind='bar', figsize=(12, 6))

    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    # Period labels are long; tilt them so they do not overlap
    plt.xticks(rotation=45)
    plt.show()

# Stacked bar chart of a categorical feature's monthly value counts.
# value_counts() yields a Series indexed by (YearMonth, category); unstack()
# pivots the categories into columns so each becomes one stacked segment.
def plot_histogram_with_date_categorical(data, feature, title, xlabel, ylabel='Frequency'):
    """Plot how often each category of `feature` occurs in each month.

    Args:
        data: DataFrame containing a 'YearMonth' column and `feature`.
        feature: Name of the categorical column to count; also the legend title.
        title: Chart title.
        xlabel: Label for the x axis.
        ylabel: Label for the y axis (defaults to 'Frequency').

    The figure is displayed with plt.show(); nothing is returned.
    """
    counts_by_month = data.groupby(['YearMonth'])[feature].value_counts()
    category_table = counts_by_month.unstack()
    category_table.plot(kind='bar', stacked=True, figsize=(12, 6))

    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.xticks(rotation=45)
    plt.legend(title=feature)
    plt.show()

# Monthly-average charts for the numeric features.
# Each entry pairs a column name with the wording used in its title and y label.
numeric_feature_labels = [
    ('Heat Index', 'Heat Index'),
    ('Soil Moisture', 'Soil Moisture'),
    ('Soil Temp', 'Soil Temperature'),
    ('Nitrogen', 'Nitrogen Level'),
    ('Phosphorus', 'Phosphorus Level'),
    ('Potassium', 'Potassium Level'),
]
for column, wording in numeric_feature_labels:
    plot_histogram_with_date(
        clean_data,
        column,
        f"Average {wording} Over Time",
        "Year-Month",
        f"Average {wording}",
    )

# Monthly category counts for the categorical features
for column in ('Salinity', 'Water Required'):
    plot_histogram_with_date_categorical(
        clean_data, column, f"{column} Over Time", "Year-Month", "Frequency"
    )

In [None]:
import matplotlib.pyplot as plt

# Scatter plots of related sensor readings, one figure per (x, y) column pair.
# Each pair is drawn exactly once so that every point is visible and the
# legend carries a single, accurate entry.
scatter_pairs = [
    ('Temperature', 'Humidity'),         # expected to look inversely proportional
    ('Temperature', 'API Temperature'),  # expected to look directly proportional
    ('Temperature', 'Heat Index'),       # heat index is expected to rise with temperature
    ('Soil Moisture', 'Soil Temp'),
]

for x_col, y_col in scatter_pairs:
    plt.scatter(
        clean_data[x_col],
        clean_data[y_col],
        color='blue',
        marker='o',
        label=f'{y_col} vs. {x_col}',
    )
    plt.title(f'{x_col} vs. {y_col}')
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.legend()
    plt.show()

In [None]:
# bar chart
import matplotlib.pyplot as plt
import pandas as pd

# Make sure 'Data Added' holds datetimes (parsed with the day-month-year,
# 12-hour AM/PM format) before deriving anything from it.
timestamps = pd.to_datetime(clean_data['Data Added'], format='%d-%b-%Y %I:%M %p')
clean_data['Data Added'] = timestamps

# Year-month period used as the aggregation key
clean_data['YearMonth'] = timestamps.dt.to_period('M')

# Stacked bar chart: per-month counts of each category of a categorical column
def plot_bar_chart_with_date_categorical(data, feature, title, xlabel, ylabel='Frequency'):
    """Draw a stacked bar chart of monthly category counts for `feature`.

    Args:
        data: DataFrame containing a 'YearMonth' column and `feature`.
        feature: Categorical column to count; also used as the legend title.
        title: Chart title.
        xlabel: Label for the x axis.
        ylabel: Label for the y axis (defaults to 'Frequency').

    The figure is displayed with plt.show(); nothing is returned.
    """
    per_month = data.groupby(['YearMonth'])[feature]
    # Rows are months, columns are categories
    stacked_counts = per_month.value_counts().unstack()
    stacked_counts.plot(kind='bar', stacked=True, figsize=(12, 6))

    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.xticks(rotation=45)
    plt.legend(title=feature)
    plt.show()

# One stacked bar chart per categorical feature
for categorical_feature in ('Salinity', 'Water Required'):
    plot_bar_chart_with_date_categorical(
        clean_data,
        categorical_feature,
        f"{categorical_feature} Over Time",
        "Year-Month",
        "Frequency",
    )


# Correlation Analysis

In [None]:
import seaborn as sns

# Keep only the float64/int64 columns; corr() is computed on these
numeric_dtypes = ['float64', 'int64']
numeric_columns = clean_data.select_dtypes(include=numeric_dtypes)

# Pairwise correlation between the selected columns
correlation_matrix = numeric_columns.corr()

# Annotated heatmap of the correlation matrix
sns.heatmap(data=correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Correlation restricted to a hand-picked set of sensor and nutrient columns
correlation_features = [
    'Humidity', 'API Humidity', 'Temperature', 'API Temperature', 'Heat Index',
    'Soil Moisture', 'Soil Temp', 'Nitrogen', 'Phosphorus', 'Potassium',
    'TDS', 'E_D', 'Soil_EC Raw',
]
correlation_matrix = clean_data[correlation_features].corr()

# Annotated heatmap of the correlation matrix
sns.heatmap(data=correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# Outlier Detection

In [None]:
import pandas as pd
from scipy.stats import zscore

# Z-score outlier detection. A reading is flagged when its Z-score is greater
# than 3 or less than -3; these cutoffs are commonly used thresholds for
# identifying extreme outliers.
Z_THRESHOLD = 3
outlier_features = ['Heat Index', 'Soil Moisture', 'Soil Temp', 'Nitrogen', 'Phosphorus', 'Potassium']

for position, feature in enumerate(outlier_features):
    if position:
        print()  # blank line between consecutive reports

    # assign() attaches the scores to a temporary copy, so no helper 'z_score'
    # column is left behind in clean_data where it could be mistaken for a
    # real measurement by later numeric-column selections.
    scored = clean_data.assign(z_score=zscore(clean_data[feature]))
    outliers = scored[scored['z_score'].abs() > Z_THRESHOLD]

    print(f"Outliers of {feature}:")
    print(outliers)

# Feature Selection

In [None]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np

# Encode the categorical columns as integer codes, in place.
# NOTE: the same encoder object is re-fitted for each column, so after these
# lines `label_encoder` only remembers the classes of 'Salinity'.
label_encoder = LabelEncoder()
clean_data['Water Required'] = label_encoder.fit_transform(clean_data['Water Required'])
clean_data['Salinity'] = label_encoder.fit_transform(clean_data['Salinity'])

# Keep numeric columns only (this leaves out dates and other non-numeric columns)
numeric_cols = clean_data.select_dtypes(include=[np.number]).columns.tolist()

# Features (X) are the numeric columns minus the target (y). A 'z_score'
# helper column left over from outlier detection is not a measurement, so it
# is excluded as well; errors='ignore' keeps this working when it is absent.
X = clean_data[numeric_cols].drop(columns=['Water Required', 'z_score'], errors='ignore')
y = clean_data['Water Required']

# Mutual information between each feature and the target. The estimator is
# randomised, so the seed is fixed to make the ranking reproducible.
importance = mutual_info_classif(X, y, random_state=42)

# Series indexed by feature name, highest score first
feat_importance = pd.Series(importance, index=X.columns).sort_values(ascending=False)

# Horizontal bar plot of the scores
feat_importance.plot(kind='barh', color='teal')
plt.xlabel('Mutual information with Water Required')
plt.title('Feature Importance (Mutual Information)')
plt.show()

In [None]:
def determine_crop_health_cotton(row):
    """Classify one reading as 'Healthy' or 'Unhealthy' for a cotton crop.

    Eight rules are evaluated and the reading is 'Healthy' when at least
    three of them hold:

    * Temperature in [20, 32]
    * Humidity in [50, 64]
    * Soil Moisture in [40, 70]
    * Soil Temp in [30, 60]
    * Salinity equal to the string 'low'
    * Phosphorus in [20, 65]
    * Potassium in [80, 180]
    * Nitrogen in [10, 50]

    All range bounds are inclusive.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) indexed by the column
            names listed above.

    Returns:
        'Healthy' or 'Unhealthy'.

    NOTE(review): the Salinity rule only matches the text label 'low'; if the
    column holds integer codes when this is applied, the rule never counts.
    """
    climate_and_soil_ranges = (
        ('Temperature', 20, 32),
        ('Humidity', 50, 64),
        ('Soil Moisture', 40, 70),
        ('Soil Temp', 30, 60),
    )
    nutrient_ranges = (
        ('Phosphorus', 20, 65),
        ('Potassium', 80, 180),
        ('Nitrogen', 10, 50),
    )
    required_rules = 3

    satisfied = sum(
        1 for column, low, high in climate_and_soil_ranges if low <= row[column] <= high
    )
    satisfied += 1 if row['Salinity'] == 'low' else 0
    satisfied += sum(
        1 for column, low, high in nutrient_ranges if low <= row[column] <= high
    )

    return 'Healthy' if satisfied >= required_rules else 'Unhealthy'

# 'Data Added' as datetimes (format: day-month abbreviation-year, 12-hour time with AM/PM)
recorded_at = pd.to_datetime(clean_data['Data Added'], format='%d-%b-%Y %I:%M %p')
clean_data['Data Added'] = recorded_at

# Month of each reading, used to aggregate the charts below
clean_data['YearMonth'] = recorded_at.dt.to_period('M')

# Stacked bar chart of monthly category counts for a categorical column
def plot_bar_chart_with_date_categorical(data, feature, title, xlabel, ylabel='Frequency'):
    """Show, month by month, how many rows fall into each category of `feature`.

    Args:
        data: DataFrame containing a 'YearMonth' column and `feature`.
        feature: Categorical column to count; also used as the legend title.
        title: Chart title.
        xlabel: Label for the x axis.
        ylabel: Label for the y axis (defaults to 'Frequency').

    The figure is displayed with plt.show(); nothing is returned.
    """
    category_counts = data.groupby(['YearMonth'])[feature].value_counts()
    # Pivot categories into columns: one stacked segment per category
    table = category_counts.unstack()
    table.plot(kind='bar', stacked=True, figsize=(12, 6))

    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.xticks(rotation=45)
    plt.legend(title=feature)
    plt.show()

# determine_crop_health_cotton compares Salinity with the text label 'low',
# but by this point the column holds LabelEncoder integer codes, so that rule
# could never match. Decode the codes back to their labels on a temporary
# copy; clean_data itself keeps the numeric codes that the models need.
health_input = clean_data
if pd.api.types.is_numeric_dtype(clean_data['Salinity']):
    # label_encoder was last fitted on the 'Salinity' column
    salinity_labels = label_encoder.inverse_transform(clean_data['Salinity'])
    health_input = clean_data.assign(Salinity=salinity_labels)

# Create the 'Crop Health' column from the rule-based classifier
clean_data['Crop Health'] = health_input.apply(determine_crop_health_cotton, axis=1)

# Plot bar chart for healthy vs unhealthy crops with date aggregation
plot_bar_chart_with_date_categorical(clean_data, 'Crop Health', "Cotton Crop Health Over Time", "Year-Month", "Frequency")

# Algorithms 

In [None]:
# Predictors and target for the 'Water Required' models
water_features = [
    'Humidity', 'Temperature', 'Heat Index', 'Soil Moisture', 'Soil Temp',
    'TDS', 'ETo', 'ETc', 'E_D', 'Soil_EC Raw',
]
x = clean_data[water_features]
y = clean_data['Water Required']

# Predictors for the 'Crop Health' models (the same columns the health rules read)
crop_features = [
    'Temperature', 'Humidity', 'Soil Moisture', 'Soil Temp',
    'Salinity', 'Nitrogen', 'Phosphorus', 'Potassium',
]
x_crop = clean_data[crop_features]

# 1. KNN

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Hold out 20% of the rows for testing; the seed makes the split repeatable
X_train_wr, X_test_wr, y_train_wr, y_test_wr = train_test_split(
    x, y, random_state=42, test_size=0.2
)

# k-nearest neighbours with k=5: each new point takes the majority label of
# its 5 closest training points. fit() returns the fitted estimator itself.
knn_wr = KNeighborsClassifier(n_neighbors=5).fit(X_train_wr, y_train_wr)

# Predicted labels for the held-out rows
y_pred_wr = knn_wr.predict(X_test_wr)

# Overall accuracy plus per-class precision/recall/F1
accuracy_wr = accuracy_score(y_true=y_test_wr, y_pred=y_pred_wr)
report_wr = classification_report(y_true=y_test_wr, y_pred=y_pred_wr)

print(f'Accuracy for Water Required prediction: {accuracy_wr}')
print(f'Classification Report for Water Required prediction:\n{report_wr}')

# Reading the report (illustrated with a generic spam-filter example):
#   True Positive  (TP): predicted spam, actually spam.
#   False Positive (FP): predicted spam, actually not spam (ham).
#   False Negative (FN): predicted not spam (ham), actually spam.
#   True Negative  (TN): predicted not spam (ham), actually not spam.
#
# Precision = TP / (TP + FP): of everything predicted positive, the share that
# really is positive; it reflects how well false positives are avoided.
# Example: a precision of 75% means 75% of the emails flagged as spam were spam.
#
# Recall (sensitivity) = TP / (TP + FN): of everything actually positive, the
# share that was found; it reflects how well positives are captured.
# Example: a recall of 100% means every actual spam email was flagged.

In [None]:
# Target for the crop-health models
y_crop_health = clean_data['Crop Health']

# 80/20 train/test split with a fixed seed
X_train_ch, X_test_ch, y_train_ch, y_test_ch = train_test_split(
    x_crop, y_crop_health, random_state=42, test_size=0.2
)

# 5-nearest-neighbours classifier, fitted on the training rows
knn_ch = KNeighborsClassifier(n_neighbors=5).fit(X_train_ch, y_train_ch)

# Predicted labels for the held-out rows
y_pred_ch = knn_ch.predict(X_test_ch)

# Accuracy and per-class report on the test set
accuracy_ch = accuracy_score(y_true=y_test_ch, y_pred=y_pred_ch)
report_ch = classification_report(y_true=y_test_ch, y_pred=y_pred_ch)

print(f'Accuracy for Crop Health prediction: {accuracy_ch}')
print(f'Classification Report for Crop Health prediction:\n{report_ch}')

# 2. Naive Bayes Classifier

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Target for the Water Required models
y_water_required = clean_data['Water Required']

# 80/20 train/test split with a fixed seed
X_train_wr, X_test_wr, y_train_wr, y_test_wr = train_test_split(
    x, y_water_required, random_state=42, test_size=0.2
)

# Gaussian Naive Bayes, fitted on the training rows (fit() returns the estimator)
nb_wr = GaussianNB().fit(X_train_wr, y_train_wr)

# Predicted labels for the held-out rows
y_pred_wr_nb = nb_wr.predict(X_test_wr)

# Confusion matrix, accuracy and per-class report on the test set
confusionMatrix_wr_nb = confusion_matrix(y_true=y_test_wr, y_pred=y_pred_wr_nb)
accuracy_wr_nb = accuracy_score(y_true=y_test_wr, y_pred=y_pred_wr_nb)
report_wr_nb = classification_report(y_true=y_test_wr, y_pred=y_pred_wr_nb)

print(f'confusionMatrix for Water Required prediction (Naive Bayes):\n {confusionMatrix_wr_nb}')
print(f'Accuracy for Water Required prediction (Naive Bayes): {accuracy_wr_nb}')
print(f'Classification Report for Water Required prediction (Naive Bayes):\n{report_wr_nb}')

# Reading the confusion matrix (rows = actual class, columns = predicted class),
# taking class 1 as "positive":
#   true positives  - correctly predicted as class 1
#   false positives - incorrectly predicted as class 1
#   false negatives - incorrectly predicted as class 0
#   true negatives  - correctly predicted as class 0
# The author's notes for one run list 451 true positives, 0 false positives,
# 0 false negatives and 149 true negatives; these figures are not recomputed here.

In [None]:
# Target for the crop-health models
y_crop_health = clean_data['Crop Health']

# 80/20 train/test split with a fixed seed
X_train_ch, X_test_ch, y_train_ch, y_test_ch = train_test_split(
    x_crop, y_crop_health, random_state=42, test_size=0.2
)

# Gaussian Naive Bayes, fitted on the training rows
nb_ch = GaussianNB().fit(X_train_ch, y_train_ch)

# Predicted labels for the held-out rows
y_pred_ch_nb = nb_ch.predict(X_test_ch)

# Confusion matrix, accuracy and per-class report on the test set
confusionMatrix_ch_nb = confusion_matrix(y_true=y_test_ch, y_pred=y_pred_ch_nb)
accuracy_ch_nb = accuracy_score(y_true=y_test_ch, y_pred=y_pred_ch_nb)
report_ch_nb = classification_report(y_true=y_test_ch, y_pred=y_pred_ch_nb)

print(f'confusionMatrix for Crop Health prediction (Naive Bayes):\n {confusionMatrix_ch_nb}')
print(f'Accuracy for Crop Health prediction (Naive Bayes): {accuracy_ch_nb}')
print(f'Classification Report for Crop Health prediction (Naive Bayes):\n{report_ch_nb}')

# 3. Support Vector Machines

In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
# accuracy_score is used below, so it is imported here rather than relying on
# an earlier cell having been run.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# NOTE(review): StandardScaler is imported but the features are passed to the
# SVM unscaled - confirm whether scaling was intended.

# Split the dataset into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(x, y_water_required, test_size=0.2, random_state=42)

# Initialize and train the SVM model with a radial basis function (RBF) kernel
svm_model = SVC(kernel='rbf')
svm_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = svm_model.predict(X_test)

# Evaluate the model's performance
conf_matrix_svm = confusion_matrix(y_test, y_pred)
accuracy_svm = accuracy_score(y_test, y_pred)
report_svm = classification_report(y_test, y_pred)

print(f'Confusion Matrix for Water Required prediction (SVM):\n {conf_matrix_svm}')
print(f'Accuracy for Water Required prediction (SVM): {accuracy_svm:.2f}')
print(f'Classification Report for Water Required prediction (SVM):\n{report_svm}')


In [None]:
# Separate features and target variable for Crop Health
y_crop_health = clean_data['Crop Health']

# Split the data into training and testing sets (80/20, fixed seed)
X_train_ch, X_test_ch, y_train_ch, y_test_ch = train_test_split(x_crop, y_crop_health, test_size=0.2, random_state=42)

# Train an RBF-kernel SVM on the crop-health split. Dedicated names are used
# so the Water Required model, its predictions and its metrics stay intact.
# NOTE(review): SVC needs at least two classes in y_train_ch - confirm that
# 'Crop Health' contains both 'Healthy' and 'Unhealthy' rows.
svm_model_ch = SVC(kernel='rbf')
svm_model_ch.fit(X_train_ch, y_train_ch)

# Make predictions on the crop-health test set
y_pred_ch_svm = svm_model_ch.predict(X_test_ch)

# Evaluate against the crop-health test labels
conf_matrix_ch_svm = confusion_matrix(y_test_ch, y_pred_ch_svm)
accuracy_ch_svm = accuracy_score(y_test_ch, y_pred_ch_svm)
report_ch_svm = classification_report(y_test_ch, y_pred_ch_svm)

# Report the crop-health metrics computed above
print(f'Confusion Matrix for Crop Health prediction (SVM):\n {conf_matrix_ch_svm}')
print(f'Accuracy for Crop Health prediction (SVM): {accuracy_ch_svm:.2f}')
print(f'Classification Report for Crop Health prediction (SVM):\n{report_ch_svm}')

# Visualization Techniques

# 1. Best Model (Bar Chart)

In [None]:
import matplotlib.pyplot as plt

# Water Required accuracies of the three classifiers; accuracy_svm already
# holds the SVM value, so only the KNN and Naive Bayes aliases are bound here.
accuracy_knn, accuracy_nb = accuracy_wr, accuracy_wr_nb

# Bar labels and heights, in matching order
models = ['KNN', 'Naive Bayes', 'SVM']
accuracies = [accuracy_knn, accuracy_nb, accuracy_svm]
bar_colors = ['blue', 'green', 'red']

# One bar per model; accuracy is a fraction, so the y axis spans 0 to 1
plt.figure(figsize=(10, 6))
plt.bar(models, accuracies, color=bar_colors)
plt.title('Model Accuracy Comparison for Water Required')
plt.ylabel('Accuracy')
plt.ylim(0, 1)
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Crop Health accuracies of the three classifiers. The *_ch values are used
# directly: rebinding accuracy_svm here would overwrite the Water Required
# SVM accuracy that later charts still read.
models = ['KNN', 'Naive Bayes', 'SVM']
accuracies = [accuracy_ch, accuracy_ch_nb, accuracy_ch_svm]

# Create the bar chart; accuracy is a fraction, so the y axis spans 0 to 1
plt.figure(figsize=(10, 6))
plt.bar(models, accuracies, color=['blue', 'green', 'red'])
plt.ylim(0, 1)
plt.ylabel('Accuracy')
plt.title('Model Accuracy Comparison for Crop Health')
plt.show()

# 2. Line Chart

In [None]:
import matplotlib.pyplot as plt

# Model names and their Water Required accuracies, in matching order.
# NOTE(review): accuracy_svm must still hold the Water Required SVM accuracy
# when this cell runs - confirm no earlier cell has rebound it.
models = ['KNN', 'Naive Bayes', 'SVM']
accuracies = [accuracy_wr, accuracy_wr_nb, accuracy_svm]
accuracy_scores = dict(zip(models, accuracies))

# Line chart with one marker per model
plt.figure(figsize=(10, 6))
plt.plot(models, accuracies, color='b', linestyle='-', marker='o', label='Accuracy')

plt.title('Accuracy Comparison of Models for Water Required')
plt.xlabel('Models')
plt.ylabel('Accuracy')
plt.ylim(0, 1)
plt.grid(True)
plt.legend()
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Model names and their Crop Health accuracies, in matching order
models = ['KNN', 'Naive Bayes', 'SVM']
accuracies = [accuracy_ch, accuracy_ch_nb, accuracy_ch_svm]
accuracy_scores = dict(zip(models, accuracies))

# Line chart with one marker per model
plt.figure(figsize=(10, 6))
plt.plot(models, accuracies, color='b', linestyle='-', marker='o', label='Accuracy')

plt.title('Accuracy Comparison of Models for Crop Health')
plt.xlabel('Models')
plt.ylabel('Accuracy')
plt.ylim(0, 1)
plt.grid(True)
plt.legend()

plt.show()

# 3. Pie Chart

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Distinct predicted labels for Water Required and how often each occurs
water_required_labels, water_required_counts = np.unique(y_pred_wr, return_counts=True)

# Share of each predicted label as a pie chart
plt.figure(figsize=(8, 6))
plt.pie(
    water_required_counts,
    labels=water_required_labels,
    autopct='%1.1f%%',
    startangle=140,
)
plt.title('Water Required Prediction')
plt.axis('equal')  # equal aspect ratio: the pie is drawn as a circle, not an ellipse

# Caption giving the date range; the first two arguments are its x/y position on the plot
date_range_note = 'Data from 11 Feb 2024 to 25 Aug 2024'
plt.text(-1.5, 1.2, date_range_note, fontsize=12, color='gray')

plt.show()

In [None]:
# Distinct predicted labels for Crop Health and how often each occurs
crop_health_labels, crop_health_counts = np.unique(y_pred_ch, return_counts=True)

# Share of each predicted label as a pie chart
plt.figure(figsize=(8, 6))
plt.pie(
    crop_health_counts,
    labels=crop_health_labels,
    autopct='%1.1f%%',
    startangle=140,
)
plt.title('Crop Health Prediction')
plt.axis('equal')  # keep the pie circular

# Caption giving the date range, positioned at plot coordinates (-1.5, 1.2)
date_range_note = 'Data from 11 Feb 2024 to 25 Aug 2024'
plt.text(-1.5, 1.2, date_range_note, fontsize=12, color='gray')

plt.show()

# 4. 3D Visualization

In [None]:
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure(figsize=(10, 7))
# A single 3D subplot (grid of 1 row x 1 column, first position)
ax = fig.add_subplot(1, 1, 1, projection='3d')

# One point per dataset row, positioned by temperature, humidity and soil
# moisture and coloured by its 'Water Required' value.
sc = ax.scatter(
    clean_data['Temperature'],
    clean_data['Humidity'],
    clean_data['Soil Moisture'],
    c=clean_data['Water Required'],
    cmap='viridis',
    marker='o',
)
ax.set_xlabel('Temperature')
ax.set_ylabel('Humidity')
ax.set_zlabel('Soil Moisture')
plt.colorbar(sc, label='Water Required')
plt.title('3D Scatter Plot of Temperature, Humidity, and Soil Moisture')
plt.show()

# How to read the plot:
# - Each point is one data entry; its position comes from the temperature,
#   humidity and soil moisture values.
# - The viridis colour map runs from purple (low values) to yellow (high
#   values) of 'Water Required'. That column was label-encoded earlier, so
#   the colours correspond to its integer class codes.
# - The colouring helps show how the water requirement changes with
#   temperature, humidity and soil moisture.

In [None]:
# Axes of the surface: temperature (x), humidity (y), soil moisture (z)
X, Y, Z = (clean_data[column] for column in ('Temperature', 'Humidity', 'Soil Moisture'))

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(1, 1, 1, projection='3d')

# Triangulated surface through the (X, Y, Z) points, coloured with viridis
surf = ax.plot_trisurf(X, Y, Z, cmap='viridis', edgecolor='none')
ax.set_xlabel('Temperature')
ax.set_ylabel('Humidity')
ax.set_zlabel('Soil Moisture')
fig.colorbar(surf, ax=ax, shrink=0.5, aspect=5)
plt.title('3D Surface Plot of Temperature, Humidity, and Soil Moisture')
plt.show()

# The surface shows how soil moisture varies with temperature and humidity.
#
# For instance, if the yellow areas are concentrated at lower temperatures
# and higher humidity, it suggests that soil moisture is higher in those
# conditions.

# 5. Area Chart

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# The area chart below is drawn from the 'Water Required' column of
# clean_data. No random placeholder predictions are generated here: binding
# them to y_pred_wr would overwrite the classifier's real predictions while
# never being plotted.

# Convert the 'Data Added' column to datetime
clean_data['Data Added'] = pd.to_datetime(clean_data['Data Added'], format='%d-%b-%Y %I:%M %p')

# Extract year and month for aggregation
clean_data['YearMonth'] = clean_data['Data Added'].dt.to_period('M')

# Stacked area chart of monthly category counts for a categorical column
def plot_area_chart_with_date_categorical(data, feature, title, xlabel, ylabel='Count'):
    """Draw a stacked area chart of how often each category occurs per month.

    Args:
        data: DataFrame containing a 'YearMonth' column and `feature`.
        feature: Categorical column to count; also used as the legend title.
        title: Chart title.
        xlabel: Label for the x axis.
        ylabel: Label for the y axis (defaults to 'Count').

    The figure is displayed with plt.show(); nothing is returned.
    """
    counts_by_month = data.groupby(['YearMonth'])[feature].value_counts()
    # Months in which a category never occurs get 0 instead of NaN
    area_table = counts_by_month.unstack().fillna(0)
    area_table.plot(kind='area', stacked=True, figsize=(10, 6))

    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.xticks(rotation=45)
    plt.legend(title=feature)
    plt.show()

# Stacked area chart of the monthly counts of each 'Water Required' value.
# The values plotted come from the clean_data column itself.
plot_area_chart_with_date_categorical(clean_data, 'Water Required', "Water Required Prediction Over Time", "Year-Month", "Count")


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# The chart is drawn from the 'Crop Health' column of clean_data. No random
# placeholder predictions are generated here: binding them to y_pred_ch (and
# recounting their labels) would overwrite the classifier's real predictions
# while never being plotted.

# Plot area chart for crop health with date aggregation
plot_area_chart_with_date_categorical(clean_data, 'Crop Health', "Crop Health Prediction Over Time", "Year-Month", "Count")
