In [7]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

# Load the dataset
df_old = pd.read_csv("../data/raw/crop_and_soil_dataSet.csv")
print(df_old.head())  # Inspect the first few rows of the dataset

   Temparature  Humidity  Moisture Soil Type  Crop Type  Nitrogen  Potassium  \
0         26.0      52.0      38.0     Sandy      Maize        37          0   
1         29.0      52.0      45.0     Loamy  Sugarcane        12          0   
2         34.0      65.0      62.0     Black     Cotton         7          9   
3         32.0      62.0      34.0       Red    Tobacco        22          0   
4         28.0      54.0      46.0    Clayey      Paddy        35          0   

   Phosphorous Fertilizer Name  
0            0            Urea  
1           36             DAP  
2           30        14-35-14  
3           20           28-28  
4            0            Urea  


In [8]:
# Identify non-numeric columns
non_numeric_columns = df_old.select_dtypes(include=['object']).columns
print("Non-numeric columns:", non_numeric_columns)

# If you have categorical columns, you can either drop them or encode them
# For now, let's drop non-numeric columns for simplicity
df_numeric = df_old.drop(columns=non_numeric_columns)

Non-numeric columns: Index(['Soil Type', 'Crop Type', 'Fertilizer Name'], dtype='object')


In [10]:
# Normalize the numeric features using Min-Max scaling
scaler = MinMaxScaler(feature_range=(-1, 1))
df_scaled = scaler.fit_transform(df_numeric)

# Convert the scaled data to a DataFrame for easier handling
df_scaled = pd.DataFrame(df_scaled, columns=df_numeric.columns)

# Convert to NumPy array for training the GAN
X_train = df_scaled.to_numpy()

# Define the generator model
def build_generator(input_dim, output_dim):
    model = tf.keras.Sequential()
    model.add(layers.Dense(128, activation="relu", input_dim=input_dim))
    model.add(layers.Dense(256, activation="relu"))
    model.add(layers.Dense(512, activation="relu"))
    model.add(layers.Dense(output_dim, activation="tanh"))  # Output shape matches the number of features in the data
    return model

# Define the discriminator model
def build_discriminator(input_dim):
    model = tf.keras.Sequential()
    model.add(layers.Dense(512, activation="relu", input_dim=input_dim))
    model.add(layers.Dense(256, activation="relu"))
    model.add(layers.Dense(128, activation="relu"))
    model.add(layers.Dense(1, activation="sigmoid"))  # Output is binary: real or fake
    return model

# Compile the discriminator
discriminator = build_discriminator(X_train.shape[1])
discriminator.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Create the generator with 100 latent space dimensions and 8 features in the output
generator = build_generator(100, X_old_scaled.shape[1])  # X_old_scaled.shape[1] gives the number of features

# Generate new fake data using the trained GAN (latent space dimension is 100)
noise = np.random.normal(0, 1, (X_old_scaled.shape[0], 100))  # Generate the same number of data points as the old dataset
X_generated = generator.predict(noise)

# Define the GAN model (stack generator and discriminator)
z = layers.Input(shape=(100,))
generated_data = generator(z)
discriminator.trainable = False  # Freeze the discriminator when training the GAN
validity = discriminator(generated_data)

# Define the combined model (generator + discriminator)
gan = tf.keras.Model(z, validity)
gan.compile(loss='binary_crossentropy', optimizer='adam')

# Function to train the GAN
def train_gan(epochs, batch_size=128, save_interval=50):
    half_batch = batch_size // 2

    # Training loop
    for epoch in range(epochs):
        # Train discriminator
        idx = np.random.randint(0, X_train.shape[0], half_batch)
        real_data = X_train[idx]

        noise = np.random.normal(0, 1, (half_batch, 100))
        fake_data = generator.predict(noise)

        # Labels for real and fake data
        real_labels = np.ones((half_batch, 1))
        fake_labels = np.zeros((half_batch, 1))

        # Train the discriminator
        d_loss_real = discriminator.train_on_batch(real_data, real_labels)
        d_loss_fake = discriminator.train_on_batch(fake_data, fake_labels)
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

        # Train the generator
        noise = np.random.normal(0, 1, (batch_size, 100))
        valid_labels = np.ones((batch_size, 1))

        g_loss = gan.train_on_batch(noise, valid_labels)

        # Print the progress
        if epoch % save_interval == 0:
            print(f"{epoch} [D loss: {d_loss[0]} | D accuracy: {100 * d_loss[1]}] [G loss: {g_loss}]")
            if epoch % (save_interval * 10) == 0:
                save_generated_data(epoch)

# Function to save generated data
def save_generated_data(epoch, examples=10):
    noise = np.random.normal(0, 1, (examples, 100))
    generated_data = generator.predict(noise)

    # Scale the generated data back to [0, 1] range (inverse transformation)
    generated_data = (generated_data + 1) / 2  # Convert from [-1, 1] to [0, 1]

    plt.figure(figsize=(10, 5))
    for i in range(examples):
        plt.subplot(2, 5, i + 1)
        plt.plot(generated_data[i])  # Assume each data is 1D, you can modify this based on your data structure
        plt.axis('off')
    plt.tight_layout()
    plt.savefig(f"gan_generated_data_epoch_{epoch}.png")
    plt.close()

# Start training
epochs = 500
batch_size = 64
train_gan(epochs, batch_size)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 163ms/step




0 [D loss: 0.6709879636764526 | D accuracy: 75.0] [G loss: 0.6811626553535461]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 167ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 134ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 106ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 91ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 96ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 113ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [

In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
import tensorflow as tf

# Load the old dataset
df_old = pd.read_csv("../data/raw/crop_and_soil_dataSet.csv")
print(df_old.head())  # Inspect the first few rows

# Preprocessing the old dataset
# Identify non-numeric columns
non_numeric_columns = df_old.select_dtypes(include=['object']).columns
print("Non-numeric columns:", non_numeric_columns)

# Encode non-numeric columns using LabelEncoder
label_encoders = {}
for column in non_numeric_columns:
    encoder = LabelEncoder()
    df_old[column] = encoder.fit_transform(df_old[column])  # Encode the column
    label_encoders[column] = encoder  # Store the encoder for potential inverse transform

# Separate features (X) and labels (y)
X_old = df_old.iloc[:, :-1]  # All columns except the last column as features
y_old = df_old.iloc[:, -1]   # The last column as the label

# Normalize the old dataset
scaler = MinMaxScaler(feature_range=(-1, 1))
X_old_scaled = scaler.fit_transform(X_old)

# --- GAN Generation Section ---
# Assuming you have a trained GAN model from the previous steps

# Generate new fake data using the trained GAN (latent space dimension is 100)
noise = np.random.normal(0, 1, (X_old_scaled.shape[0], 100))  # Generate the same number of data points as the old dataset
X_generated = generator.predict(noise)

# Combine the old and generated data
X_combined = np.vstack([X_old_scaled, X_generated])
y_combined = np.hstack([y_old, y_old])  # Assuming you are labeling the generated data with the same labels

# Split the combined dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_combined, test_size=0.2, random_state=42)

# Train the Decision Tree Classifier
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Evaluate the model
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy of the DecisionTreeClassifier on the combined dataset: {accuracy:.4f}")

   Temparature  Humidity  Moisture Soil Type  Crop Type  Nitrogen  Potassium  \
0         26.0      52.0      38.0     Sandy      Maize        37          0   
1         29.0      52.0      45.0     Loamy  Sugarcane        12          0   
2         34.0      65.0      62.0     Black     Cotton         7          9   
3         32.0      62.0      34.0       Red    Tobacco        22          0   
4         28.0      54.0      46.0    Clayey      Paddy        35          0   

   Phosphorous Fertilizer Name  
0            0            Urea  
1           36             DAP  
2           30        14-35-14  
3           20           28-28  
4            0            Urea  
Non-numeric columns: Index(['Soil Type', 'Crop Type', 'Fertilizer Name'], dtype='object')
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step  


ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 8 and the array at index 1 has size 6