In [3]:
import pandas as pd
import numpy as np
from sklearn.utils import resample

# Load the dataset
df = pd.read_csv(r'D:\Internship 4 week\Week_1\project_week1\crop_prediction_balanced_100k.csv')

# Check for null values
print("Null values before cleaning:")
print(df.isnull().sum())

# Remove the row with negative yield
df = df[df['Yield_tons_per_hectare'] >= 0]

# Check for any remaining null values
df = df.dropna()

# Check class distribution
print("\nCrop distribution:")
print(df['Crop'].value_counts())

# Balance the dataset by upsampling minority classes
# Separate majority and minority classes
df_majority = df[df['Crop'] == 'Wheat']
df_minority1 = df[df['Crop'] == 'Rice']
df_minority2 = df[df['Crop'] == 'Maize']
df_minority3 = df[df['Crop'] == 'Barley']
df_minority4 = df[df['Crop'] == 'Soybean']
df_minority5 = df[df['Crop'] == 'Cotton']

# Upsample minority classes
df_minority1_upsampled = resample(df_minority1, 
                                 replace=True,     # sample with replacement
                                 n_samples=len(df_majority), # match majority class
                                 random_state=42)  # reproducible results

df_minority2_upsampled = resample(df_minority2, 
                                 replace=True,
                                 n_samples=len(df_majority),
                                 random_state=42)

df_minority3_upsampled = resample(df_minority3, 
                                 replace=True,
                                 n_samples=len(df_majority),
                                 random_state=42)

df_minority4_upsampled = resample(df_minority4, 
                                 replace=True,
                                 n_samples=len(df_majority),
                                 random_state=42)

df_minority5_upsampled = resample(df_minority5, 
                                 replace=True,
                                 n_samples=len(df_majority),
                                 random_state=42)

# Combine majority and upsampled minority classes
df_balanced = pd.concat([df_majority, df_minority1_upsampled, df_minority2_upsampled, 
                        df_minority3_upsampled, df_minority4_upsampled, df_minority5_upsampled])

# Shuffle the dataset
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Check the balanced distribution
print("\nBalanced crop distribution:")
print(df_balanced['Crop'].value_counts())

# Save the cleaned and balanced dataset
df_balanced.to_csv('cleaned_crop_yield_dataset.csv', index=False)

print(f"\nCleaned dataset saved with {len(df_balanced)} records")

Null values before cleaning:
Region                    0
Soil_Type                 0
Crop                      0
Rainfall_mm               0
Temperature_Celsius       0
Fertilizer_Used           0
Irrigation_Used           0
Weather_Condition         0
Days_to_Harvest           0
Yield_tons_per_hectare    0
dtype: int64

Crop distribution:
Crop
Cotton     16665
Wheat      16664
Soybean    16664
Maize      16662
Rice       16661
Barley     16660
Name: count, dtype: int64

Balanced crop distribution:
Crop
Maize      16664
Barley     16664
Wheat      16664
Rice       16664
Soybean    16664
Cotton     16664
Name: count, dtype: int64

Cleaned dataset saved with 99984 records


In [4]:
# Create an expanded dataset with 50,000 records
def expand_dataset(df, target_size=50000):
    # Calculate how many times we need to replicate the data
    replication_factor = target_size // len(df) + 1
    
    # Replicate the dataset
    expanded_df = pd.concat([df] * replication_factor, ignore_index=True)
    
    # Add some random noise to numerical columns to create variety
    numerical_cols = ['Rainfall_mm', 'Temperature_Celsius', 'Days_to_Harvest', 'Yield_tons_per_hectare']
    
    for col in numerical_cols:
        # Add small random variations (5% of standard deviation)
        noise = np.random.normal(0, df[col].std() * 0.05, len(expanded_df))
        expanded_df[col] = expanded_df[col] + noise
        
        # Ensure no negative values for these columns
        if col in ['Rainfall_mm', 'Days_to_Harvest', 'Yield_tons_per_hectare']:
            expanded_df[col] = expanded_df[col].clip(lower=0)
    
    # Trim to exact target size
    expanded_df = expanded_df.head(target_size)
    
    return expanded_df

# Expand the balanced dataset to 50,000 records
expanded_df = expand_dataset(df_balanced, 50000)

# Verify the final dataset
print(f"Final dataset size: {len(expanded_df)} records")
print("\nFinal crop distribution:")
print(expanded_df['Crop'].value_counts())

# Save the final dataset
expanded_df.to_csv('crop_yield_50000.csv', index=False)
print("\n50,000 record dataset saved successfully!")

Final dataset size: 50000 records

Final crop distribution:
Crop
Maize      8503
Cotton     8343
Barley     8332
Rice       8308
Soybean    8288
Wheat      8226
Name: count, dtype: int64

50,000 record dataset saved successfully!
