In [None]:
import pandas as pd

# Load the datasets
train_df = pd.read_csv('../Train.csv')
test_df = pd.read_csv('../Test.csv')

# Preview the first few rows of each dataset
train_preview = train_df.head()
test_preview = test_df.head()

train_preview, test_preview


In [None]:
# Discard the 'Yield' column from the train dataset
train_df.drop('Yield', axis=1, inplace=True)

# Combine the datasets into one
combined_df = pd.concat([train_df, test_df], ignore_index=True)

# Display the first few rows of the combined dataset to verify
combined_df.head()


In [None]:
# Calculate the percentage of missing values for each column
missing_values = combined_df.isnull().mean() * 100

# Display the percentage of missing values for each column
missing_values.sort_values(ascending=False)


In [None]:
from sklearn.impute import SimpleImputer

# Function to impute missing dates based on the median date within the same district or block
def impute_dates(dataframe, date_columns):
    for column in date_columns:
        # Convert to datetime
        dataframe[column] = pd.to_datetime(dataframe[column], errors='coerce')
        # Group by 'District' and 'Block' and then transform using the median date
        dataframe[column] = dataframe.groupby(['District', 'Block'])[column].transform(lambda x: x.fillna(x.median()))
    return dataframe

# Categorical columns with low missingness (<15%)
categorical_cols_low_miss = missing_values.index[(missing_values < 15) & (combined_df.dtypes == 'object')]

# Continuous numerical columns with low missingness (<15%)
numerical_cols_low_miss = missing_values.index[(missing_values < 15) & (combined_df.dtypes != 'object')]

# Date columns
date_columns = ['CropTillageDate', 'RcNursEstDate', 'Harv_date', 'Threshing_date']

# Impute missing dates
combined_df = impute_dates(combined_df, date_columns)

# Simple Imputer for categorical data
mode_imputer = SimpleImputer(strategy='most_frequent')
combined_df[categorical_cols_low_miss] = mode_imputer.fit_transform(combined_df[categorical_cols_low_miss])

# Simple Imputer for numerical data
mean_imputer = SimpleImputer(strategy='mean')
combined_df[numerical_cols_low_miss] = mean_imputer.fit_transform(combined_df[numerical_cols_low_miss])

# Check missing values again
new_missing_values = combined_df.isnull().mean() * 100

combined_df[date_columns].info(), combined_df[categorical_cols_low_miss].info(), combined_df[numerical_cols_low_miss].info(), new_missing_values.sort_values(ascending=False)


In [None]:
from sklearn.impute import KNNImputer

# Drop features with more than 70% missingness
threshold = 70
features_to_drop = missing_values.index[missing_values > threshold].tolist()
combined_df.drop(columns=features_to_drop, inplace=True)

# Impute numerical features with high missingness using k-NN
# Identifying numerical columns with high missingness (between 15% to 70%)
numerical_cols_high_miss = missing_values.index[(missing_values >= 15) & (missing_values <= 70) & (combined_df.dtypes != 'object')].tolist()

# Since k-NN imputer works only with numerical data, ensure to exclude any non-numerical columns
numerical_cols_for_knn = [col for col in numerical_cols_high_miss if combined_df[col].dtype in ['int64', 'float64']]

# k-NN imputer
knn_imputer = KNNImputer(n_neighbors=5)
combined_df[numerical_cols_for_knn] = knn_imputer.fit_transform(combined_df[numerical_cols_for_knn])

# For categorical variables, we impute missing values with the most frequent value within the same district or block
categorical_cols_high_miss = missing_values.index[(missing_values >= 15) & (missing_values <= 70) & (combined_df.dtypes == 'object')].tolist()

for column in categorical_cols_high_miss:
    combined_df[column] = combined_df.groupby(['District', 'Block'])[column].apply(lambda x: x.fillna(x.mode().iloc[0] if not x.mode().empty else "Unknown"))

# Check the dataset for any remaining missing values
final_missing_values = combined_df.isnull().mean() * 100

final_missing_values.sort_values(ascending=False), combined_df.info()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set the aesthetic style of the plots
sns.set_style("whitegrid")

# Univariate Analysis for Numerical Variables
numerical_vars = ['CultLand', 'CropCultLand', 'Acre']
categorical_vars = ['District', 'LandPreparationMethod']

# Summary statistics for numerical variables
summary_statistics = combined_df[numerical_vars].describe()

# Creating histograms for numerical variables
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 5))
for i, var in enumerate(numerical_vars):
    sns.histplot(combined_df[var], ax=axes[i], kde=True)
    axes[i].set_title(f'Distribution of {var}', fontsize=14)
plt.tight_layout()

# Creating box plots for numerical variables
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 5))
for i, var in enumerate(numerical_vars):
    sns.boxplot(y=combined_df[var], ax=axes[i])
    axes[i].set_title(f'Box Plot of {var}', fontsize=14)
plt.tight_layout()

# Univariate Analysis for Categorical Variables
# Frequency distribution for categorical variables
frequency_distributions = {}
for var in categorical_vars:
    frequency_distributions[var] = combined_df[var].value_counts()

# Creating bar charts for categorical variables
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(18, 5))
for i, var in enumerate(categorical_vars):
    sns.barplot(x=frequency_distributions[var].values, y=frequency_distributions[var].index, ax=axes[i])
    axes[i].set_title(f'Frequency Distribution of {var}', fontsize=14)
    axes[i].set_xlabel('Frequency')
    axes[i].set_ylabel(var)
plt.tight_layout()

# Output the summary statistics and show the plots
summary_statistics, plt.show()


In [None]:
# Function to find outliers using the Interquartile Range (IQR) method
def find_outliers_IQR(df, feature):
    Q1 = df[feature].quantile(0.25)
    Q3 = df[feature].quantile(0.75)
    IQR = Q3 - Q1
    outliers = df[(df[feature] < (Q1 - 1.5 * IQR)) | (df[feature] > (Q3 + 1.5 * IQR))]
    return outliers

# Investigate outliers for each numerical variable
outliers_dict = {}
for var in numerical_vars:
    outliers_dict[var] = find_outliers_IQR(combined_df, var)

# Number of outliers for each variable
outliers_count = {var: len(outliers) for var, outliers in outliers_dict.items()}

# Proportion of data that is outlier for each variable
outliers_proportion = {var: (len(outliers) / len(combined_df)) * 100 for var, outliers in outliers_dict.items()}

outliers_count, outliers_proportion


In [None]:
import numpy as np

# Apply log transformation to the numerical variables with a small constant to avoid log(0)
# Using log1p which is equivalent to log(x+1) to handle zero values
for var in numerical_vars:
    combined_df[f'log_{var}'] = np.log1p(combined_df[var])

# Visualizing the transformed variables
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 5))
for i, var in enumerate(numerical_vars):
    sns.histplot(combined_df[f'log_{var}'], ax=axes[i], kde=True)
    axes[i].set_title(f'Log Transformed Distribution of {var}', fontsize=14)
plt.tight_layout()

# Creating box plots for the log-transformed numerical variables
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 5))
for i, var in enumerate(numerical_vars):
    sns.boxplot(y=combined_df[f'log_{var}'], ax=axes[i])
    axes[i].set_title(f'Box Plot of Log Transformed {var}', fontsize=14)
plt.tight_layout()

# Show the plots
plt.show()


In [None]:
# Iterate over columns and drop original columns if their log-transformed versions exist
columns_to_drop = []
for col in combined_df.columns:
    log_col = f'log_{col}'
    if log_col in combined_df.columns:
        columns_to_drop.append(col)

# Drop the identified columns
combined_df.drop(columns=columns_to_drop, axis=1, inplace=True)

# Display the first few rows of the updated dataset
combined_df.head()


In [None]:
# Check the number of unique categories for each categorical variable
unique_counts = combined_df[categorical_vars].nunique()

# One-hot encode 'LandPreparationMethod' as an example
# We'll check the number of unique values before deciding to one-hot encode
encoded_columns = pd.get_dummies(combined_df['LandPreparationMethod'], prefix='LPM')

# Output the unique counts for each categorical variable and the first few rows of the encoded example
unique_counts, encoded_columns.head()


In [None]:
# Frequency encoding for 'LandPreparationMethod'
frequency_encoding = combined_df['LandPreparationMethod'].value_counts(normalize=True)

# Map the frequencies to the original data
combined_df['LandPreparationMethod_freq'] = combined_df['LandPreparationMethod'].map(frequency_encoding)

# Show the first few entries of the frequency encoded column
combined_df[['LandPreparationMethod', 'LandPreparationMethod_freq']].head()


In [None]:
# Reload the Train.csv file to extract the 'Yield' feature
train_df_yield = pd.read_csv('../Train.csv', usecols=['ID', 'Yield'])

# Merge the 'Yield' feature back into the preprocessed dataset using 'ID'
combined_df_with_yield = combined_df.merge(train_df_yield, on='ID', how='left')

# Separate the entries without 'Yield' data into a new DataFrame
df_without_yield = combined_df_with_yield[combined_df_with_yield['Yield'].isnull()]
df_with_yield = combined_df_with_yield[combined_df_with_yield['Yield'].notnull()]

# Show the first few entries of each dataset to verify the merge
df_with_yield.head(), df_without_yield.head()


In [13]:
# save the preprocessed datasets to new csv files for modelling 
df_with_yield.to_csv('preprocessed_train.csv', index=False)
df_without_yield.to_csv('preprocessed_test.csv', index=False)

In [None]:
import numpy as np
import pandas as pd

# Load the datasets
data_with_yield = pd.read_csv('preprocessed_train.csv')
data_without_yield = pd.read_csv('preprocessed_test.csv')

# Mark the datasets before combining
data_with_yield['is_train'] = True
data_without_yield['is_train'] = False

# Combine the datasets
combined_data = pd.concat([data_with_yield, data_without_yield], ignore_index=True)

# Apply transformations on the combined dataset

# 1. Identifying and Handling Date Features
date_cols = []
for col in combined_data.columns:
    if combined_data[col].dtype == 'object':
        # Attempt to convert column to datetime
        temp_col = pd.to_datetime(combined_data[col], errors='coerce')
        # Check the number of successful conversions
        if not temp_col.isna().sum() / len(temp_col) > 0.5:  # Threshold for successful conversion
            date_cols.append(col)
            combined_data[col] = temp_col
            # Extracting date components
            combined_data[f'{col}_day'] = combined_data[col].dt.day
            combined_data[f'{col}_month'] = combined_data[col].dt.month
            combined_data[f'{col}_year'] = combined_data[col].dt.year
            # Calculate elapsed time
            reference_date = max(combined_data[col].max(), pd.Timestamp('now'))
            combined_data[f'DaysSince_{col}'] = (reference_date - combined_data[col]).dt.days


# 2. Transforming High Cardinality Categorical Features
high_cardinality_cols = ['LandPreparationMethod', 'SeedingSowingTransplanting', 'NursDetFactor', 'TransDetFactor']

def split_methods(df, column):
    if df[column].dtype == object:
        methods_series = df[column].str.split(' ')
        unique_methods = set(method for method_list in methods_series.dropna() for method in method_list)
        for method in unique_methods:
            df[f'{column}_{method}'] = methods_series.apply(lambda x: method in x if x is not None else False)
    return df.drop(columns=[column], errors='ignore')

# Applying the transformation to high cardinality columns
for col in high_cardinality_cols:
    combined_data = split_methods(combined_data, col)


# 3. One-hot Encoding for Moderate and Low Cardinality Categorical Features
categorical_cols = combined_data.select_dtypes(include=['object']).columns
categorical_cols = categorical_cols.drop('ID')  # Exclude 'ID' from one-hot encoding
combined_data = pd.get_dummies(combined_data, columns=categorical_cols, drop_first=True)

# 4. Normalizing Skewed Numerical Features (apply before separating, excluding 'Yield')
numerical_cols = combined_data.select_dtypes(include=['float64', 'int64']).columns
numerical_cols = numerical_cols.drop(['Yield', 'ID'], errors='ignore') # Exclude 'Yield' and 'ID' from this transformation
numerical_skewness = combined_data[numerical_cols].skew()
highly_skewed_features = numerical_skewness[abs(numerical_skewness) > 1].index

for feature in highly_skewed_features:
    combined_data[f'log_{feature}'] = np.log(combined_data[feature] + 1)

# Splitting the datasets
data_with_yield = combined_data[combined_data['is_train']]
data_without_yield = combined_data[~combined_data['is_train']]
data_with_yield.drop('is_train', axis=1, inplace=True)
data_without_yield.drop('is_train', axis=1, inplace=True)

# Applying log transformation to the 'Yield' column
if 'Yield' in data_with_yield.columns:
    data_with_yield['log_Yield'] = np.log(data_with_yield['Yield'] + 1)

# Display the first few rows of the transformed datasets
print(data_with_yield.head())
print(data_without_yield.head())


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from kerastuner.tuners import RandomSearch


In [16]:
# drop 'ID' column from data_with_yield 
data_with_yield = data_with_yield.drop(columns=['ID'])

In [16]:
data_with_yield.head()

Unnamed: 0,ID,CropTillageDate,CropTillageDepth,RcNursEstDate,SeedlingsPerPit,TransplantingIrrigationHours,TransIrriCost,StandingWater,Ganaura,CropOrgFYM,...,log_Harv_hand_rent,log_Residue_length,log_Residue_perc,log_log_Acre,log_Harv_date_month,log_Harv_date_year,log_DaysSince_Harv_date,log_Threshing_date_month,log_Threshing_date_year,log_Yield
0,ID_GTFAC7PEVWQ9,2022-07-20,5.0,2022-06-27,2.0,5.0,200.0,2.0,21.0,3.6,...,6.32452,3.433987,3.713572,0.240538,2.484907,7.612337,5.891644,2.484907,7.612337,6.398595
1,ID_TK40ARLSPOKS,2022-07-18,5.0,2022-06-20,2.0,5.0,125.0,3.0,22.8,8.8,...,1.386294,3.218876,2.397895,0.240538,2.484907,7.612337,5.866468,2.564949,7.612337,6.398595
2,ID_1FJY2CRIMLZZ,2022-06-30,6.0,2022-06-20,2.0,4.0,80.0,2.0,1.0,1.0,...,6.175867,3.433987,2.397895,0.129404,2.564949,7.612337,5.817111,0.693147,7.612831,5.420535
3,ID_I3IPXS4DB7NE,2022-06-16,6.0,2022-06-17,2.0,8.100613,216.0,3.252529,1.0,1.2,...,5.484797,3.295837,2.397895,0.18288,2.564949,7.612337,5.846439,2.564949,7.612337,6.150603
4,ID_4T8YQWXWHB4A,2022-07-19,4.0,2022-06-21,2.0,9.0,300.0,2.0,21.2,2.0,...,6.32452,3.218876,3.713572,0.325275,2.484907,7.612337,5.852202,2.564949,7.612337,6.311735


In [None]:
# Iterate over columns and drop original columns if their log-transformed versions exist
columns_to_drop = []
for col in data_with_yield.columns:
    log_col = f'log_{col}'
    if log_col in data_with_yield.columns:
        columns_to_drop.append(col)

# Drop the identified columns
data_with_yield.drop(columns=columns_to_drop, axis=1, inplace=True)

# Display the first few rows of the updated dataset
data_with_yield.head()


In [None]:
data_with_yield.head()

In [None]:
# Iterate over columns and drop original columns if their log-transformed versions exist
columns_to_drop = []
for col in data_without_yield.columns:
    log_col = f'log_{col}'
    if log_col in data_without_yield.columns:
        columns_to_drop.append(col)

# Drop the identified columns
data_without_yield.drop(columns=columns_to_drop, axis=1, inplace=True)

# Display the first few rows of the updated dataset
data_without_yield.head()


In [20]:
# save the preprocessed datasets to new csv files for modelling
data_with_yield.to_csv('preprocessed+_train.csv', index=False)
data_without_yield.to_csv('preprocessed+_test.csv', index=False)

In [21]:
# Ensure that only numerical columns are used for scaling
numerical_cols = data_with_yield.select_dtypes(include=['float64', 'int64']).columns

# Filter out non-numerical columns
X_numerical = data_with_yield[numerical_cols]

# Separate features and target variable
X = X_numerical.drop('log_Yield', axis=1)
y = data_with_yield['log_Yield'].values

# Normalizing the data - only apply scaler to numerical columns
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset into train and test sets
X_train_full, X_test, y_train_full, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Define the model building function for tuning
def build_model(hp):
    model = Sequential()
    model.add(Dense(hp.Int('input_units', min_value=64, max_value=512, step=64), activation='relu', input_shape=(X_train_full.shape[1],)))
    
    for i in range(hp.Int('n_layers', 1, 5)):
        model.add(Dense(hp.Int(f'dense_{i}_units', min_value=32, max_value=256, step=32), activation='relu'))
        model.add(Dropout(hp.Float(f'dropout_{i}_rate', min_value=0, max_value=0.5, step=0.1)))
    
    model.add(Dense(1, activation='linear'))
    model.compile(optimizer=tf.keras.optimizers.Adam(hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='LOG')),
                  loss='mean_squared_error', 
                  metrics=[tf.keras.metrics.RootMeanSquaredError()])
    return model

In [None]:
from kerastuner import Objective

# Initialize and configure the tuner
tuner = RandomSearch(
    build_model,
    objective=Objective('val_loss', direction='min'),
    max_trials=10,
    executions_per_trial=3,
    directory='model_tuning',
    project_name='yield_prediction'
)

tuner.search_space_summary()


In [23]:
# Start the hyperparameter tuning process
tuner.search(X_train_full, y_train_full, epochs=50, validation_split=0.2, callbacks=[EarlyStopping(monitor='val_loss', patience=10)])

Trial 10 Complete [00h 00m 33s]
val_loss: 0.13474103808403015

Best val_loss So Far: 0.08985296885172527
Total elapsed time: 00h 03m 37s


In [None]:
# Retrieve the best hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# Define the K-Fold cross-validator
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Initialize list to store RMSE for each fold
rmse_scores = []

# K-Fold Cross-Validation
for train_index, val_index in kf.split(X_train_full):
    # Split data
    X_train, X_val = X_train_full[train_index], X_train_full[val_index]
    y_train, y_val = y_train_full[train_index], y_train_full[val_index]

    # Build the model with the best hyperparameters
    model = tuner.hypermodel.build(best_hps)

    # Train the model
    model.fit(X_train, y_train, epochs=50, validation_data=(X_val, y_val), verbose=5)

    # Evaluate the model on the validation set
    val_pred = model.predict(X_val)
    val_rmse = np.sqrt(mean_squared_error(y_val, val_pred))
    rmse_scores.append(val_rmse)

# Calculate the average RMSE across all folds
average_rmse = np.mean(rmse_scores)
print("Average RMSE across all folds:", average_rmse)


In [None]:
# Ensure that the 'Yield' column is not in the dataset before scaling
if 'Yield' in data_without_yield.columns:
    data_without_yield = data_without_yield.drop(columns=['Yield'])

# Select only numerical columns for prediction
data_without_yield_numerical = data_without_yield.select_dtypes(include=['float64', 'int64'])

# Normalize the data - only apply scaler to numerical columns
data_without_yield_numerical = scaler.transform(data_without_yield_numerical)

# Make predictions using the trained model
predicted_yield = model.predict(data_without_yield_numerical)

# Convert predictions to the original scale if they were log-transformed
predicted_yield = np.exp(predicted_yield) - 1  # If 'log_Yield' was used as target

# Ensure predicted 'Yield' is of type float
predicted_yield = predicted_yield.flatten().astype(float)

# Create a DataFrame with IDs and corresponding predictions
predictions_df = pd.DataFrame({
    'ID': ids,
    'Yield': predicted_yield.flatten()  # Flatten to convert from 2D to 1D array
})

# Restricting 'Yield' to two decimal places
predictions_df['Yield'] = predictions_df['Yield'].round(2)

# Save the predictions to a CSV file
predictions_df.to_csv('subtk7.csv', index=False)

# Display the first few rows of the predictions DataFrame
predictions_df.head()


In [None]:
import catboost as cb
from sklearn.model_selection import train_test_split

# Assuming data_with_yield is a pandas DataFrame with the target column 'log_Yield'

# Separate the features and the target variable
X = data_with_yield.drop(columns=['log_Yield'])  # Drop 'ID' if it's not a feature
y = data_with_yield['log_Yield']

# Split the data into a training set and a validation set
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize CatBoostRegressor
catboost_model = cb.CatBoostRegressor(
    iterations=1000,
    learning_rate=0.03,
    depth=6,
    loss_function='RMSE',
    eval_metric='RMSE',
    random_seed=42,
    verbose=100
)

# Fit the model
catboost_model.fit(X_train, y_train, eval_set=(X_val, y_val), use_best_model=True)

# Save the model to a file
catboost_model.save_model('catboost_model.cbm')

# Make predictions on the validation set
predictions = catboost_model.predict(X_val)

# Calculate RMSE
val_rmse = np.sqrt(mean_squared_error(y_val, predictions))
print("Validation RMSE:", val_rmse)


In [None]:
# Prepare the data_without_yield dataset by selecting the same features used for training
# Exclude the 'ID' column if it's not used as a feature
X_to_predict = data_without_yield.drop(columns=['ID'])

# Make predictions using the CatBoost model
predictions_without_yield = catboost_model.predict(X_to_predict)

# Reverse the log transformation if you used log1p during training
predictions_without_yield = np.expm1(predictions_without_yield)

# Round predictions to two decimal places
predictions_without_yield = np.round(predictions_without_yield, 2)

# Create a DataFrame with IDs and corresponding predictions
predictions_df = pd.DataFrame({
    'ID': data_without_yield['ID'],  # Make sure 'ID' column exists in the data_without_yield
    'Yield': predictions_without_yield
})

# Save the predictions to a CSV file
predictions_df.to_csv('catboost_tk7.csv', index=False)

# Display the first few rows of the predictions DataFrame
predictions_df.head()