# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report, f1_score


# Load the competition files. The paths are hard-coded under
# /kaggle/input/datasettt, so this only runs where that directory exists.
train = pd.read_csv("/kaggle/input/datasettt/train.csv")
test = pd.read_csv("/kaggle/input/datasettt/test.csv")
sample_sub = pd.read_csv("/kaggle/input/datasettt/sample_submission.csv")

print("Train shape:", train.shape)
print("Test shape:", test.shape)

# Preview of the first rows; the value is not stored or printed, so this has
# no visible effect outside an interactive notebook cell.
train.head()


# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly -- wrapping it in print() would add a stray "None" line.
train.info()
# Per-column count of missing values.
print(train.isnull().sum())




# Name of the label column in the training data.
target_col = "fruit_name"

# Features are everything except the label. The "id" column is dropped from
# the test features below, so it is removed from the training features too:
# otherwise train and test would expose different feature columns, and a row
# identifier would be fed to the model. errors="ignore" makes the second drop
# a no-op when train has no "id" column; the first drop stays strict so a
# missing label column still raises KeyError.
X = train.drop(columns=[target_col]).drop(columns=["id"], errors="ignore")
y = train[target_col]

# Store test ID safely for final submission
test_id = test["id"]
X_test_original = test.drop(columns=["id"])

print("Features shape:", X.shape)
print("Target shape:", y.shape)


# Stack train and test features so both go through the same preprocessing.
# Train rows come first; ignore_index=True renumbers the rows 0..n-1.
full = pd.concat([X, X_test_original], axis=0, ignore_index=True)

print("Combined full shape:", full.shape)


# Column groups are chosen by dtype: int64/float64 are treated as numeric and
# object as categorical. Columns of any other dtype are left untouched here.
num_cols_full = full.select_dtypes(include=['int64', 'float64']).columns
cat_cols_full = full.select_dtypes(include=['object']).columns

print("Numeric columns:", list(num_cols_full))
print("Categorical columns:", list(cat_cols_full))

# NOTE(review): both imputers learn their fill values from train and test rows
# together, because `full` holds both.

# Imputer for numeric columns: fill NaN with median.
# SimpleImputer raises ValueError when given zero columns, so each step is
# skipped when its column group is empty.
num_imputer = SimpleImputer(strategy="median")
if len(num_cols_full) > 0:
    full[num_cols_full] = num_imputer.fit_transform(full[num_cols_full])

# Imputer for categorical columns: fill NaN with most frequent (mode)
cat_imputer = SimpleImputer(strategy="most_frequent")
if len(cat_cols_full) > 0:
    full[cat_cols_full] = cat_imputer.fit_transform(full[cat_cols_full])

# Check if any missing values left
print("\nMissing values after imputation:")
print(full.isnull().sum())


def cap_out(df, nums_cols):
    """Return a copy of *df* with the given columns capped at the IQR fences.

    For every column named in *nums_cols*, values below ``Q1 - 1.5 * IQR``
    are raised to that limit and values above ``Q3 + 1.5 * IQR`` are lowered
    to that limit. The quartiles are computed over the whole column of *df*.
    The input frame is not modified.

    Args:
        df: DataFrame holding the columns to cap.
        nums_cols: Iterable of numeric column names present in *df*.

    Returns:
        A new DataFrame with the same index and columns as *df*, where the
        listed columns have been clipped.

    Raises:
        KeyError: If a name in *nums_cols* is not a column of *df*.
    """
    capped = df.copy()
    for col in nums_cols:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        # Tukey fences: 1.5 * IQR beyond the first and third quartiles.
        capped[col] = df[col].clip(q1 - 1.5 * iqr, q3 + 1.5 * iqr)
    return capped


# One-hot encode the combined frame in a single pass so train and test end up
# with the same dummy columns; drop_first=True omits the first level of each
# encoded column.
full_encoded = pd.get_dummies(data=full, drop_first=True)

print("Shape after one-hot encoding:", full_encoded.shape)
# Preview only; the result is not stored or printed.
full_encoded.head()


# Train rows were stacked first in `full`, so the row count of `train` marks
# the boundary between the two sets.
n_train = train.shape[0]

X_prepared = full_encoded.iloc[:n_train]        # rows for train
X_test_prepared = full_encoded.iloc[n_train:]   # rows for test

print("Prepared train features shape:", X_prepared.shape)
print("Prepared test features shape:", X_test_prepared.shape)

# Feature scaling (Standardization): the scaler is fitted on the training
# rows only and then applied to both sets.
scaler = StandardScaler()
scaler.fit(X_prepared)
X_scaled = scaler.transform(X_prepared)
X_test_scaled = scaler.transform(X_test_prepared)




# Map the class labels to integer codes; the fitted encoder is kept so the
# codes can be translated back to labels later.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

print("Original classes:", label_encoder.classes_)
print("Example encoded labels:", y_encoded[:10])


# Hold out 20% of the training rows for validation. Stratifying on the encoded
# labels keeps the class proportions similar in both parts; the fixed seed
# makes the split reproducible.
_split_parts = train_test_split(
    X_scaled,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)
X_train, X_valid, y_train, y_valid = _split_parts

print("Train shape:", X_train.shape)
print("Validation shape:", X_valid.shape)



# Fit a random forest on the training split (fit() returns the estimator
# itself) and predict the held-out validation rows.
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1).fit(
    X_train, y_train
)
y_pred = rf.predict(X_valid)

# Validation metrics: overall accuracy, per-class report, confusion matrix.
print("Accuracy :", accuracy_score(y_valid, y_pred))

print("\n Classfication Report : ")
print(classification_report(y_valid, y_pred))

print("Confusion Matrix : ")
print(confusion_matrix(y_valid, y_pred))



# Refit the same random-forest configuration on all training rows
# (training split plus validation split) for the final predictions.
final_rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
final_rf.fit(X_scaled, y_encoded)


# Predict the integer class codes for the test rows, then translate them back
# to the original labels.
test_pred_enco = final_rf.predict(X_test_scaled)
test_pred_label = label_encoder.inverse_transform(test_pred_enco)

# NOTE(review): the prediction column is written as "Class" while the label
# column in train is "fruit_name" -- confirm the expected header against
# sample_submission.csv.
_submission_columns = {
    "id" : test_id,
    "Class" : test_pred_label,
}
submission = pd.DataFrame(_submission_columns)
# Preview only; the result is not stored or printed.
submission.head()


# Write the submission to the working directory without the index column.
submission.to_csv("submission.csv", index=False)