# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Read the training data from disk.
train = pd.read_csv("stores_sales_data.csv")

# Print a quick structural overview: dimensions, column names, dtypes,
# per-column missing-value counts and the raw Item_Fat_Content labels.
overview = [
    ("Shape:", train.shape),
    ("\nColumns:", list(train.columns)),
    ("\nData types:\n", train.dtypes),
    ("\nMissing values:\n", train.isna().sum()),
    ("\nCategorical values (Item_Fat_Content):", train['Item_Fat_Content'].unique()),
]
for label, value in overview:
    print(label, value)

# Numerical gap-filling: missing Item_Weight values take the column mean.
weight_mean = train['Item_Weight'].mean()
train['Item_Weight'] = train['Item_Weight'].fillna(weight_mean)

# Categorical gap-filling: missing Outlet_Size values take the most frequent
# label (first entry of the mode).
most_common_size = train['Outlet_Size'].mode()[0]
train['Outlet_Size'] = train['Outlet_Size'].fillna(most_common_size)

# Collapse the alternative spellings of Item_Fat_Content onto the two
# canonical labels.
fat_content_aliases = {'LF': 'Low Fat', 'low fat': 'Low Fat', 'reg': 'Regular'}
train['Item_Fat_Content'] = train['Item_Fat_Content'].replace(fat_content_aliases)

# Histogram of the target with a KDE overlay.
sales = train['Item_Outlet_Sales']
sns.histplot(sales, kde=True)
plt.title("Distribution of Item Outlet Sales")
plt.show()

# Pairwise correlations between the numeric item columns and the target.
numeric_cols = ['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Item_Outlet_Sales']
corr = train[numeric_cols].corr()
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.title("Correlation Matrix")
plt.show()

# Bar chart of sales per outlet type; tick labels are rotated for readability.
sns.barplot(data=train, x='Outlet_Type', y='Item_Outlet_Sales')
plt.title("Average Sales by Outlet Type")
plt.xticks(rotation=45)
plt.show()

# One-hot encode the categorical columns. drop_first=True keeps k-1 indicator
# columns for a variable with k levels.
train = pd.get_dummies(
    train,
    columns=['Item_Fat_Content', 'Item_Type', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type'],
    drop_first=True,
)

# Outlet Age (assuming current year is 2013 for the dataset)
train['Outlet_Age'] = 2013 - train['Outlet_Establishment_Year']

# Item Visibility Ratio (visibility vs. average visibility for the product)
mean_visibility = train.groupby('Item_Identifier')['Item_Visibility'].transform('mean')
visibility_ratio = train['Item_Visibility'] / mean_visibility
# When a product's mean visibility is 0 the division is 0/0 and yields NaN,
# which would leak into the model features. Such rows are exactly at their
# product's average, so they get the neutral ratio 1.0 instead.
train['Visibility_Ratio'] = visibility_ratio.mask(mean_visibility == 0, 1.0)

# Identifiers and the raw establishment year are not used as features
# (the year is already represented by Outlet_Age).
train = train.drop(['Item_Identifier', 'Outlet_Identifier', 'Outlet_Establishment_Year'], axis=1)

# Separate the target column from the feature matrix.
y = train['Item_Outlet_Sales']
X = train.drop(columns='Item_Outlet_Sales')

# Hold out 20% of the rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Baseline: a 100-tree random forest fitted on the training portion.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Score the baseline on the held-out rows.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print(f"RMSE: {rmse:.2f}\nR-squared: {r2:.2f}")

# Hyper-parameter candidates explored by the grid search (2 x 3 x 2 combos).
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
)

# 3-fold cross-validated search over the grid, scored by negative MSE.
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
)
grid_search.fit(X_train, y_train)

# Report the winning combination and keep the corresponding estimator.
print("Best Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

test = pd.read_csv("BigMart_Test.csv")

# Apply the same preprocessing steps as training data.
# Item_Weight is still a column of `train`, so its mean can be read directly.
test['Item_Weight'] = test['Item_Weight'].fillna(train['Item_Weight'].mean())

# `train` has been one-hot encoded by this point and no longer has a raw
# Outlet_Size column, so the training mode is recovered from the raw
# training file instead of indexing `train` (which raised KeyError).
train_outlet_size_mode = pd.read_csv(
    "stores_sales_data.csv", usecols=['Outlet_Size']
)['Outlet_Size'].mode()[0]
test['Outlet_Size'] = test['Outlet_Size'].fillna(train_outlet_size_mode)

test['Item_Fat_Content'] = test['Item_Fat_Content'].replace({'LF': 'Low Fat', 'low fat': 'Low Fat', 'reg': 'Regular'})
test['Outlet_Age'] = 2013 - test['Outlet_Establishment_Year']

# Visibility relative to the product's mean visibility. A product whose mean
# visibility is 0 would give 0/0 = NaN; those rows get the neutral ratio 1.0.
test_mean_visibility = test.groupby('Item_Identifier')['Item_Visibility'].transform('mean')
test['Visibility_Ratio'] = (test['Item_Visibility'] / test_mean_visibility).mask(
    test_mean_visibility == 0, 1.0
)

# No drop_first here: the reindex below keeps only the columns the model was
# trained on, so the extra indicator columns are discarded there.
test = pd.get_dummies(test, columns=['Item_Fat_Content', 'Item_Type', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type'])

# Align columns with training data: same columns, same order, with 0 for any
# indicator column that does not occur in the test file. The aligned matrix
# is kept separate from `test` so the identifier columns needed for the
# output file are not lost.
test_features = test.reindex(columns=X_train.columns, fill_value=0)

test_predictions = best_model.predict(test_features)
test['Predicted_Sales'] = test_predictions

# Save results
test[['Item_Identifier', 'Outlet_Identifier', 'Predicted_Sales']].to_csv("new_data.csv", index=False)


