<a href="https://colab.research.google.com/github/Mehaboob999/notebook-rough/blob/main/04_profit_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [None]:
# Read the cleaned sales CSV into a DataFrame and preview its first rows.
sales_csv = '/content/cleaned_sales.csv'
df = pd.read_csv(sales_csv)
df.head()


In [None]:
# Remove every row that has at least one missing value, so the encoding and
# modelling steps below only see complete records.
df.dropna(inplace=True)

# Convert each object-dtype (text) column to integer codes.
# A separate LabelEncoder is fitted per column and kept in `encoders`; reusing
# a single encoder would overwrite the previous column's mapping on every
# iteration, leaving no way to translate codes back to the original labels.
# To decode later: encoders[col].inverse_transform(codes).
cat_cols = df.select_dtypes(include='object').columns
encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

# Confirm preprocessing: print column dtypes and non-null counts.
df.info()


In [None]:
# The target is the profit column. Predictors are all remaining columns except
# order_number and order_date, which are excluded from modelling.
excluded_cols = ['profit', 'order_number', 'order_date']
y = df['profit']
X = df.drop(columns=excluded_cols)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [None]:
# Fit a random forest regressor (100 trees, fixed seed) on the training split.
forest_settings = {'n_estimators': 100, 'random_state': 42}
model = RandomForestRegressor(**forest_settings)
model.fit(X_train, y_train)


In [None]:
# Predict profit for the held-out rows.
y_pred = model.predict(X_test)

# Evaluate the predictions against the true held-out values.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

# Report both metrics, one per line.
for label, value in (("Mean Squared Error:", mse), ("R² Score:", r2)):
    print(label, value)


In [None]:
# Importance score the fitted forest assigns to each predictor column.
importances = model.feature_importances_
feature_names = X.columns

# Rank the scores from highest to lowest so the chart reads top-down by
# influence instead of following the arbitrary column order.
ranked = pd.Series(importances, index=feature_names).sort_values(ascending=False)

# Horizontal bar chart with explicit axis labels so the figure stands alone.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=ranked.values, y=ranked.index, ax=ax)
ax.set_title('Feature Importance')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
plt.show()


In [None]:
# Collect the held-out features together with the true and predicted profit.
# y_test is a Series and aligns on the row index it shares with X_test;
# y_pred is positional and is in the same row order as X_test.
pred_df = X_test.copy()
pred_df['actual_profit'] = y_test
pred_df['predicted_profit'] = y_pred

# Create the destination folder first: to_csv raises OSError when the parent
# directory does not exist, and nothing earlier in the notebook creates it.
output_path = Path('../data/profit_predictions.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
pred_df.to_csv(output_path, index=False)

print("✅ profit_predictions.csv saved!")
