<a href="https://colab.research.google.com/github/Juhi2504/iris-flower-classification/blob/main/Untitled31.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# ================================
# FUTURE SALES PREDICTION PROJECT
# ================================

# 1. Import Libraries
import pandas as pd
import numpy as np
import os
import zipfile

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# 2. Load Dataset
# Advertising.csv provides the columns TV, Radio, Newspaper (advertising spend
# features) and Sales (the regression target).

# The dataset is read from a zip archive under /content, so the archive is
# unpacked first and the CSV is then loaded from the extraction directory.
zip_path = "/content/archive (1).zip"
extract_path = "/content/sales_data_extracted/"
csv_name = "Advertising.csv"

# Create the extraction directory if it doesn't exist
os.makedirs(extract_path, exist_ok=True)

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    # Extract all contents
    zip_ref.extractall(extract_path)

    # Match on the exact base name rather than a suffix: a suffix test also
    # accepts entries such as "__MACOSX/._Advertising.csv" (macOS metadata) or
    # "OldAdvertising.csv", which are not the dataset.
    csv_file_in_zip = next(
        (
            member
            for member in zip_ref.namelist()
            if os.path.basename(member) == csv_name
        ),
        None,
    )

# Fail loudly when the archive does not contain the expected file.
if csv_file_in_zip is None:
    raise FileNotFoundError("Advertising.csv not found within the zip file's contents.")

data = pd.read_csv(os.path.join(extract_path, csv_file_in_zip))
# ---------------------------------------------------------------

# Show the first rows of the raw frame as a quick sanity check of the load step.
print("Dataset Preview:", data.head(), sep="\n")

# 3. Data Cleaning
# The CSV's first column is a header-less row counter that pandas loads as
# "Unnamed: 0". Because that counter is unique per row, leaving it in place
# means drop_duplicates() can never match two rows, so it is removed first.
unnamed_cols = [col for col in data.columns if str(col).startswith("Unnamed")]
data = (
    data
    .drop(columns=unnamed_cols)
    .drop_duplicates()  # compares the real data columns only
    .ffill()            # fill gaps from the previous row (replaces deprecated fillna(method="ffill"))
)

# Original script expected 'Date', 'Platform', 'Target_Segment' columns which are not in Advertising.csv.
# The steps that relied on those non-existent columns have been removed.

# 4. Feature Engineering (No time-based features as 'Date' column is missing)
# 5. Encode Categorical Features (No 'Platform' or 'Target_Segment' columns)

# 6. Feature Selection
# Predictors are the three advertising spend columns; the target is Sales.
X = data.loc[:, ["TV", "Radio", "Newspaper"]]
y = data.loc[:, "Sales"]

# 7. Train-Test Split
# Split BEFORE scaling so that the held-out rows never influence the scaler's
# mean/variance (fitting the scaler on the full dataset leaks test-set
# statistics into preprocessing).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 8. Feature Scaling
# Learn the scaling parameters from the training rows only, then apply the
# same transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix under the train-fitted scaler; kept so that any other
# cell referring to X_scaled continues to work.
X_scaled = scaler.transform(X)

# 9. Train Regression Model
# fit() returns the estimator itself, so construction and training can be chained.
model = LinearRegression().fit(X_train, y_train)

# 10. Model Evaluation
# Score the fitted model on the held-out test rows.
y_pred = model.predict(X_test)

# MSE feeds both the MSE and RMSE lines, so compute it once.
test_mse = mean_squared_error(y_test, y_pred)

print("\nModel Performance:")
for label, value in (
    ("MAE:", mean_absolute_error(y_test, y_pred)),
    ("MSE:", test_mse),
    ("RMSE:", np.sqrt(test_mse)),
    ("R2 Score:", r2_score(y_test, y_pred)),
):
    print(label, value)

# 11. Predict Future Sales - Updated to match new features
# One hypothetical spend scenario (example values for TV, Radio, Newspaper),
# built with the same column names and order the scaler was fitted on.
future_data = pd.DataFrame([{"TV": 250, "Radio": 40, "Newspaper": 70}])

# Apply the already-fitted scaler before predicting; the model was trained on
# scaled features.
future_sales = model.predict(scaler.transform(future_data))

print("\nPredicted Future Sales:", round(future_sales[0], 2))


Dataset Preview:
   Unnamed: 0     TV  Radio  Newspaper  Sales
0           1  230.1   37.8       69.2   22.1
1           2   44.5   39.3       45.1   10.4
2           3   17.2   45.9       69.3    9.3
3           4  151.5   41.3       58.5   18.5
4           5  180.8   10.8       58.4   12.9

Model Performance:
MAE: 1.4607567168117601
MSE: 3.1740973539761064
RMSE: 1.7815996615334508
R2 Score: 0.8994380241009119

Predicted Future Sales: 21.92
