# Phase 4: Statistical Analysis or Modeling

In [1]:
# Regression Modeling Code (Linear Regression)
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

# ---------------------------------------------------------------------------
# Linear-regression baseline: predict New_Customers from marketing spend and
# marketing channel, then report MSE and R² on a 20% hold-out split.
# ---------------------------------------------------------------------------

# Column roles, named once so the feature/target selection and the encoder
# configuration cannot drift apart.
DATA_PATH = "updated_customer_data_with_cac.csv"
TARGET_COLUMN = 'New_Customers'
CATEGORICAL_COLUMNS = ['Marketing_Channel']
FEATURE_COLUMNS = ['Marketing_Spend'] + CATEGORICAL_COLUMNS

# Load the dataset and separate the predictors from the target.
df = pd.read_csv(DATA_PATH)
X = df[FEATURE_COLUMNS]
y = df[TARGET_COLUMN]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# One-hot encode the channel column. drop='first' omits one indicator per
# feature so the encoded columns are not perfectly collinear with the
# regression intercept.
# NOTE(review): the encoder keeps its default unknown-category handling, so a
# channel that appears only in the test split would raise at predict time.
channel_encoder = OneHotEncoder(drop='first')

# Columns not listed in a transformer (here: Marketing_Spend) are passed
# through to the regressor unchanged.
preprocessor = ColumnTransformer(
    [('cat', channel_encoder, CATEGORICAL_COLUMNS)],
    remainder='passthrough',
)

# Chain preprocessing and the estimator so the encoder is fitted on the
# training rows only.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

# Fit on the training rows, then predict the held-out rows.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Score the hold-out predictions.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report both metrics, rounded to two decimals, under a blank-line-led header.
print(
    "\nModel Evaluation:",
    f"Mean Squared Error (MSE): {mse:.2f}",
    f"R-squared (R²): {r2:.2f}",
    sep="\n",
)



Model Evaluation:
Mean Squared Error (MSE): 145.40
R-squared (R²): -0.03
