In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler

In [2]:
path = "files_for_lab/csv_files/marketing_customer_analysis.csv"
data = pd.read_csv(path)

import warnings

pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

In [3]:

X = data.drop('Response', axis=1)
y = data['Response']

In [4]:
scaler = StandardScaler()

numerical_columns = X.select_dtypes(include=['float64']).columns

# Fit and transform the data
X[numerical_columns] = scaler.fit_transform(X[numerical_columns])

In [5]:

print(X[numerical_columns].head())

   Customer Lifetime Value  Total Claim Amount
0                -0.762878           -0.169640
1                -0.149245            2.400737
2                 0.710636            0.455734
3                -0.052263            0.329769
4                -0.755575           -1.018843


In [6]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder


categorical_columns = X.select_dtypes(include=['object']).columns


encoder = OneHotEncoder(drop='first') 
encoded = encoder.fit_transform(X[categorical_columns]).toarray()
encoded_columns = encoder.get_feature_names_out(categorical_columns)


X_encoded = pd.DataFrame(encoded, columns=encoded_columns)

In [7]:
from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the target variable
y_encoded = label_encoder.fit_transform(y)

# Now y_encoded contains 0 and 1 instead of 'No' and 'Yes'

In [8]:
# Reset index for concatenation
X = X.reset_index(drop=True)
X_encoded = X_encoded.reset_index(drop=True)

# Concatenate numerical and encoded categorical data
X_final = pd.concat([X[numerical_columns], X_encoded], axis=1)

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

X_train, X_test, y_train_encoded, y_test_encoded = train_test_split(X_final, y_encoded, test_size=0.2, random_state=42)

# Initialize and train the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train_encoded)

# Predictions
y_pred = model.predict(X_test)

# Since regression outputs continuous values, you might want to round or threshold these values if you need binary classification
y_pred_rounded = np.round(y_pred)

In [11]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

r2 = r2_score(y_test_encoded, y_pred_rounded)
print(f'R2: {r2}')

# Mean Squared Error
mse = mean_squared_error(y_test_encoded, y_pred_rounded)
print(f'MSE: {mse}')

# Root Mean Squared Error
rmse = np.sqrt(mse)
print(f'RMSE: {rmse}')

# Mean Absolute Error
mae = mean_absolute_error(y_test_encoded, y_pred_rounded)
print(f'MAE: {mae}')

R2: -0.06920327725142439
MSE: 0.1330049261083744
RMSE: 0.3646984043128985
MAE: 0.1330049261083744
