In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Load the customer dataset from the current working directory.
# NOTE(review): the explicit ISO-8859-1 (Latin-1) codec is presumably needed
# because the file is not valid UTF-8 — confirm against the CSV.
data = pd.read_csv("car_purchasing.csv", encoding="ISO-8859-1")


In [None]:
# Quick structural overview of the loaded frame.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare: wrapping it in print() emits an extra "None" line.
data.info()
print(data.describe())
print(data.head())
print(data.tail())

In [None]:
# Discard every row that contains at least one missing value.
data = data.dropna()

In [None]:
# Trim outliers with the 1.5 * IQR rule, one numeric column at a time.
# Each pass filters `data` before the next column's quartiles are computed,
# so later fences are based on the rows that survived the earlier passes.
numeric_cols = data.select_dtypes(include=[np.number]).columns
for column in numeric_cols:
    first_quartile, third_quartile = data[column].quantile([0.25, 0.75])
    spread = third_quartile - first_quartile
    # between() is inclusive on both ends, i.e. lower <= value <= upper.
    within_fence = data[column].between(
        first_quartile - 1.5 * spread,
        third_quartile + 1.5 * spread,
    )
    data = data[within_fence]

In [None]:
# Show the column labels of the cleaned frame; the cells below refer to
# columns (e.g. the target) by these exact names.
print(data.columns)


In [None]:
# Feature selection
# Target: the amount each customer spent on a car.
TARGET_COLUMN = 'car purchase amount'

# Free-text identifier columns. NOTE(review): presumably (near-)unique per
# customer — confirm against the data. One-hot encoding them adds one column
# per distinct value, none of which can match an unseen customer, so they are
# excluded from the feature set. errors='ignore' keeps the cell working if
# the file does not carry these columns.
IDENTIFIER_COLUMNS = ['customer name', 'customer e-mail']

X = (
    data
    .drop(columns=[TARGET_COLUMN])
    .drop(columns=IDENTIFIER_COLUMNS, errors='ignore')
)
y = data[TARGET_COLUMN]


In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Preprocessing: standardise numeric columns, one-hot encode text columns.
numeric_transformer = StandardScaler()
# handle_unknown='ignore': categories not seen during fit are encoded as all
# zeros at transform time instead of raising.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Column groups are picked by dtype from the full feature frame.
numeric_features = X.select_dtypes(include=[np.number]).columns
categorical_features = X.select_dtypes(include=["object"]).columns

column_steps = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

In [None]:
# End-to-end estimator: the preprocessing step feeds a 100-tree random forest
# with a fixed seed.
forest = RandomForestRegressor(n_estimators=100, random_state=42)
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', forest),
])

In [None]:
# Fit the whole pipeline on the training split, then predict the held-out rows.
# Pipeline.fit returns the fitted pipeline itself, so the calls can be chained.
y_pred = model.fit(X_train, y_train).predict(X_test)

In [None]:
# Regression metrics computed on the held-out test split.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# One metric per output line.
print(f"MAE: {mae}", f"MSE: {mse}", f"R2 Score: {r2}", sep="\n")


In [None]:
# Feature importance
# Take the output column names from the fitted ColumnTransformer itself rather
# than stitching them together by transformer position: its names are in the
# same order as the matrix handed to the regressor, and transformers that
# received no columns are skipped instead of raising.
fitted_preprocessor = model.named_steps['preprocessor']
feature_importances = model.named_steps['regressor'].feature_importances_

# The ColumnTransformer prefixes each name with "<transformer>__" by default;
# strip that prefix (only when it is actually present) so the labels stay the
# plain column / category names.
transformer_prefixes = tuple(
    f"{name}__" for name, _, _ in fitted_preprocessor.transformers_
)
feature_names = [
    name.split('__', 1)[1] if name.startswith(transformer_prefixes) else name
    for name in fitted_preprocessor.get_feature_names_out()
]

# Most important features first.
importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

In [None]:
# Plot feature importance
# One-hot encoding produces one feature per category value, so the complete
# table can hold far more rows than a 10x6 bar chart can show legibly. Only
# the highest-ranked features are drawn; the full ranking stays available in
# importance_df.
TOP_N_FEATURES = 20
top_importance_df = importance_df.nlargest(TOP_N_FEATURES, 'Importance')

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=top_importance_df, ax=ax)
ax.set_title(f"Feature Importance (top {len(top_importance_df)})")
plt.show()


In [None]:
# One hand-written customer record, turned into a single-row frame so it can
# be passed to the fitted pipeline.
sample_record = {
    'customer name': 'Quin Smith',
    'country': 'Nicaragua',
    'gender': 0,
    'age': 44,
    'annual Salary': 37336,
    'credit card debt': 10218,
    'net worth': 430907,
    'customer e-mail': 'nulla@ipsum.edu',
}
sample_data = pd.DataFrame([sample_record])


In [None]:
# Run the single-row frame through the fitted pipeline. predict returns an
# array with one entry per input row, so element 0 is this customer's estimate.
predicted_value = model.predict(sample_data)
print(f"Predicted Car Purchase Amount: ${predicted_value[0]:,.2f}")


In [None]:
# A second hand-written customer record, again as a single-row frame.
second_record = {
    'customer name': 'Zelena Buyers',
    'country': 'Angola',
    'gender': 1,
    'age': 48,
    'annual Salary': 64347.34531,
    'credit card debt': 10905.36628,
    'net worth': 307226.0977,
    'customer e-mail': 'auctor.non@sapien.co.uk',
}
sample_data_2 = pd.DataFrame([second_record])


In [None]:
# Score the second single-row frame with the same fitted pipeline. This
# rebinds predicted_value, replacing the result from the first sample.
predicted_value = model.predict(sample_data_2)
print(f"Predicted Car Purchase Amount: ${predicted_value[0]:,.2f}")