In [83]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score

In [84]:
df=pd.read_csv("Carseats.csv")

In [85]:
df.isnull().sum()

Sales          0
CompPrice      0
Income         0
Advertising    0
Population     0
Price          0
ShelveLoc      1
Age            0
Education      0
Urban          0
US             3
dtype: int64

In [86]:
import plotly.express as px
fig = px.histogram(df, x='Population', y='US', title='Population vs US')
fig.show()

In [87]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Sales        400 non-null    float64
 1   CompPrice    400 non-null    int64  
 2   Income       400 non-null    int64  
 3   Advertising  400 non-null    int64  
 4   Population   400 non-null    int64  
 5   Price        400 non-null    object 
 6   ShelveLoc    399 non-null    object 
 7   Age          400 non-null    int64  
 8   Education    400 non-null    object 
 9   Urban        400 non-null    object 
 10  US           397 non-null    object 
dtypes: float64(1), int64(5), object(5)
memory usage: 34.5+ KB


In [88]:
missing_columns = df.columns[df.isnull().sum() > 0]

for col in missing_columns:
    if df[col].dtype == 'object': 
        df[col].fillna(df[col].mode()[0], inplace=True)
    else:  
        df[col].fillna(df[col].mean(), inplace=True)


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





In [89]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df["US"] = le.fit_transform(df["US"])
print(df)

     Sales  CompPrice  Income  Advertising  ...  Age Education Urban  US
0     9.50        138      73           11  ...   42        17   Yes   1
1    11.22        111      48           16  ...   65        10   Yes   1
2    10.06        113      35           10  ...   59        12   Yes   1
3     7.40        117     100            4  ...   55        14   Yes   1
4     4.15        141      64            3  ...   38        13   Yes   0
..     ...        ...     ...          ...  ...  ...       ...   ...  ..
395  12.57        138     108           17  ...   33        14   Yes   1
396   6.14        139      23            3  ...   55        11    No   1
397   7.41        162      26           12  ...   40        18   Yes   1
398   5.94        100      79            7  ...   50        12   Yes   1
399   9.71        134      37            0  ...   49        16   Yes   1

[400 rows x 11 columns]


In [90]:
df.isnull().sum()

Sales          0
CompPrice      0
Income         0
Advertising    0
Population     0
Price          0
ShelveLoc      0
Age            0
Education      0
Urban          0
US             0
dtype: int64

In [91]:
df.drop("ShelveLoc", axis=1,inplace=True)
df.drop("Education", axis=1,inplace=True)
df.drop("Urban", axis=1,inplace=True)
df.drop("Price", axis=1,inplace=True)

In [92]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Sales        400 non-null    float64
 1   CompPrice    400 non-null    int64  
 2   Income       400 non-null    int64  
 3   Advertising  400 non-null    int64  
 4   Population   400 non-null    int64  
 5   Age          400 non-null    int64  
 6   US           400 non-null    int64  
dtypes: float64(1), int64(6)
memory usage: 22.0 KB


In [93]:
import klib

In [94]:
adjusted_df = klib.convert_datatypes(df)

In [95]:
adjusted_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Sales        400 non-null    float32
 1   CompPrice    400 non-null    int16  
 2   Income       400 non-null    int8   
 3   Advertising  400 non-null    int8   
 4   Population   400 non-null    int16  
 5   Age          400 non-null    int8   
 6   US           400 non-null    int8   
dtypes: float32(1), int16(2), int8(4)
memory usage: 4.8 KB


In [96]:

df_cleaned = klib.data_cleaning(df, cat_threshold=0.8, drop_threshold_cols=0.4)

Shape of cleaned data: (400, 7) - Remaining NAs: 0


Dropped rows: 0
     of which 0 duplicates. (Rows (first 150 shown): [])

Dropped columns: 0
     of which 0 single valued.     Columns: []
Dropped missing values: 0
Reduced memory by at least: 0.02 MB (-100.0%)



In [97]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   sales        400 non-null    float32
 1   comp_price   400 non-null    int16  
 2   income       400 non-null    int8   
 3   advertising  400 non-null    int8   
 4   population   400 non-null    int16  
 5   age          400 non-null    int8   
 6   us           400 non-null    int8   
dtypes: float32(1), int16(2), int8(4)
memory usage: 4.8 KB


In [98]:
X = df.drop(columns=["US"])  
y = df["US"] 

In [99]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [100]:
# Base models
base_models = [
    ('decision_tree', DecisionTreeClassifier(random_state=42)),
    ('random_forest', RandomForestClassifier(random_state=42, n_estimators=100)),
    ('lg',LogisticRegression(max_iter=1000) )
]

In [101]:
# 1. Stacking
stacking_model = StackingClassifier(estimators=base_models, final_estimator=KNeighborsClassifier(), cv=5)
stacking_model.fit(X_train, y_train)
stacking_preds = stacking_model.predict(X_test)

In [102]:
# Stacking accuracy
stacking_accuracy = accuracy_score(y_test, stacking_preds)
print(f"Stacking Accuracy: {stacking_accuracy:.2f}")

Stacking Accuracy: 0.89


In [103]:
# 2. Blending (Manual Implementation)

X_train_blend, X_val, y_train_blend, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [104]:
#Train base models
for name, model in base_models:
    model.fit(X_train_blend, y_train_blend)

In [105]:
val_preds = np.column_stack([model.predict(X_val) for _, model in base_models])

In [106]:
meta_model = LogisticRegression()
meta_model.fit(val_preds, y_val)

In [107]:
test_preds = np.column_stack([model.predict(X_test) for _, model in base_models])
blending_preds = meta_model.predict(test_preds)

In [108]:
#Blending accuracy
blending_accuracy = accuracy_score(y_test, blending_preds)
print(f"Blending Accuracy: {blending_accuracy:.2f}")

Blending Accuracy: 0.91


In [109]:
print("\nBase Model Accuracies:")
for name, model in base_models:
    model.fit(X_train, y_train)
    base_preds = model.predict(X_test)
    print(f"{name.capitalize()} Accuracy: {accuracy_score(y_test, base_preds):.2f}")


Base Model Accuracies:
Decision_tree Accuracy: 0.82
Random_forest Accuracy: 0.91
Lg Accuracy: 0.88
