In [86]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder,StandardScaler,LabelEncoder
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import accuracy_score


In [87]:
df=pd.read_csv("Carseats.csv")

In [88]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Sales        400 non-null    float64
 1   CompPrice    400 non-null    int64  
 2   Income       400 non-null    int64  
 3   Advertising  400 non-null    int64  
 4   Population   400 non-null    int64  
 5   Price        400 non-null    object 
 6   ShelveLoc    399 non-null    object 
 7   Age          400 non-null    int64  
 8   Education    400 non-null    object 
 9   Urban        400 non-null    object 
 10  US           397 non-null    object 
dtypes: float64(1), int64(5), object(5)
memory usage: 34.5+ KB


In [89]:
df["US"].value_counts()

US
Yes    256
No     141
Name: count, dtype: int64

In [90]:
df["US"]=df["US"].map({"Yes":1,"No":0})

In [91]:
df["US"].value_counts()

US
1.0    256
0.0    141
Name: count, dtype: int64

In [92]:
missing_cols=df.columns[df.isnull().sum()>0]

for col in missing_cols:
    if df [col].dtype=="object":
     df[col].fillna(df[col].mode()[0], inplace=True)
    else:
        df[col].fillna(df[col].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)


In [93]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Sales        400 non-null    float64
 1   CompPrice    400 non-null    int64  
 2   Income       400 non-null    int64  
 3   Advertising  400 non-null    int64  
 4   Population   400 non-null    int64  
 5   Price        400 non-null    object 
 6   ShelveLoc    400 non-null    object 
 7   Age          400 non-null    int64  
 8   Education    400 non-null    object 
 9   Urban        400 non-null    object 
 10  US           400 non-null    float64
dtypes: float64(2), int64(5), object(4)
memory usage: 34.5+ KB


In [94]:
df["US"]=df["US"].astype(int)

In [95]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Sales        400 non-null    float64
 1   CompPrice    400 non-null    int64  
 2   Income       400 non-null    int64  
 3   Advertising  400 non-null    int64  
 4   Population   400 non-null    int64  
 5   Price        400 non-null    object 
 6   ShelveLoc    400 non-null    object 
 7   Age          400 non-null    int64  
 8   Education    400 non-null    object 
 9   Urban        400 non-null    object 
 10  US           400 non-null    int64  
dtypes: float64(1), int64(6), object(4)
memory usage: 34.5+ KB


In [96]:
categorical_col=df.select_dtypes(include=["object","category"]).columns

In [97]:
categorical_col

Index(['Price', 'ShelveLoc', 'Education', 'Urban'], dtype='object')

In [98]:
print("Unique values 'Price' before cleaning")
print(df["Price"].unique())

Unique values 'Price' before cleaning
['120' '83' '80' '97' '128' '72' '108' '124' '100' '94' '136' '86' '118'
 '144' '110' '131' '68' 'Medium' '109' '138' 'Bad' '82' '107' '102' '89'
 '137' '96' '126' '24' '134' '95' '135' '70' '98' '149' '129' '119' '154'
 '84' '117' '103' '114' '123' '133' '101' '104' '91' '115' '99' '150'
 '116' '92' '145' '90' '79' '139' '121' '112' '111' '125' '148' '132'
 '127' '106' '151' '87' '155' '49' '147' '77' '159' '69' '157' '160' '141'
 '191' '93' '55' '185' '122' '81' '140' '173' '146' '130' '64' '105' '163'
 '88' '156' '166' '63' '158' '113' '74' '54' '171' '152' '143' '164' '162'
 '53' '78']


In [99]:
df["Price"]=pd.to_numeric(df["Price"],errors="coerce")

In [100]:
if df ["Price"].isnull().sum():
    print(f"Found{df["Price"].isnull().sum()} non-numeric or missing values in 'Price'.")

    df["Price"].fillna(df["Price"].mean(),inplace=True)


print ("nAfter cleaning")
print (df["Price"].head())
print (df.info())

Found2 non-numeric or missing values in 'Price'.
nAfter cleaning
0    120.0
1     83.0
2     80.0
3     97.0
4    128.0
Name: Price, dtype: float64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Sales        400 non-null    float64
 1   CompPrice    400 non-null    int64  
 2   Income       400 non-null    int64  
 3   Advertising  400 non-null    int64  
 4   Population   400 non-null    int64  
 5   Price        400 non-null    float64
 6   ShelveLoc    400 non-null    object 
 7   Age          400 non-null    int64  
 8   Education    400 non-null    object 
 9   Urban        400 non-null    object 
 10  US           400 non-null    int64  
dtypes: float64(2), int64(6), object(3)
memory usage: 34.5+ KB
None


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Price"].fillna(df["Price"].mean(),inplace=True)


In [101]:
categorical_col=df.select_dtypes(include=["object","category"]).columns

In [102]:
categorical_col

Index(['ShelveLoc', 'Education', 'Urban'], dtype='object')

In [103]:
cardinality=df[categorical_col].nunique()

In [104]:
cardinality

ShelveLoc     5
Education    10
Urban         2
dtype: int64

In [105]:
df["Education"].value_counts()

Education
17     49
10     48
11     48
12     47
16     47
13     43
14     40
18     39
15     36
Yes     3
Name: count, dtype: int64

In [106]:
df["Education"]=pd.to_numeric(df["Education"],errors="coerce")

In [107]:
df.info("Education")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Sales        400 non-null    float64
 1   CompPrice    400 non-null    int64  
 2   Income       400 non-null    int64  
 3   Advertising  400 non-null    int64  
 4   Population   400 non-null    int64  
 5   Price        400 non-null    float64
 6   ShelveLoc    400 non-null    object 
 7   Age          400 non-null    int64  
 8   Education    397 non-null    float64
 9   Urban        400 non-null    object 
 10  US           400 non-null    int64  
dtypes: float64(3), int64(6), object(2)
memory usage: 34.5+ KB


In [108]:
df['Education'] = df['Education'].fillna(df['Education'].mean())


In [109]:
if df ["Education"].isnull().sum():
    print(f"Found{df["Education"].isnull().sum()} non-numeric or missing values in 'Education'.")

    df["Price"].fillna(df["Education"].mean(),inplace=True)


print ("nAfter cleaning")
print (df["Education"].head())
print (df.info())

nAfter cleaning
0    17.0
1    10.0
2    12.0
3    14.0
4    13.0
Name: Education, dtype: float64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Sales        400 non-null    float64
 1   CompPrice    400 non-null    int64  
 2   Income       400 non-null    int64  
 3   Advertising  400 non-null    int64  
 4   Population   400 non-null    int64  
 5   Price        400 non-null    float64
 6   ShelveLoc    400 non-null    object 
 7   Age          400 non-null    int64  
 8   Education    400 non-null    float64
 9   Urban        400 non-null    object 
 10  US           400 non-null    int64  
dtypes: float64(3), int64(6), object(2)
memory usage: 34.5+ KB
None


In [110]:
df["Education"].value_counts()

Education
17.000000    49
10.000000    48
11.000000    48
12.000000    47
16.000000    47
13.000000    43
14.000000    40
18.000000    39
15.000000    36
13.899244     3
Name: count, dtype: int64

In [111]:
label_encoder=LabelEncoder()
df["Education"]=label_encoder.fit_transform(df["Education"])

In [113]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Sales        400 non-null    float64
 1   CompPrice    400 non-null    int64  
 2   Income       400 non-null    int64  
 3   Advertising  400 non-null    int64  
 4   Population   400 non-null    int64  
 5   Price        400 non-null    float64
 6   ShelveLoc    400 non-null    object 
 7   Age          400 non-null    int64  
 8   Education    400 non-null    int64  
 9   Urban        400 non-null    object 
 10  US           400 non-null    int64  
dtypes: float64(2), int64(7), object(2)
memory usage: 34.5+ KB


# One Hot Encoder

In [114]:
categorical_col=df.select_dtypes(include=["object","category"]).columns

In [115]:
categorical_col

Index(['ShelveLoc', 'Urban'], dtype='object')

In [116]:
cardinality=df[categorical_col].nunique()

In [117]:
cardinality

ShelveLoc    5
Urban        2
dtype: int64

In [119]:
encoded_df=pd.get_dummies(df[categorical_col],drop_first=False)

In [120]:
encoded_df

Unnamed: 0,ShelveLoc_42,ShelveLoc_69,ShelveLoc_Bad,ShelveLoc_Good,ShelveLoc_Medium,Urban_No,Urban_Yes
0,False,False,True,False,False,False,True
1,False,False,False,True,False,False,True
2,False,False,False,False,True,False,True
3,False,False,False,False,True,False,True
4,False,False,True,False,False,False,True
...,...,...,...,...,...,...,...
395,False,False,False,True,False,False,True
396,False,False,False,False,True,True,False
397,False,False,False,False,True,False,True
398,False,False,True,False,False,False,True


In [121]:
df=df.drop(columns=categorical_col)

In [122]:
df=pd.concat([df,encoded_df],axis=1)

In [125]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Sales             400 non-null    int64
 1   CompPrice         400 non-null    int64
 2   Income            400 non-null    int64
 3   Advertising       400 non-null    int64
 4   Population        400 non-null    int64
 5   Price             400 non-null    int64
 6   Age               400 non-null    int64
 7   Education         400 non-null    int64
 8   US                400 non-null    int64
 9   ShelveLoc_42      400 non-null    int64
 10  ShelveLoc_69      400 non-null    int64
 11  ShelveLoc_Bad     400 non-null    int64
 12  ShelveLoc_Good    400 non-null    int64
 13  ShelveLoc_Medium  400 non-null    int64
 14  Urban_No          400 non-null    int64
 15  Urban_Yes         400 non-null    int64
dtypes: int64(16)
memory usage: 50.1 KB


In [124]:
df=df.astype(int)