In [1]:
import pandas as pd
import numpy as np

# Seed NumPy's global RNG so the synthetic data below is repeatable.
np.random.seed(42)

n_rows = 1000

# 50 unique category labels: "category_1" ... "category_50"
categories = [f"category_{i}" for i in range(1, 51)]

# NOTE: the order of the np.random.* calls determines the generated values;
# keep it unchanged when editing this cell.
df = pd.DataFrame({
    "user_type": np.random.choice(categories, size=n_rows),   # categorical feature, 50 possible values
    "age": np.random.randint(18, 65, size=n_rows),            # upper bound is exclusive
    "salary": np.random.randint(30000, 150000, size=n_rows),
    "experience_years": np.random.randint(0, 40, size=n_rows),
    "score": np.round(np.random.uniform(0, 100, size=n_rows), 2),
    "target": np.random.choice([0, 1], size=n_rows, p=[0.7, 0.3])   # binary label, ~30 % positives
}).astype({"age": "float64", "salary": "float64"})
# ^ "age" and "salary" are generated as integers but receive NaN below.
#   Casting them to float first avoids relying on pandas silently upcasting
#   an integer column during a .loc assignment (deprecated behaviour).
#   The resulting frame is the same: both columns end up float64 with NaN.

# Inject some missing values (for realism): 8 % of the rows, drawn
# independently for each of the three columns.
for col in ["age", "salary", "score"]:
    missing_rows = df.sample(frac=0.08).index
    df.loc[missing_rows, col] = np.nan

df.head()


Unnamed: 0,user_type,age,salary,experience_years,score,target
0,category_39,50.0,121194.0,15,4.17,0
1,category_29,39.0,144306.0,7,39.76,0
2,category_15,,90315.0,0,76.23,0
3,category_43,23.0,109561.0,4,,0
4,category_8,23.0,38946.0,2,,0


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


In [44]:
# dataset = pd.read_csv('shopping_behavior_updated.csv')

In [45]:
# df =  pd.DataFrame(dataset)
# df.head()

In [3]:
# Column labels of `df`; iterated by the dtype-printing loop further down.
columns = df.columns

In [4]:
# Display the column labels captured from `df`.
columns

Index(['user_type', 'age', 'salary', 'experience_years', 'score', 'target'], dtype='object')

In [5]:
# Print the dtype of every column listed in `columns`, one per line.
for col in columns:
    dtype = df.dtypes[col]
    print(dtype)

object
float64
float64
int32
float64
int64


### Filtering the Columns

In [6]:
def filter_fun(x):
    """Return True when column `x` of `df` does not have object dtype."""
    return df[x].dtype != 'object'


# Keep only the columns that pass `filter_fun`, in their original order.
non_object_cols = [name for name in df.columns if filter_fun(name)]
df2 = df[non_object_cols]


In [7]:
# Display the filtered frame (the notebook renders a truncated head/tail view).
df2

Unnamed: 0,age,salary,experience_years,score,target
0,50.0,121194.0,15,4.17,0
1,39.0,144306.0,7,39.76,0
2,,90315.0,0,76.23,0
3,23.0,109561.0,4,,0
4,23.0,38946.0,2,,0
...,...,...,...,...,...
995,36.0,39496.0,11,74.94,0
996,39.0,86447.0,23,43.11,0
997,54.0,131604.0,28,8.75,0
998,49.0,59806.0,24,28.28,0


### Handling outliers

In [17]:
# NOTE(review): `hadling_outliers` (sic) is not defined anywhere in the visible
# part of this notebook, and this cell's execution count is out of sequence
# with its neighbours. Confirm the function is defined *above* this cell so
# that Restart Kernel -> Run All works.
# The returned frame is only displayed here; it is not bound to a name.
hadling_outliers(df2)

Unnamed: 0,age,salary,experience_years,score,target
0,50.0,121194.0,15,4.17,0
1,39.0,144306.0,7,39.76,0
5,21.0,124242.0,0,21.98,0
6,47.0,75500.0,3,13.15,0
7,28.0,134540.0,39,85.71,0
...,...,...,...,...,...
994,35.0,134555.0,16,31.80,0
995,36.0,39496.0,11,74.94,0
996,39.0,86447.0,23,43.11,0
997,54.0,131604.0,28,8.75,0


### Handling Null Values

In [8]:
# (rows, columns) of the full frame.
df.shape

(1000, 6)

In [9]:
# (rows, columns) of the filtered frame `df2`.
df2.shape

(1000, 5)

In [10]:
# Number of missing values in each column of `df2`.
df2.isnull().sum()

age                 80
salary              80
experience_years     0
score               80
target               0
dtype: int64

In [11]:
def check_amount_of_missing_values(col, data=None):
    """Return the percentage (0-100) of missing values in one column.

    Parameters
    ----------
    col : hashable
        Label of the column to inspect.
    data : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level ``df`` is used,
        which is what the original single-argument version always did.

    Returns
    -------
    float
        ``missing / total * 100``; ``0.0`` when the frame has no rows.
    """
    frame = df if data is None else data
    total_count = len(frame)
    if total_count == 0:
        # No rows means nothing can be missing; this also avoids a 0/0 division.
        return 0.0

    missing_count = frame[col].isnull().sum()
    return (missing_count / total_count) * 100


In [77]:
# Classify the share of missing 'salary' values into one of three bands.
# check_amount_of_missing_values returns a percentage, so convert it to a fraction.
res = check_amount_of_missing_values('salary') / 100

# Each band includes its lower edge, so exactly 5 % or exactly 30 % missing
# is no longer reported as "less than 5 %".
if 0.05 <= res < 0.3:
    print("Missing values are less than 30 %")
elif res >= 0.3:
    print("Missing values are 30 % or more")
else:
    print("Missing values are less than 5 %")


Missing values are less than 30 %


In [12]:
def handling_missing_values(df):
    """Print a missing-value verdict for every column of ``df``.

    For each column, the fraction of missing entries is computed from the
    frame that was passed in, and one line is printed:

    * more than 30 % missing     -> "The Dataset is Unreliable"
    * more than 0 %, up to 30 %  -> "Imputation is better"
    * nothing missing            -> "The feature ... is perfect"

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected. It is not modified.

    Returns
    -------
    None
        The function only prints a report; it performs no imputation itself.
    """
    for col in df.columns:
        # Mean of the boolean null mask == fraction of missing rows (0.0-1.0).
        # Computed on the argument itself, not on a notebook-level frame, so
        # the report always describes the frame the caller passed.
        res = df[col].isnull().mean()

        # UNRELIABLE
        if res > 0.3:
            print(col,": The Dataset is Unreliable,missing percentage: ",res)
        # IMPUTATION
        # Any non-zero share up to 30 % is a candidate for imputation. One
        # open-ended test leaves no gaps at exactly 5 % or 30 %, which would
        # otherwise fall through to the "perfect" branch.
        elif res > 0:
            print(col,": Imputation is better,missing percentage: ",res)
        else:
            print("The feature ",col," is perfect")
        


In [13]:
# Print the missing-value report for every column of `df2`.
handling_missing_values(df2)

age : Imputation is better,missing percentage:  0.08
salary : Imputation is better,missing percentage:  0.08
The feature  experience_years  is perfect
score : Imputation is better,missing percentage:  0.08
The feature  target  is perfect


In [91]:
# Re-check the per-column missing counts of `df2`; handling_missing_values
# only prints a report, it does not fill or drop anything.
df2.isnull().sum()

age                 80
salary              80
experience_years     0
score               80
target               0
dtype: int64

Technique to handle missing values