In [1]:
import pandas as pd
import numpy as np
from scipy import stats
data = {
    "ID": [1,2,3,4,5,6,7,8,9,10],
    "Age": [25,37,np.nan,29,42,55,23,29,np.nan,45],
    "Gender": ["Male","Female","Male","Female","Male","Female","Male","Female","Male","Female"],
    "Income": [50000,62000,45000,54000,np.nan,100000,52000,70000,200000,85000],
    "Occupation": ["Engineer","Data Scientist",np.nan,"Product Manager","Sales",
                   "Executive",np.nan,"Product","Manager",np.nan],
    "Years_Employed": [2,5,3,np.nan,np.nan,30,1,6,15,np.nan],
    "Satisfaction_Level": [0.8,np.nan,0.6,0.7,0.5,0.9,0.4,0.7,0.9,np.nan],
    "Purchase_History": ["High","Medium","Low","High","Low","High","Medium","High","Low","High"]
}

df = pd.DataFrame(data)

print("Initial Dataset:\n", df, "\n")
print("Missing Values per Column:\n", df.isnull().sum(), "\n")

df['Occupation'].fillna(df['Occupation'].mode()[0], inplace=True)

df['Satisfaction_Level'].fillna(df['Satisfaction_Level'].mean(), inplace=True)

df['Income'].fillna(df['Income'].median(), inplace=True)

df['Years_Employed'].fillna(df['Years_Employed'].median(), inplace=True)

df['Age'].fillna(df['Age'].mean(), inplace=True)

print("After Handling Missing Values:\n", df, "\n")

def satisfaction_label(x):
    return "High" if x > 0.7 else "Low"

df["Satisfaction_Category"] = df["Satisfaction_Level"].apply(satisfaction_label)


df["Purchase_History_Numeric"] = df["Purchase_History"].map({"High": 2, "Medium": 1, "Low": 0})


df["Zscore_Income"] = np.abs(stats.zscore(df["Income"]))
outliers_z = df[df["Zscore_Income"] > 3]
print("Outliers Detected using Z-Score:\n", outliers_z[["ID","Income","Zscore_Income"]], "\n")

Q1 = df["Income"].quantile(0.25)
Q3 = df["Income"].quantile(0.75)
IQR = Q3 - Q1
outliers_iqr = df[(df["Income"] < Q1 - 1.5*IQR) | (df["Income"] > Q3 + 1.5*IQR)]
print("Outliers Detected using IQR:\n", outliers_iqr[["ID","Income"]], "\n")

print("Final Cleaned and Transformed Dataset:\n", df, "\n")




Initial Dataset:
    ID   Age  Gender    Income       Occupation  Years_Employed  \
0   1  25.0    Male   50000.0         Engineer             2.0   
1   2  37.0  Female   62000.0   Data Scientist             5.0   
2   3   NaN    Male   45000.0              NaN             3.0   
3   4  29.0  Female   54000.0  Product Manager             NaN   
4   5  42.0    Male       NaN            Sales             NaN   
5   6  55.0  Female  100000.0        Executive            30.0   
6   7  23.0    Male   52000.0              NaN             1.0   
7   8  29.0  Female   70000.0          Product             6.0   
8   9   NaN    Male  200000.0          Manager            15.0   
9  10  45.0  Female   85000.0              NaN             NaN   

   Satisfaction_Level Purchase_History  
0                 0.8             High  
1                 NaN           Medium  
2                 0.6              Low  
3                 0.7             High  
4                 0.5              Low  
5        

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Occupation'].fillna(df['Occupation'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Satisfaction_Level'].fillna(df['Satisfaction_Level'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the