In [1]:
# Core data libraries: pandas supplies the DataFrame used throughout,
# numpy supplies np.nan for the missing entries and the target array.
import pandas as pd
import numpy as np
# scikit-learn helpers used by the later cells: train/test splitting,
# integer encoding of the City column, and standardization of Age/Salary.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Visible confirmation that every import above succeeded.
print("✅ Libraries imported!")

✅ Libraries imported!


In [2]:
# Toy customer table, written one row per person for readability and then
# pivoted into the column-oriented dict that pandas consumes.
# Age, City and Salary each contain exactly one np.nan so that the
# cleaning steps have something to fix.
_columns = ['Name', 'Age', 'City', 'Salary']
_rows = [
    ('Amit',   25,     'Delhi',   50000),
    ('Neha',   30,     'Mumbai',  60000),
    ('Raj',    np.nan, 'Delhi',   52000),
    ('Simran', 22,     'Pune',    np.nan),
    ('Asha',   28,     np.nan,    58000),
    ('Rohit',  35,     'Chennai', 62000),
]
# zip(*_rows) transposes the rows into per-column tuples; key order follows _columns.
data = {name: list(values) for name, values in zip(_columns, zip(*_rows))}

df = pd.DataFrame(data)
# Heading and frame are emitted on separate lines, exactly as two print calls would.
print("Original Data:", df, sep="\n")

Original Data:
     Name   Age     City   Salary
0    Amit  25.0    Delhi  50000.0
1    Neha  30.0   Mumbai  60000.0
2     Raj   NaN    Delhi  52000.0
3  Simran  22.0     Pune      NaN
4    Asha  28.0      NaN  58000.0
5   Rohit  35.0  Chennai  62000.0


In [6]:
# Numeric gaps: replace each NaN with the mean of its own column
# (Series.mean() skips NaN by default, so the gap does not skew the value).
for numeric_col in ('Age', 'Salary'):
    column_mean = df[numeric_col].mean()
    df[numeric_col] = df[numeric_col].fillna(column_mean)

# Categorical gap: replace NaN with the most frequent city.
# mode() returns a Series; its first entry is the value used.
most_common_city = df['City'].mode().iloc[0]
df['City'] = df['City'].fillna(most_common_city)

print("✅ After filling missing values:", df, sep="\n")

✅ After filling missing values:
     Name   Age     City   Salary
0    Amit  25.0    Delhi  50000.0
1    Neha  30.0   Mumbai  60000.0
2     Raj  28.0    Delhi  52000.0
3  Simran  22.0     Pune  56400.0
4    Asha  28.0    Delhi  58000.0
5   Rohit  35.0  Chennai  62000.0


In [7]:
# Replace each city name with an integer code, overwriting the text column.
# The fitted encoder stays available as `le` so codes can be mapped back.
# NOTE(review): scikit-learn documents LabelEncoder for target labels; used on
# a feature, the codes imply an ordering between cities — confirm that is
# acceptable for whatever model consumes this column.
le = LabelEncoder()
le.fit(df['City'])
city_codes = le.transform(df['City'])
df['City'] = city_codes

print("✅ After encoding City column:", df, sep="\n")

✅ After encoding City column:
     Name   Age  City   Salary
0    Amit  25.0     1  50000.0
1    Neha  30.0     2  60000.0
2     Raj  28.0     1  52000.0
3  Simran  22.0     3  56400.0
4    Asha  28.0     1  58000.0
5   Rohit  35.0     0  62000.0


In [8]:
# Standardize the numeric columns in place; the fitted scaler stays
# available as `scaler`.
# NOTE(review): the scaler is fitted on every row, before the train/test
# split performed later in the notebook, so test rows influence the
# statistics — confirm this is acceptable for the demo.
numeric_columns = ['Age', 'Salary']
scaler = StandardScaler()
scaler.fit(df[numeric_columns])
df[numeric_columns] = scaler.transform(df[numeric_columns])

print("✅ After scaling numeric data:", df, sep="\n")

✅ After scaling numeric data:
     Name       Age  City    Salary
0    Amit -0.742307     1 -1.514113
1    Neha  0.494872     2  0.851688
2     Raj  0.000000     1 -1.040952
3  Simran -1.484615     3  0.000000
4    Asha  0.000000     1  0.378528
5   Rohit  1.732051     0  1.324849


In [9]:
# Feature matrix: every prepared column except the Name identifier.
feature_columns = ['Age', 'City', 'Salary']
X = df[feature_columns]

# Dummy target variable, one label per row (e.g., will buy or not buy).
y = np.array([1, 0, 1, 0, 1, 0])

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# shuffle, and therefore the split, reproducible across runs.
split_options = {'test_size': 0.3, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("✅ Training features:\n", X_train)
print("\n✅ Testing features:\n", X_test)


✅ Training features:
         Age  City    Salary
5  1.732051     0  1.324849
2  0.000000     1 -1.040952
4  0.000000     1  0.378528
3 -1.484615     3  0.000000

✅ Testing features:
         Age  City    Salary
0 -0.742307     1 -1.514113
1  0.494872     2  0.851688
