In [7]:
import pandas as pd

data = {
    'Name': ['Pavan', 'Kapil', 'Lalit', 'Ishan', 'Om'],
    'Age': [25, None, 44, 23, None],
    'Salary': [50000, 60000, 70000, None, None]
}

df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)

print(df.isnull().sum())
print(df.isnull().mean() * 100)

df_drop = df.dropna()
print(df_drop)

df['Age'].fillna(df['Age'].mean(), inplace=True)
df['Salary'].fillna(df['Salary'].mean(), inplace=True)
print(df)

Original DataFrame:
    Name   Age   Salary
0  Pavan  25.0  50000.0
1  Kapil   NaN  60000.0
2  Lalit  44.0  70000.0
3  Ishan  23.0      NaN
4     Om   NaN      NaN
Name      0
Age       2
Salary    2
dtype: int64
Name       0.0
Age       40.0
Salary    40.0
dtype: float64
    Name   Age   Salary
0  Pavan  25.0  50000.0
2  Lalit  44.0  70000.0
    Name        Age   Salary
0  Pavan  25.000000  50000.0
1  Kapil  30.666667  60000.0
2  Lalit  44.000000  70000.0
3  Ishan  23.000000  60000.0
4     Om  30.666667  60000.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Salary'].fillna(df['Salary'].mean(), inplace=True)


In [None]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

df = pd.read_csv('data/sample_data.csv')

df_label = df.copy()

le = LabelEncoder()
df_label['Gender_Encoded'] = le.fit_transform(df_label['Gender'])
df_label['Passed_Encoded'] = le.fit_transform(df_label['Passed'])

# print('\nLabel Encoded DataFrame:')
# print(df_label[['Name', 'Gender', 'Gender_Encoded', 'Passed', 'Passed_Encoded']])


df_encoded = pd.get_dummies(df_label, columns=['City'])
print('\nOne Encoded DataFrame:')
print(df_encoded.head())


One Encoded DataFrame:
     Name  Gender Passed  Gender_Encoded  Passed_Encoded  City_ Mumbai  \
0    Aman    Male    Yes               1               1         False   
1   Priya  Female    Yes               0               1         False   
2   Rahul    Male     No               1               0         False   
3  Anjali  Female    Yes               0               1          True   
4    Ravi    Male    Yes               1               1         False   

   City_Bangalore  City_Chennai  City_Delhi  City_Mumbai  
0           False         False        True        False  
1           False         False       False         True  
2            True         False       False        False  
3           False         False       False        False  
4           False         False        True        False  


In [19]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

data = {
    'StudyHours': [1, 2, 3, 4, 5],
    'TestScore': [40, 50, 60, 70, 80],
}

df = pd.DataFrame(data)

#standardscalar
standard_scaler = StandardScaler()
standard_scaled = standard_scaler.fit_transform(df)

print("Standard Scaled Data:")
print(pd.DataFrame(standard_scaled, columns = ['StudyHours', 'TestScore']))

#minmaxscalar
minmax_scaler = MinMaxScaler()
minmax_scaled = minmax_scaler.fit_transform(df)

print("\nMin-Max Scaled Data:")
print(pd.DataFrame(minmax_scaled, columns = ['StudyHours', 'TestScore']))


#train-test split
X = df[['StudyHours']]
y = df[['TestScore']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Training Data")
print(X_train)

print("Test Data")
print(X_test)

print("Training Data")
print(y_train)

print("Test Data")
print(y_test)



Standard Scaled Data:
   StudyHours  TestScore
0   -1.414214  -1.414214
1   -0.707107  -0.707107
2    0.000000   0.000000
3    0.707107   0.707107
4    1.414214   1.414214

Min-Max Scaled Data:
   StudyHours  TestScore
0        0.00       0.00
1        0.25       0.25
2        0.50       0.50
3        0.75       0.75
4        1.00       1.00
Training Data
   StudyHours
4           5
2           3
0           1
3           4
Test Data
   StudyHours
1           2
Training Data
   TestScore
4         80
2         60
0         40
3         70
Test Data
   TestScore
1         50
