In [1]:
import pandas as pd
import numpy as np

Handling Missing Values

In [40]:
df = pd.DataFrame({
    'Age' : [22,None,35,40,None,22],
    'Salary': [3000,3500,None,4000,42000,3000],
    'Gender': ['M','F',None,'M','F','M']
})

In [4]:
df


Unnamed: 0,Age,Salary,Gender
0,22.0,3000.0,M
1,,3500.0,F
2,35.0,,
3,40.0,4000.0,M
4,,42000.0,F
5,22.0,3000.0,M


In [41]:
dup = df.copy()

In [7]:
dup

Unnamed: 0,Age,Salary,Gender
0,22.0,3000.0,M
1,,3500.0,F
2,35.0,,
3,40.0,4000.0,M
4,,42000.0,F
5,22.0,3000.0,M


In [9]:
df.isnull().sum()

Age       2
Salary    1
Gender    1
dtype: int64

In [63]:
df = df.dropna()

In [42]:
df['Age'] = df['Age'].fillna(df['Age'].mean())

In [43]:
df['Salary'] = df['Salary'].fillna(df['Salary'].median())

In [44]:
df

Unnamed: 0,Age,Salary,Gender
0,22.0,3000.0,M
1,29.75,3500.0,F
2,35.0,3500.0,
3,40.0,4000.0,M
4,29.75,42000.0,F
5,22.0,3000.0,M


In [45]:
df['Gender']=df['Gender'].fillna(df['Gender'].mode()[0])

In [20]:
np.mean(df['Salary'])

11100.0

In [None]:
np.median(df['Salary'])

nan

In [None]:
df['Gender']df['Gender'].isnull()

0    False
1    False
2     True
3    False
4    False
5    False
Name: Gender, dtype: bool

Handling Duplicates

In [65]:
df.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop_duplicates(inplace=True)


In [66]:
df

Unnamed: 0,Age,Salary,Gender
0,22.0,3000.0,M
1,29.75,3500.0,F
3,40.0,4000.0,M
4,29.75,42000.0,F


Handling Outliers

In [50]:
Q1 = df['Salary'].quantile(0.25)
Q3 = df['Salary'].quantile(0.75)

IQR = Q3-Q1


In [51]:
Q1

3500.0

In [52]:
Q3

4000.0

In [53]:
IQR

500.0

In [56]:
df = df[(df['Salary'] >= Q1-(1.5*IQR)) & (df['Salary']<=Q3+(1.5*IQR))]

Normalizing/Scaling  or Feature Scaling

In [67]:
df

Unnamed: 0,Age,Salary,Gender
0,22.0,3000.0,M
1,29.75,3500.0,F
3,40.0,4000.0,M
4,29.75,42000.0,F


In [None]:
mi_sal = 3000
max_sal = 42000

val-min_val/(max_val-min_val)



In [None]:
(3500-3000)/(42000.0-3000.0) ##0.01282051282051282

0.01282051282051282

In [74]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

df[['Age','Salary']] = scaler.fit_transform(df[['Age','Salary']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[['Age','Salary']] = scaler.fit_transform(df[['Age','Salary']])


In [75]:
df

Unnamed: 0,Age,Salary,Gender
0,0.0,0.0,M
1,0.430556,0.012821,F
3,1.0,0.025641,M
4,0.430556,1.0,F


In [None]:
!pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [76]:
pd.get_dummies(df,columns=['Gender'])

Unnamed: 0,Age,Salary,Gender_F,Gender_M
0,0.0,0.0,False,True
1,0.430556,0.012821,True,False
3,1.0,0.025641,False,True
4,0.430556,1.0,True,False


Feature Selection

In [34]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(drop='first',sparse_output=False)
encoded = ohe.fit_transform(df[['Gender']])

In [38]:
encoded

array([[1.],
       [0.],
       [1.],
       [1.],
       [0.],
       [1.]])

In [37]:
encoded_df = pd.DataFrame(encoded,columns=ohe.get_feature_names_out(['Gender']))
encoded_df

Unnamed: 0,Gender_M
0,1.0
1,0.0
2,1.0
3,1.0
4,0.0
5,1.0


In [46]:
df = pd.concat([df.drop('Gender',axis=1),encoded_df],axis=1)
df

Unnamed: 0,Age,Salary,Gender_M
0,22.0,3000.0,1.0
1,29.75,3500.0,0.0
2,35.0,3500.0,1.0
3,40.0,4000.0,1.0
4,29.75,42000.0,0.0
5,22.0,3000.0,1.0


In [47]:
from sklearn.feature_selection import SelectKBest, f_regression

X = df.drop('Salary',axis=1)
y = df['Salary']



In [54]:
selected = SelectKBest(f_regression,k=3)

In [55]:
selected.fit(X,y)



In [56]:
X[X.columns[selected.get_support()]]

Unnamed: 0,Age,Gender_M
0,22.0,1.0
1,29.75,0.0
2,35.0,1.0
3,40.0,1.0
4,29.75,0.0
5,22.0,1.0


Handling imbalanced data - SMOTE - Synthetic Minority Oversampling Technique

In [57]:
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification

In [76]:
X, y = make_classification(n_samples=500,n_features=4,weights=[0.98,0.01],random_state=42)

In [77]:
print("Before SMOTE:",sum(y==0),sum(y==1))

Before SMOTE: 491 9


In [67]:
smote = SMOTE(random_state=42,k_neighbors=3)
x_res, y_res = smote.fit_resample(X,y)

In [68]:
print("After SMOTE",sum(y_res==0),sum(y_res==1))

After SMOTE 448 448


In [69]:
x_res,y_res

(array([[-1.707003  ,  0.01943156, -1.9898895 , -1.78947918],
        [-1.42668281,  0.00631852, -1.59144132, -1.52804708],
        [-0.45989355, -0.017192  , -0.37410221, -0.55542162],
        ...,
        [-0.9408384 , -0.48004057,  2.44824225, -2.59041272],
        [-0.86250144, -0.34653596,  1.56873251, -2.06898939],
        [-0.73609008, -0.31098776,  1.44891132, -1.81557023]]),
 array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0

In [65]:
df.to_csv("Processed.csv",index=False)

In [78]:
!pip install aws

Collecting aws
  Downloading aws-0.2.5.tar.gz (5.6 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting boto (from aws)
  Downloading boto-2.49.0-py2.py3-none-any.whl.metadata (7.3 kB)
Collecting fabric>=1.6 (from aws)
  Downloading fabric-3.2.2-py3-none-any.whl.metadata (3.5 kB)
Collecting prettytable>=0.7 (from aws)
  Downloading prettytable-3.16.0-py3-none-any.whl.metadata (33 kB)
Collecting invoke>=2.0 (from fabric>=1.6->aws)
  Downloading invoke-2.2.1-py3-none-any.whl.metadata (3.3 kB)
Collecting deprecated>=1.2 (from fabric>=1.6->aws)
  Downloading deprecated-1.3.1-py2.py3-none-any.whl.metadata (5.9 kB)
Downloading fabric-3.2.2-py3-none-any.whl (59 kB)
   ---------------------------------------- 0.0/59.4 kB ? eta -:--:--
   ---------------------------------------- 59.4/59.4 kB 1.6 MB/s eta 0:00:00
Downloading prettytable-3.16.0-py3-none-any.whl (33 kB)
Downloading boto-2.49.0-py2.py3-none-any.whl (1.4 MB)
   --------