## 1- Merge Data

In [3]:
import pandas as pd

In [187]:
df1 = pd.DataFrame({
    'customer_id': [1,2,3,4],
    'name': ['Ahmed','Ali','Omar','Aya'],
    'courses': ['AI','ML','AI','UI']
})

In [189]:
df2 = pd.DataFrame({
    'customer_id': [2,3,4,5],
    'purchase': ['Laptop','Tab','phone','monitor'],
    'courses': ['ML','AI','UI','web']
})

In [191]:
df1

Unnamed: 0,customer_id,name,courses
0,1,Ahmed,AI
1,2,Ali,ML
2,3,Omar,AI
3,4,Aya,UI


In [193]:
df2

Unnamed: 0,customer_id,purchase,courses
0,2,Laptop,ML
1,3,Tab,AI
2,4,phone,UI
3,5,monitor,web


In [195]:
#1- inner join
inner_join = pd.merge(df1,df2, how ='inner',on= ['customer_id','courses'])

In [197]:
inner_join

Unnamed: 0,customer_id,name,courses,purchase
0,2,Ali,ML,Laptop
1,3,Omar,AI,Tab
2,4,Aya,UI,phone


In [199]:
#2- Right join
right_join = pd.merge(df1,df2, how ='right',on= ['customer_id','courses'])

In [201]:
right_join

Unnamed: 0,customer_id,name,courses,purchase
0,2,Ali,ML,Laptop
1,3,Omar,AI,Tab
2,4,Aya,UI,phone
3,5,,web,monitor


In [203]:
#3- Left join
left_join = pd.merge(df1,df2, how ='left',on= ['customer_id','courses'])

In [205]:
left_join

Unnamed: 0,customer_id,name,courses,purchase
0,1,Ahmed,AI,
1,2,Ali,ML,Laptop
2,3,Omar,AI,Tab
3,4,Aya,UI,phone


In [209]:
#3-outer
outer_join = pd.merge(df1,df2, how ='outer',on= ['customer_id','courses'])

In [211]:
outer_join

Unnamed: 0,customer_id,name,courses,purchase
0,1,Ahmed,AI,
1,2,Ali,ML,Laptop
2,3,Omar,AI,Tab
3,4,Aya,UI,phone
4,5,,web,monitor


## 2- Handle Missing Values

In [260]:
data = pd.DataFrame({
    'Age': [25,30, None ,45,50, None , 65],
    'Blood_Press': [120,130,110,None ,140,145,None],
    'Cholestrol': [200,None,190,210,None,180,220]
})

In [262]:
data

Unnamed: 0,Age,Blood_Press,Cholestrol
0,25.0,120.0,200.0
1,30.0,130.0,
2,,110.0,190.0
3,45.0,,210.0
4,50.0,140.0,
5,,145.0,180.0
6,65.0,,220.0


In [223]:
data.isnull().sum()

Age            2
Blood_Press    2
Cholestrol     2
dtype: int64

### 1- Remove the Null Values

In [226]:
data_copy = data

In [228]:
data_copy

Unnamed: 0,Age,Blood_Press,Cholestrol
0,25.0,120.0,200.0
1,30.0,130.0,
2,,110.0,190.0
3,45.0,,210.0
4,50.0,140.0,
5,,145.0,180.0
6,65.0,,220.0


In [230]:
data_copy = data_copy.dropna() 

In [232]:
data_copy

Unnamed: 0,Age,Blood_Press,Cholestrol
0,25.0,120.0,200.0


In [234]:
data

Unnamed: 0,Age,Blood_Press,Cholestrol
0,25.0,120.0,200.0
1,30.0,130.0,
2,,110.0,190.0
3,45.0,,210.0
4,50.0,140.0,
5,,145.0,180.0
6,65.0,,220.0


In [236]:
df_drop_col = data.dropna(subset = ['Blood_Press'])

In [238]:
df_drop_col

Unnamed: 0,Age,Blood_Press,Cholestrol
0,25.0,120.0,200.0
1,30.0,130.0,
2,,110.0,190.0
4,50.0,140.0,
5,,145.0,180.0


### 2- Imputation

In [241]:
import warnings
warnings.filterwarnings('ignore')

In [250]:
data

Unnamed: 0,Age,Blood_Press,Cholestrol
0,25.0,120.0,200.0
1,30.0,130.0,
2,,110.0,190.0
3,45.0,,210.0
4,50.0,140.0,
5,,145.0,180.0
6,65.0,,220.0


In [254]:
data.Age.value_counts()

Age
25.0    1
30.0    1
45.0    1
50.0    1
65.0    1
Name: count, dtype: int64

In [264]:
data['Age'].fillna(data['Age'].mean(), inplace = True)

In [266]:
data

Unnamed: 0,Age,Blood_Press,Cholestrol
0,25.0,120.0,200.0
1,30.0,130.0,
2,43.0,110.0,190.0
3,45.0,,210.0
4,50.0,140.0,
5,43.0,145.0,180.0
6,65.0,,220.0


In [268]:
data['Blood_Press'].fillna(data['Blood_Press'].mean(), inplace = True)
data['Cholestrol'].fillna(180, inplace = True)

In [270]:
data

Unnamed: 0,Age,Blood_Press,Cholestrol
0,25.0,120.0,200.0
1,30.0,130.0,180.0
2,43.0,110.0,190.0
3,45.0,129.0,210.0
4,50.0,140.0,180.0
5,43.0,145.0,180.0
6,65.0,129.0,220.0


In [272]:
data.isnull().sum()

Age            0
Blood_Press    0
Cholestrol     0
dtype: int64

## Detect + Handle The Outliers

In [299]:
df = pd.DataFrame({
    'values' :[10,12,12,13,14,14,15,16,10000,200]
})

In [301]:
df

Unnamed: 0,values
0,10
1,12
2,12
3,13
4,14
5,14
6,15
7,16
8,10000
9,200


In [305]:
df['z_score']= (df['values']-df['values'].mean())/df['values'].std()

In [307]:
df

Unnamed: 0,values,z_score
0,10,-0.323787
1,12,-0.323152
2,12,-0.323152
3,13,-0.322835
4,14,-0.322518
5,14,-0.322518
6,15,-0.322201
7,16,-0.321883
8,10000,2.845556
9,200,-0.263509


In [285]:
import numpy as np

In [295]:
threshold = 0.9
outliers = df[np.abs(df['z_score'])>threshold ]

In [297]:
outliers

Unnamed: 0,values,z_score
8,100,0.953249
9,200,2.558046


## Encoding

### 1 - Ordinal Number Encoding

In [340]:
data = pd.DataFrame({
    'Temp' : ['Hot','cold','very Hot','cold','warm','Hot','warm']
    })

In [342]:
data

Unnamed: 0,Temp
0,Hot
1,cold
2,very Hot
3,cold
4,warm
5,Hot
6,warm


In [344]:
data.Temp.unique()

array(['Hot', 'cold', 'very Hot', 'warm'], dtype=object)

In [348]:
temp_values = {'cold':1,'warm' :2,'Hot':3,'very Hot':4}
temp_values

{'cold': 1, 'warm': 2, 'Hot': 3, 'very Hot': 4}

In [354]:
#mapping btw temp_values and temp col
data['temp_encoding']= data['Temp'].map(temp_values)

In [356]:
data

Unnamed: 0,Temp,temp_encoding
0,Hot,3
1,cold,1
2,very Hot,4
3,cold,1
4,warm,2
5,Hot,3
6,warm,2


In [360]:
data.drop(['Temp'],axis = 1, inplace = True)

In [362]:
data

Unnamed: 0,temp_encoding
0,3
1,1
2,4
3,1
4,2
5,3
6,2


In [364]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.4-py2.py3-none-any.whl.metadata (8.0 kB)
Downloading category_encoders-2.6.4-py2.py3-none-any.whl (82 kB)
   ---------------------------------------- 0.0/82.0 kB ? eta -:--:--
   ---- ----------------------------------- 10.2/82.0 kB ? eta -:--:--
   -------------- ------------------------- 30.7/82.0 kB 435.7 kB/s eta 0:00:01
   ----------------------------- ---------- 61.4/82.0 kB 544.7 kB/s eta 0:00:01
   ---------------------------------------- 82.0/82.0 kB 574.1 kB/s eta 0:00:00
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.4


In [366]:
import category_encoders as ce

In [368]:
data = pd.DataFrame({
    'Temp' : ['Hot','cold','very Hot','cold','warm','Hot','warm']
    })

In [377]:
ordinal_encoder = ce.OrdinalEncoder(cols= 'Temp',
                                   return_df = True,
                                   mapping =[{
                                       'col': 'Temp',
                                       'mapping':{
                                           'cold':1,
                                           'warm' :2,
                                           'Hot':3,
                                           'very Hot':4
                                       }}])

In [379]:
data.Temp = ordinal_encoder.fit_transform(data)

In [381]:
data

Unnamed: 0,Temp
0,3
1,1
2,4
3,1
4,2
5,3
6,2


In [385]:
from sklearn.preprocessing import OrdinalEncoder

In [389]:
data1 = pd.DataFrame({
    'Temp' : ['Hot','cold','very Hot','cold','warm','Hot','warm']
    })

In [387]:
#call
encoder = OrdinalEncoder()

In [391]:
data1['temp_encoding']= encoder.fit_transform(data1)

In [393]:
data1

Unnamed: 0,Temp,temp_encoding
0,Hot,0.0
1,cold,1.0
2,very Hot,2.0
3,cold,1.0
4,warm,3.0
5,Hot,0.0
6,warm,3.0


## Label Encoder

In [396]:
df = pd.read_csv(r"D:\Work\Route\HTI\s6\50_Startups.csv")

In [398]:
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [400]:
df.State.unique()

array(['New York', 'California', 'Florida'], dtype=object)

In [402]:
df.State.value_counts()

State
New York      17
California    17
Florida       16
Name: count, dtype: int64

In [404]:
from sklearn.preprocessing import LabelEncoder

In [406]:
#call
label_encoder = LabelEncoder()

In [408]:
df['State']= label_encoder.fit_transform(df['State'])

In [410]:
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,2,192261.83
1,162597.7,151377.59,443898.53,0,191792.06
2,153441.51,101145.55,407934.54,1,191050.39
3,144372.41,118671.85,383199.62,2,182901.99
4,142107.34,91391.77,366168.42,1,166187.94


In [412]:
df.State.unique()

array([2, 0, 1])

## One Hot Encoder

In [450]:
df = pd.read_csv(r"D:\Work\Route\HTI\s6\50_Startups.csv")

In [452]:
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [454]:
from sklearn.preprocessing import OneHotEncoder

In [456]:
#call
one_h_enc = OneHotEncoder()

In [448]:
encoding_df = pd.DataFrame(one_h_enc.fit_transform(df[['State']]).toarray())

In [460]:
encoding_df.head()

Unnamed: 0,0,1,2
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,0.0,0.0,1.0
4,0.0,1.0,0.0


In [462]:
df = df.join(encoding_df)

In [466]:
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit,0,1,2
0,165349.2,136897.8,471784.1,New York,192261.83,0.0,0.0,1.0
1,162597.7,151377.59,443898.53,California,191792.06,1.0,0.0,0.0
2,153441.51,101145.55,407934.54,Florida,191050.39,0.0,1.0,0.0
3,144372.41,118671.85,383199.62,New York,182901.99,0.0,0.0,1.0
4,142107.34,91391.77,366168.42,Florida,166187.94,0.0,1.0,0.0


In [468]:
df.drop(['State'],axis = 1 , inplace = True)

In [470]:
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,0,1,2
0,165349.2,136897.8,471784.1,192261.83,0.0,0.0,1.0
1,162597.7,151377.59,443898.53,191792.06,1.0,0.0,0.0
2,153441.51,101145.55,407934.54,191050.39,0.0,1.0,0.0
3,144372.41,118671.85,383199.62,182901.99,0.0,0.0,1.0
4,142107.34,91391.77,366168.42,166187.94,0.0,1.0,0.0


## Binary Encoder

In [481]:
import category_encoders as ce

In [483]:
data = pd.DataFrame({
    'Gender':['m','f','m','m','f'],
    'class': ['A','B','C','D','A'],
    'City': ['cairo','giza','cairo','giza','giza']
})

In [485]:
data

Unnamed: 0,Gender,class,City
0,m,A,cairo
1,f,B,giza
2,m,C,cairo
3,m,D,giza
4,f,A,giza


In [489]:
data['class'].unique()

array(['A', 'B', 'C', 'D'], dtype=object)

In [491]:
binary_enc = ce.BinaryEncoder(cols =['class'] )

In [495]:
new_df = binary_enc.fit_transform(data)

In [497]:
new_df

Unnamed: 0,Gender,class_0,class_1,class_2,City
0,m,0,0,1,cairo
1,f,0,1,0,giza
2,m,0,1,1,cairo
3,m,1,0,0,giza
4,f,0,0,1,giza


In [None]:
1 ---> 001
2 ---> 010
3 ---> 011
4 ---> 100

## Target Encoder

In [528]:
data = pd.DataFrame({
    'Names':['omer','ahmed','moaz','ahmed','Jony','ziada'],
    'Markes': [100,300,200,307,170,480]
})

In [530]:
data

Unnamed: 0,Names,Markes
0,omer,100
1,ahmed,300
2,moaz,200
3,ahmed,307
4,Jony,170
5,ziada,480


In [532]:
import category_encoders as ce

In [534]:
target_enc = ce.TargetEncoder(cols = 'Names')

In [536]:
data = target_enc.fit_transform(data['Names'],data['Markes'])

In [538]:
data

Unnamed: 0,Names
0,238.747698
1,265.741447
2,251.758546
3,265.741447
4,247.855292
5,288.188919


## Get_Dummies

In [553]:
data = pd.DataFrame({
    'Gender':['m','f','m','m','f'],
    'class': ['A','B','C','D','A'],
    'City': ['cairo','giza','cairo','giza','giza']
})

In [555]:
data

Unnamed: 0,Gender,class,City
0,m,A,cairo
1,f,B,giza
2,m,C,cairo
3,m,D,giza
4,f,A,giza


In [561]:
data = pd.get_dummies(data,columns = ['Gender'], drop_first = True)

In [563]:
data

Unnamed: 0,class,City,Gender_m
0,A,cairo,True
1,B,giza,False
2,C,cairo,True
3,D,giza,True
4,A,giza,False
