In [2]:
# Import the libraries used by this notebook.
import numpy as np 
import pandas as pd
# NOTE(review): this binds `plt` to the top-level `matplotlib` package, not to
# `matplotlib.pyplot`, so pyplot-style calls such as `plt.plot(...)` would fail.
# Confirm whether `import matplotlib.pyplot as plt` was intended.
import matplotlib as plt 

In [3]:
# Load the raw dataset. `pd.read_csv` already returns a DataFrame, so wrapping
# it in `pd.DataFrame(...)` is redundant. `df` is an explicit copy so that the
# later cleaning cells never modify the raw `data` frame.
data = pd.read_csv('data.csv')
df = data.copy()
df

Unnamed: 0,Country,Age,Salary,Purchased
0,India,34.0,92000.0,Yes
1,Sri lanka,22.0,25000.0,Yes
2,China,31.0,74000.0,Yes
3,Sri lanka,29.0,,No
4,China,55.0,98000.0,Yes
5,India,24.0,30000.0,No
6,Sri lanka,28.0,40000.0,No
7,India,,60000.0,No
8,China,51.0,89000.0,Yes
9,India,44.0,78000.0,Yes


1. Imputation: This code replaces the missing 'Salary' and 'Age' values with the mean of the available values in the same column.

In [4]:
# Mean imputation: fill the missing 'Salary' and 'Age' entries with the mean of
# the observed values in the same column (`Series.mean` skips NaN by default).
mean_salary = df['Salary'].mean()
mean_age = df['Age'].mean()
# Assign the filled columns back instead of calling
# `df[col].fillna(..., inplace=True)`: that form operates on an intermediate
# Series (chained assignment), emits a FutureWarning in pandas 2.x, and leaves
# `df` unchanged when Copy-on-Write is enabled.
df['Salary'] = df['Salary'].fillna(mean_salary)
df['Age'] = df['Age'].fillna(mean_age)
df

Unnamed: 0,Country,Age,Salary,Purchased
0,India,34.0,92000.0,Yes
1,Sri lanka,22.0,25000.0,Yes
2,China,31.0,74000.0,Yes
3,Sri lanka,29.0,54857.142857,No
4,China,55.0,98000.0,Yes
5,India,24.0,30000.0,No
6,Sri lanka,28.0,40000.0,No
7,India,33.714286,60000.0,No
8,China,51.0,89000.0,Yes
9,India,44.0,78000.0,Yes


In [5]:
# Keep two decimals for 'Salary' and store 'Age' as whole numbers.
# `astype(int)` discards the fractional part, so an imputed mean age is
# truncated rather than rounded to the nearest integer.
df = df.round({'Salary': 2}).astype({'Age': int})
df

Unnamed: 0,Country,Age,Salary,Purchased
0,India,34,92000.0,Yes
1,Sri lanka,22,25000.0,Yes
2,China,31,74000.0,Yes
3,Sri lanka,29,54857.14,No
4,China,55,98000.0,Yes
5,India,24,30000.0,No
6,Sri lanka,28,40000.0,No
7,India,33,60000.0,No
8,China,51,89000.0,Yes
9,India,44,78000.0,Yes


2. Anomaly Detection
We first append one extreme record to df and take a copy of the result, df1. We then calculate the z-scores for the 'Age' and 'Salary' columns using the formula: (x - mean) / standard_deviation, where x is the value of the data point, mean is the mean of the column, and standard_deviation is the standard deviation of the column.
We add two new columns, 'Age_ZScore' and 'Salary_ZScore', to the copy df1 to store the calculated z-scores.
Now, the DataFrame df1 contains the original data along with the z-scores for the 'Age' and 'Salary' columns, which indicate how many standard deviations each data point is away from the mean of its respective column.

In [6]:
# Add one extreme record (Age 140, Salary 1,000,000) to the dataset; the
# z-score cells that follow are computed on the frame that includes it.
new_data = {
    'Country': 'pakistan',
    'Age': 140,
    'Salary': 1000000,
    'Purchased': 'No',
}
# `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0,
# so build a one-row frame and concatenate it instead.
# NOTE: this cell rebinds `df`, so re-running it adds the record again.
df = pd.concat([df, pd.DataFrame([new_data])], ignore_index=True)
# Working copy that will receive the extra z-score columns.
df1 = df.copy()

  df = df.append(new_data, ignore_index=True)


In [7]:
# Attach z-score columns to the working copy `df1`:
# z = (value - column mean) / population standard deviation (ddof=0).
zscore_columns = {'Age': 'Age_ZScore', 'Salary': 'Salary_ZScore'}
for source_col, zscore_col in zscore_columns.items():
    values = df[source_col]
    df1[zscore_col] = (values - values.mean()) / values.std(ddof=0)
df1

Unnamed: 0,Country,Age,Salary,Purchased,Age_ZScore,Salary_ZScore
0,India,34,92000.0,Yes,-0.229931,-0.095272
1,Sri lanka,22,25000.0,Yes,-0.667028,-0.386363
2,China,31,74000.0,Yes,-0.339205,-0.173475
3,Sri lanka,29,54857.14,No,-0.412055,-0.256644
4,China,55,98000.0,Yes,0.534988,-0.069204
5,India,24,30000.0,No,-0.594179,-0.364639
6,Sri lanka,28,40000.0,No,-0.44848,-0.321193
7,India,33,60000.0,No,-0.266356,-0.2343
8,China,51,89000.0,Yes,0.389289,-0.108306
9,India,44,78000.0,Yes,0.134316,-0.156097


In [8]:
# Magnitude of each z-score, computed once and reused for the max and the mask.
abs_age_z = df1['Age_ZScore'].abs()
abs_salary_z = df1['Salary_ZScore'].abs()

# Largest absolute z-score per column.
max_abs_age_zscore = abs_age_z.max()
max_abs_salary_zscore = abs_salary_z.max()

# Boolean masks keep every row tied for the maximum, not only the first one.
max_abs_age_tuples = df1.loc[abs_age_z.eq(max_abs_age_zscore)]
max_abs_salary_tuples = df1.loc[abs_salary_z.eq(max_abs_salary_zscore)]

max_abs_age_tuples

Unnamed: 0,Country,Age,Salary,Purchased,Age_ZScore,Salary_ZScore
15,pakistan,140,1000000.0,No,3.631091,3.849661


In [9]:
# Show the row(s) with the largest absolute 'Salary' z-score.
max_abs_salary_tuples

Unnamed: 0,Country,Age,Salary,Purchased,Age_ZScore,Salary_ZScore
15,pakistan,140,1000000.0,No,3.631091,3.849661


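Hence, an anomaly is detected: this tuple's 'Age' and 'Salary' values deviate strongly from the rest of the data according to their z-scores (about 3.63 and 3.85 standard deviations from the mean, respectively).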

3. Standardization 
Standardization, also known as z-score normalization, is a feature-scaling technique used in data preprocessing to transform the data so that it has a mean of 0 and a standard deviation of 1. This process is applied to numerical features in a dataset and is particularly useful when you have features with different units or scales.

The formula for standardization (z-score) of a data point x is:
z = (x - μ) / σ
where μ is the mean and σ is the standard deviation of the feature.


In [10]:
# Standardize 'Age' and 'Salary' into a new frame `df2`: each value becomes
# (value - column mean) / population standard deviation (ddof=0).
age, salary = df['Age'], df['Salary']
df2 = df.assign(
    Age=(age - age.mean()) / age.std(ddof=0),
    Salary=(salary - salary.mean()) / salary.std(ddof=0),
)
df2

Unnamed: 0,Country,Age,Salary,Purchased
0,India,-0.229931,-0.095272,Yes
1,Sri lanka,-0.667028,-0.386363,Yes
2,China,-0.339205,-0.173475,Yes
3,Sri lanka,-0.412055,-0.256644,No
4,China,0.534988,-0.069204,Yes
5,India,-0.594179,-0.364639,No
6,Sri lanka,-0.44848,-0.321193,No
7,India,-0.266356,-0.2343,No
8,China,0.389289,-0.108306,Yes
9,India,0.134316,-0.156097,Yes


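4. Normalization
Min-max normalization rescales each numerical feature to the range [0, 1] using x' = (x - min) / (max - min), where min and max are the smallest and largest values of the feature.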

In [11]:
# Min-max scaling into a new frame `df3`: map each numeric column onto [0, 1]
# via (value - column min) / (column max - column min).
df3 = df.copy()
for column in ('Age', 'Salary'):
    lowest, highest = df[column].min(), df[column].max()
    df3[column] = (df[column] - lowest) / (highest - lowest)
df3

Unnamed: 0,Country,Age,Salary,Purchased
0,India,0.109244,0.073469,Yes
1,Sri lanka,0.008403,0.005102,Yes
2,China,0.084034,0.055102,Yes
3,Sri lanka,0.067227,0.035569,No
4,China,0.285714,0.079592,Yes
5,India,0.02521,0.010204,No
6,Sri lanka,0.058824,0.020408,No
7,India,0.10084,0.040816,No
8,China,0.252101,0.070408,Yes
9,India,0.193277,0.059184,Yes


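5. Encoding
One-hot encode the categorical 'Country' and 'Purchased' columns; the first category of each column is dropped and serves as the baseline.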

In [12]:
# One-hot encode the categorical columns. `drop_first=True` omits the first
# category of each column, so that category is represented by all-zero
# indicator columns.
categorical_columns = ['Country', 'Purchased']
df_encoded = pd.get_dummies(
    data=df,
    columns=categorical_columns,
    drop_first=True,
)
df_encoded

Unnamed: 0,Age,Salary,Country_India,Country_Sri lanka,Country_pakistan,Purchased_Yes
0,34,92000.0,1,0,0,1
1,22,25000.0,0,1,0,1
2,31,74000.0,0,0,0,1
3,29,54857.14,0,1,0,0
4,55,98000.0,0,0,0,1
5,24,30000.0,1,0,0,0
6,28,40000.0,0,1,0,0
7,33,60000.0,1,0,0,0
8,51,89000.0,0,0,0,1
9,44,78000.0,1,0,0,1
