In [4]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Read the Diabetes CSV file
df = pd.read_csv('diabetes.csv')

In [5]:
# Check basic statistics to identify potential outliers
print("Basic statistics of dataset:")
print(df.describe())

Basic statistics of dataset:
       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std       3.369578   31.972618      19.355807      15.952218  115.244002   
min       0.000000    0.000000       0.000000       0.000000    0.000000   
25%       1.000000   99.000000      62.000000       0.000000    0.000000   
50%       3.000000  117.000000      72.000000      23.000000   30.500000   
75%       6.000000  140.250000      80.000000      32.000000  127.250000   
max      17.000000  199.000000     122.000000      99.000000  846.000000   

              BMI  DiabetesPedigreeFunction         Age     Outcome  
count  768.000000                768.000000  768.000000  768.000000  
mean    31.992578                  0.471876   33.240885    0.348958  
std      7.884160                  0.331329   11.760232    0.476951  
min   

In [6]:
# Define a function to remove outliers using IQR method
def remove_outliers_iqr(dataframe, column):
    # Calculate Q1 (25th percentile) and Q3 (75th percentile)
    q1, q3 = np.percentile(dataframe[column], [25, 75])

    # Calculate Interquartile Range (IQR)
    iqr = q3 - q1

    # Define lower and upper bounds
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr

    # Filter data points within bounds
    clean_df = dataframe[(dataframe[column] >= lower) & (dataframe[column] <= upper)]

    print(f"\nColumn: {column} | Original rows: {len(dataframe)}, After removing outliers: {len(clean_df)}")
    return clean_df

# Apply outlier removal for numeric columns
numeric_columns = df.select_dtypes(include=np.number).columns
for col in numeric_columns:
    df = remove_outliers_iqr(df, col)



Column: Pregnancies | Original rows: 768, After removing outliers: 764

Column: Glucose | Original rows: 764, After removing outliers: 759

Column: BloodPressure | Original rows: 759, After removing outliers: 714

Column: SkinThickness | Original rows: 714, After removing outliers: 713

Column: Insulin | Original rows: 713, After removing outliers: 686

Column: BMI | Original rows: 686, After removing outliers: 676

Column: DiabetesPedigreeFunction | Original rows: 676, After removing outliers: 647

Column: Age | Original rows: 647, After removing outliers: 636

Column: Outcome | Original rows: 636, After removing outliers: 636


In [7]:
# Verify cleaned dataset
print("\nDataset after outlier removal:")
print(df.describe())


Dataset after outlier removal:
       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   636.000000  636.000000     636.000000     636.000000  636.000000   
mean      3.786164  118.869497      72.007862      20.745283   69.017296   
std       3.251388   28.996773      11.301102      15.231688   83.601857   
min       0.000000   44.000000      38.000000       0.000000    0.000000   
25%       1.000000   99.000000      64.000000       0.000000    0.000000   
50%       3.000000  114.000000      72.000000      23.000000   42.500000   
75%       6.000000  136.250000      80.000000      32.000000  122.000000   
max      13.000000  198.000000     106.000000      60.000000  330.000000   

              BMI  DiabetesPedigreeFunction         Age     Outcome  
count  636.000000                636.000000  636.000000  636.000000  
mean    31.964151                  0.426237   32.555031    0.309748  
std      6.384132                  0.245088   10.879807    0.462754  
min

In [10]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Read the Diabetes CSV file
df = pd.read_csv('diabetes.csv')


In [11]:
# Select numeric columns for scaling
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
print("Numeric columns to scale:", numeric_columns)

Numeric columns to scale: Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')


In [12]:
# Apply Min-Max Normalization (scales values to [0,1])
min_max_scaler = MinMaxScaler()
df_minmax = df.copy()
df_minmax[numeric_columns] = min_max_scaler.fit_transform(df[numeric_columns])
print("\nDataset after MinMaxScaler normalization:")
df_minmax.head(10)


Dataset after MinMaxScaler normalization:


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.352941,0.743719,0.590164,0.353535,0.0,0.500745,0.234415,0.483333,1.0
1,0.058824,0.427136,0.540984,0.292929,0.0,0.396423,0.116567,0.166667,0.0
2,0.470588,0.919598,0.52459,0.0,0.0,0.347243,0.253629,0.183333,1.0
3,0.058824,0.447236,0.540984,0.232323,0.111111,0.418778,0.038002,0.0,0.0
4,0.0,0.688442,0.327869,0.353535,0.198582,0.642325,0.943638,0.2,1.0
5,0.294118,0.582915,0.606557,0.0,0.0,0.38152,0.052519,0.15,0.0
6,0.176471,0.39196,0.409836,0.323232,0.104019,0.461997,0.072588,0.083333,1.0
7,0.588235,0.577889,0.0,0.0,0.0,0.52608,0.023911,0.133333,0.0
8,0.117647,0.98995,0.57377,0.454545,0.641844,0.454545,0.034159,0.533333,1.0
9,0.470588,0.628141,0.786885,0.0,0.0,0.0,0.065756,0.55,1.0


In [13]:
# Apply Standardization (mean=0, std=1)
standard_scaler = StandardScaler()
df_standard = df.copy()
df_standard[numeric_columns] = standard_scaler.fit_transform(df[numeric_columns])
print("\nDataset after StandardScaler standardization:")
df_standard.head(10)


Dataset after StandardScaler standardization:


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.639947,0.848324,0.149641,0.90727,-0.692891,0.204013,0.468492,1.425995,1.365896
1,-0.844885,-1.123396,-0.160546,0.530902,-0.692891,-0.684422,-0.365061,-0.190672,-0.73212
2,1.23388,1.943724,-0.263941,-1.288212,-0.692891,-1.103255,0.604397,-0.105584,1.365896
3,-0.844885,-0.998208,-0.160546,0.154533,0.123302,-0.494043,-0.920763,-1.041549,-0.73212
4,-1.141852,0.504055,-1.504687,0.90727,0.765836,1.409746,5.484909,-0.020496,1.365896
5,0.342981,-0.153185,0.253036,-1.288212,-0.692891,-0.811341,-0.818079,-0.27576,-0.73212
6,-0.250952,-1.342476,-0.98771,0.719086,0.071204,-0.125977,-0.676133,-0.616111,1.365896
7,1.827813,-0.184482,-3.572597,-1.288212,-0.692891,0.419775,-1.020427,-0.360847,-0.73212
8,-0.547919,2.381884,0.046245,1.534551,4.021922,-0.189437,-0.947944,1.681259,1.365896
9,1.23388,0.128489,1.390387,-1.288212,-0.692891,-4.060474,-0.724455,1.766346,1.365896


# **Activity 7 – Outlier Detection and Normalization in Employees.csv**

In [16]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Read the Employees CSV file
df = pd.read_csv('employees.csv')

In [15]:
# Check basic statistics to identify potential outliers
print("Basic statistics of dataset:")
print(df.describe())

Basic statistics of dataset:
              Salary      Bonus %
count    1000.000000  1000.000000
mean    90662.181000    10.207555
std     32923.693342     5.528481
min     35013.000000     1.015000
25%     62613.000000     5.401750
50%     90428.000000     9.838500
75%    118740.250000    14.838000
max    149908.000000    19.944000


In [18]:
# Remove outliers using the IQR method for numeric columns
numeric_columns = df.select_dtypes(include=[np.number]).columns

for col in numeric_columns:
    q1 = np.percentile(df[col], 25)
    q3 = np.percentile(df[col], 75)
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    df = df[(df[col] >= lower) & (df[col] <= upper)]
    print(f"\nColumn: {col} | Rows after removing outliers: {len(df)}")


Column: Salary | Rows after removing outliers: 1000

Column: Bonus % | Rows after removing outliers: 1000


In [22]:
# Normalize numeric columns using MinMaxScaler
scaler = MinMaxScaler()
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

# Display the first few rows after normalization
print("\nDataset after normalization:")
df['Salary'].head(10)


Dataset after normalization:


Unnamed: 0,Salary
0,0.542191
1,0.234301
2,0.831864
3,0.902494
4,0.574359
5,0.697593
6,0.265138
7,0.094808
8,0.527064
9,0.912477


In [34]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder

# Load the dataset
df = pd.read_csv('adult.csv')
print("First 5 rows of the dataset:")
df.head()

First 5 rows of the dataset:


Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [35]:
# information about the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [36]:
# List of categorical columns (object dtype)
categorical_cols = df.select_dtypes(include='object').columns
print("Categorical columns in dataset:", categorical_cols)

# Check value counts for each categorical column
for col in categorical_cols:
    print(f"\nValue counts for '{col}':")
    print(df[col].value_counts())

Categorical columns in dataset: Index(['workclass', 'education', 'marital.status', 'occupation',
       'relationship', 'race', 'sex', 'native.country', 'income'],
      dtype='object')

Value counts for 'workclass':
workclass
Private             22696
Self-emp-not-inc     2541
Local-gov            2093
?                    1836
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: count, dtype: int64

Value counts for 'education':
education
HS-grad         10501
Some-college     7291
Bachelors        5355
Masters          1723
Assoc-voc        1382
11th             1175
Assoc-acdm       1067
10th              933
7th-8th           646
Prof-school       576
9th               514
12th              433
Doctorate         413
5th-6th           333
1st-4th           168
Preschool          51
Name: count, dtype: int64

Value counts for 'marital.status':
marital.status
Married-civ-spouse       14976
Never-marrie

In [37]:
# Replace '?' with 'Unknown' in relevant columns
df['workclass'] = df['workclass'].replace('?', 'Unknown')
df['occupation'] = df['occupation'].replace('?', 'Unknown')
df['native.country'] = df['native.country'].replace('?', 'Unknown')

In [38]:
# Label Encoding for target column 'income'
le = LabelEncoder()
df['income_encoded'] = le.fit_transform(df['income'])
print("\nLabel Encoding mapping for 'income':")
print(dict(zip(le.classes_, le.transform(le.classes_))))
print(df[['income', 'income_encoded']].head())


Label Encoding mapping for 'income':
{'<=50K': np.int64(0), '>50K': np.int64(1)}
  income  income_encoded
0  <=50K               0
1  <=50K               0
2  <=50K               0
3  <=50K               0
4  <=50K               0


In [39]:
# Ordinal Encoding for 'education' (ordered categories)
education_order = [['Preschool', '1st-4th', '5th-6th', '7th-8th', '9th', '10th',
                    '11th', '12th', 'HS-grad', 'Some-college', 'Assoc-voc',
                    'Assoc-acdm', 'Bachelors', 'Masters', 'Prof-school', 'Doctorate']]

oe = OrdinalEncoder(categories=education_order)
df['education_ord'] = oe.fit_transform(df[['education']])
print("\nOrdinal Encoding for 'education':")
print(df[['education', 'education_ord']].head())


Ordinal Encoding for 'education':
      education  education_ord
0       HS-grad            8.0
1       HS-grad            8.0
2  Some-college            9.0
3       7th-8th            3.0
4  Some-college            9.0


In [41]:
# One-Hot Encoding for remaining nominal categorical columns
nominal_cols = ['workclass', 'marital.status', 'occupation',
                'relationship', 'race', 'sex', 'native.country']

ohe = OneHotEncoder(sparse_output=False)
ohe_array = ohe.fit_transform(df[nominal_cols])
ohe_df = pd.DataFrame(ohe_array, columns=ohe.get_feature_names_out(nominal_cols))

# Concatenate OHE columns to original DataFrame
df_encoded = pd.concat([df.reset_index(drop=True), ohe_df], axis=1)
print("\nDataset after One-Hot Encoding:")
df_encoded.head()


Dataset after One-Hot Encoding:


Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,...,native.country_Puerto-Rico,native.country_Scotland,native.country_South,native.country_Taiwan,native.country_Thailand,native.country_Trinadad&Tobago,native.country_United-States,native.country_Unknown,native.country_Vietnam,native.country_Yugoslavia
0,90,Unknown,77053,HS-grad,9,Widowed,Unknown,Not-in-family,White,Female,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,66,Unknown,186061,Some-college,10,Widowed,Unknown,Unmarried,Black,Female,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [43]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder

# Load the Employees dataset
df = pd.read_csv('employees.csv')
print("First 5 rows of the dataset:")
df.head()

# Identify categorical columns
categorical_cols = df.select_dtypes(include='object').columns
print("\nCategorical columns in dataset:", categorical_cols)

First 5 rows of the dataset:

Categorical columns in dataset: Index(['First Name', 'Gender', 'Start Date', 'Last Login Time',
       'Senior Management', 'Team'],
      dtype='object')


In [45]:
# Handle missing values if any
df['Gender'] = df['Gender'].fillna('Unknown')
df['Senior Management'] = df['Senior Management'].fillna('No')
df['Team'] = df['Team'].fillna('Unknown')

In [46]:
# Label Encoding for binary categorical column (example: 'Gender')
le = LabelEncoder()
df['Gender_encoded'] = le.fit_transform(df['Gender'])
print("\nLabel Encoding mapping for 'Gender':")
print(dict(zip(le.classes_, le.transform(le.classes_))))
print(df[['Gender', 'Gender_encoded']].head())


Label Encoding mapping for 'Gender':
{'Female': np.int64(0), 'Male': np.int64(1), 'Unknown': np.int64(2)}
   Gender  Gender_encoded
0    Male               1
1    Male               1
2  Female               0
3    Male               1
4    Male               1


In [48]:
# One-Hot Encoding for nominal column 'Team'
ohe = OneHotEncoder(sparse_output=False)
ohe_array = ohe.fit_transform(df[['Team']])
ohe_df = pd.DataFrame(ohe_array, columns=ohe.get_feature_names_out(['Team']))

In [52]:
# Concatenate OHE columns to original DataFrame
df_encoded = pd.concat([df.reset_index(drop=True), ohe_df], axis=1)
print("\nDataset after One-Hot Encoding for 'Team':")
df_encoded.head()


Dataset after One-Hot Encoding for 'Team':


Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team,Gender_encoded,Team_Business Development,Team_Client Services,Team_Distribution,Team_Engineering,Team_Finance,Team_Human Resources,Team_Legal,Team_Marketing,Team_Product,Team_Sales,Team_Unknown
0,Douglas,Male,8/6/1993,12:42 PM,97308,6.945,,Marketing,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,Thomas,Male,3/31/1996,6:53 AM,61933,4.17,,Unknown,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,Maria,Female,4/23/1993,11:17 AM,130590,11.858,,Finance,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Jerry,Male,3/4/2005,1:00 PM,138705,9.34,,Finance,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Larry,Male,1/24/1998,4:47 PM,101004,1.389,,Client Services,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [53]:
# Import libraries
import pandas as pd
from collections import Counter

# Load the Diabetes dataset
df = pd.read_csv('diabetes.csv')

# Separate features and target
X = df.drop('Outcome', axis=1)
y = df['Outcome']

print("Original class distribution:", Counter(y))

Original class distribution: Counter({0: 500, 1: 268})


In [54]:
# Oversampling Techniques
from imblearn.over_sampling import RandomOverSampler, SMOTE

# Random Oversampling
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(X, y)
print("\nRandomOverSampler class distribution:", Counter(y_ros))

# SMOTE
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)
print("SMOTE class distribution:", Counter(y_smote))


RandomOverSampler class distribution: Counter({1: 500, 0: 500})
SMOTE class distribution: Counter({1: 500, 0: 500})


In [55]:
# Undersampling Techniques
from imblearn.under_sampling import RandomUnderSampler, NearMiss

# Random Undersampling
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X, y)
print("\nRandomUnderSampler class distribution:", Counter(y_rus))

# NearMiss
nm = NearMiss()
X_nm, y_nm = nm.fit_resample(X, y)
print("NearMiss class distribution:", Counter(y_nm))



RandomUnderSampler class distribution: Counter({0: 268, 1: 268})
NearMiss class distribution: Counter({0: 268, 1: 268})
