In [3]:
import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv("/content/Life Expectancy Data.csv")

# Show first 5 rows
print("First 5 rows of the dataset:")
print(df.head())

# Dataset info (columns, dtypes, missing values)
print("\nDataset Info:")
print(df.info())

# Check shape (rows, columns)
print("\nDataset Shape:")
print(df.shape)

# Check missing values
print("\nMissing Values per Column:")
print(df.isnull().sum())


First 5 rows of the dataset:
       Country  Year      Status  Life expectancy   Adult Mortality  \
0  Afghanistan  2015  Developing              65.0            263.0   
1  Afghanistan  2014  Developing              59.9            271.0   
2  Afghanistan  2013  Developing              59.9            268.0   
3  Afghanistan  2012  Developing              59.5            272.0   
4  Afghanistan  2011  Developing              59.2            275.0   

   infant deaths  Alcohol  percentage expenditure  Hepatitis B  Measles   ...  \
0             62     0.01               71.279624         65.0      1154  ...   
1             64     0.01               73.523582         62.0       492  ...   
2             66     0.01               73.219243         64.0       430  ...   
3             69     0.01               78.184215         67.0      2787  ...   
4             71     0.01                7.097109         68.0      3013  ...   

   Polio  Total expenditure  Diphtheria    HIV/AIDS      

In [4]:


# Check missing values
print(df.isnull().sum())


Country                              0
Year                                 0
Status                               0
Life expectancy                     10
Adult Mortality                     10
infant deaths                        0
Alcohol                            194
percentage expenditure               0
Hepatitis B                        553
Measles                              0
 BMI                                34
under-five deaths                    0
Polio                               19
Total expenditure                  226
Diphtheria                          19
 HIV/AIDS                            0
GDP                                448
Population                         652
 thinness  1-19 years               34
 thinness 5-9 years                 34
Income composition of resources    167
Schooling                          163
dtype: int64


In [5]:
# Convert categorical columns
df['Country'] = df['Country'].astype('category')
df['Status'] = df['Status'].astype('category')

# Ensure Year is integer
df['Year'] = df['Year'].astype(int)

print(df.dtypes)


Country                            category
Year                                  int64
Status                             category
Life expectancy                     float64
Adult Mortality                     float64
infant deaths                         int64
Alcohol                             float64
percentage expenditure              float64
Hepatitis B                         float64
Measles                               int64
 BMI                                float64
under-five deaths                     int64
Polio                               float64
Total expenditure                   float64
Diphtheria                          float64
 HIV/AIDS                           float64
GDP                                 float64
Population                          float64
 thinness  1-19 years               float64
 thinness 5-9 years                 float64
Income composition of resources     float64
Schooling                           float64
dtype: object


In [6]:
# Separate numeric and categorical columns
num_cols = df.select_dtypes(include=np.number).columns
cat_cols = df.select_dtypes(exclude=np.number).columns

# Fill numeric columns with median (robust to outliers)
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Fill categorical columns with mode
df[cat_cols] = df[cat_cols].fillna(df[cat_cols].mode().iloc[0])

# Check again for missing values
print("\nMissing values after imputation:")
print(df.isnull().sum())



Missing values after imputation:
Country                            0
Year                               0
Status                             0
Life expectancy                    0
Adult Mortality                    0
infant deaths                      0
Alcohol                            0
percentage expenditure             0
Hepatitis B                        0
Measles                            0
 BMI                               0
under-five deaths                  0
Polio                              0
Total expenditure                  0
Diphtheria                         0
 HIV/AIDS                          0
GDP                                0
Population                         0
 thinness  1-19 years              0
 thinness 5-9 years                0
Income composition of resources    0
Schooling                          0
dtype: int64


In [7]:
# Count duplicates
print(f"Number of duplicate rows: {df.duplicated().sum()}")

# Drop duplicates
df.drop_duplicates(inplace=True)
print(f"Shape after removing duplicates: {df.shape}")


Number of duplicate rows: 0
Shape after removing duplicates: (2938, 22)


In [8]:
# Function to remove outliers using IQR
def remove_outliers(df, numeric_cols):
    for col in numeric_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        df = df[(df[col] >= lower) & (df[col] <= upper)]
    return df

df = remove_outliers(df, num_cols)

print(f"Shape after outlier removal: {df.shape}")


Shape after outlier removal: (626, 22)


In [10]:
# Drop irrelevant features
df = df.drop(columns=['Country'], errors='ignore')

# Check final columns
print(df.columns)


Index(['Year', 'Status', 'Life expectancy ', 'Adult Mortality',
       'infant deaths', 'Alcohol', 'percentage expenditure', 'Hepatitis B',
       'Measles ', ' BMI ', 'under-five deaths ', 'Polio', 'Total expenditure',
       'Diphtheria ', ' HIV/AIDS', 'GDP', 'Population',
       ' thinness  1-19 years', ' thinness 5-9 years',
       'Income composition of resources', 'Schooling'],
      dtype='object')


In [11]:
# One-Hot Encoding for 'Status'
df_onehot = pd.get_dummies(df, columns=['Status'], drop_first=True)
print(df_onehot.head())


    Year  Life expectancy   Adult Mortality  infant deaths  Alcohol  \
16  2015              77.8             74.0              0     4.60   
17  2014              77.5              8.0              0     4.51   
18  2013              77.2             84.0              0     4.76   
19  2012              76.9             86.0              0     5.14   
20  2011              76.6             88.0              0     5.37   

    percentage expenditure  Hepatitis B  Measles    BMI   under-five deaths   \
16              364.975229         99.0         0   58.0                   0   
17              428.749067         98.0         0   57.2                   1   
18              430.876979         99.0         0   56.5                   1   
19              412.443356         99.0         9   55.8                   1   
20              437.062100         99.0        28   55.1                   1   

    ...  Total expenditure  Diphtheria    HIV/AIDS          GDP  Population  \
16  ...      

In [12]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['Status_LE'] = le.fit_transform(df['Status'])
print(df[['Status', 'Status_LE']].head())


        Status  Status_LE
16  Developing          1
17  Developing          1
18  Developing          1
19  Developing          1
20  Developing          1


In [13]:
from sklearn.preprocessing import OrdinalEncoder

# Define order for 'Status'
encoder = OrdinalEncoder(categories=[['Developing', 'Developed']])
df['Status_OE'] = encoder.fit_transform(df[['Status']])
print(df[['Status', 'Status_OE']].head())


        Status  Status_OE
16  Developing        0.0
17  Developing        0.0
18  Developing        0.0
19  Developing        0.0
20  Developing        0.0


In [15]:
# Frequency Encoding for Status
freq_map = df['Status'].value_counts(normalize=True)
df['Status_FE'] = df['Status'].map(freq_map)
print(df[['Status', 'Status_FE']].head())


        Status Status_FE
16  Developing  0.773163
17  Developing  0.773163
18  Developing  0.773163
19  Developing  0.773163
20  Developing  0.773163


In [23]:
df.columns = df.columns.str.strip()  # Removes leading/trailing spaces


In [25]:
#target encoding
target_mean = df.groupby('Status', observed=True)['Life expectancy'].mean()
df['Status_TE'] = df['Status'].map(target_mean)
print(df[['Status', 'Status_TE']].head())


        Status  Status_TE
16  Developing  73.255785
17  Developing  73.255785
18  Developing  73.255785
19  Developing  73.255785
20  Developing  73.255785


In [27]:
# feature_scaling.py

import pandas as pd
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, Normalizer, StandardScaler

# Load encoded dataset
df = pd.read_csv("/content/Life Expectancy Data.csv")

# Select numeric columns for scaling
numeric_cols = df.select_dtypes(include='number').columns.tolist()
print("Numeric columns to scale:", numeric_cols)



Numeric columns to scale: ['Year', 'Life expectancy ', 'Adult Mortality', 'infant deaths', 'Alcohol', 'percentage expenditure', 'Hepatitis B', 'Measles ', ' BMI ', 'under-five deaths ', 'Polio', 'Total expenditure', 'Diphtheria ', ' HIV/AIDS', 'GDP', 'Population', ' thinness  1-19 years', ' thinness 5-9 years', 'Income composition of resources', 'Schooling']


In [29]:
minmax_scaler = MinMaxScaler()
df_minmax = df.copy()
df_minmax[numeric_cols] = minmax_scaler.fit_transform(df_minmax[numeric_cols])
df_minmax.to_csv("life_expectancy_minmax.csv", index=False)
print("\nMin-Max Scaled (first 5 rows):")
print(df_minmax[numeric_cols].head())



Min-Max Scaled (first 5 rows):
       Year  Life expectancy   Adult Mortality  infant deaths  Alcohol  \
0  1.000000          0.544592         0.362881       0.034444      0.0   
1  0.933333          0.447818         0.373961       0.035556      0.0   
2  0.866667          0.447818         0.369806       0.036667      0.0   
3  0.800000          0.440228         0.375346       0.038333      0.0   
4  0.733333          0.434535         0.379501       0.039444      0.0   

   percentage expenditure  Hepatitis B  Measles       BMI   \
0                0.003659     0.653061  0.005439  0.209733   
1                0.003774     0.622449  0.002319  0.203940   
2                0.003759     0.642857  0.002027  0.198146   
3                0.004014     0.673469  0.013135  0.192352   
4                0.000364     0.683673  0.014200  0.187717   

   under-five deaths      Polio  Total expenditure  Diphtheria    HIV/AIDS  \
0              0.0332  0.031250           0.452118     0.649485        0

In [30]:
# 4.2 Max-Abs Scaling
maxabs_scaler = MaxAbsScaler()
df_maxabs = df.copy()
df_maxabs[numeric_cols] = maxabs_scaler.fit_transform(df_maxabs[numeric_cols])
df_maxabs.to_csv("life_expectancy_maxabs.csv", index=False)
print("\nMax-Abs Scaled (first 5 rows):")
print(df_maxabs[numeric_cols].head())


Max-Abs Scaled (first 5 rows):
       Year  Life expectancy   Adult Mortality  infant deaths  Alcohol  \
0  1.000000          0.730337         0.363762       0.034444  0.00056   
1  0.999504          0.673034         0.374827       0.035556  0.00056   
2  0.999007          0.673034         0.370678       0.036667  0.00056   
3  0.998511          0.668539         0.376210       0.038333  0.00056   
4  0.998015          0.665169         0.380360       0.039444  0.00056   

   percentage expenditure  Hepatitis B  Measles       BMI   \
0                0.003659     0.656566  0.005439  0.218786   
1                0.003774     0.626263  0.002319  0.213058   
2                0.003759     0.646465  0.002027  0.207331   
3                0.004014     0.676768  0.013135  0.201604   
4                0.000364     0.686869  0.014200  0.197022   

   under-five deaths      Polio  Total expenditure  Diphtheria    HIV/AIDS  \
0              0.0332  0.060606           0.463636     0.656566   0.0019

In [39]:
# 4.3 Vector Normalization (L2)

# Make a copy of the dataset
df_normalized = df.copy()

# Select numeric columns
numeric_cols = df_normalized.select_dtypes(include='number').columns.tolist()

# Fill all NaN values in numeric columns (using 0 or median)
df_normalized[numeric_cols] = df_normalized[numeric_cols].fillna(0)
# OR: df_normalized[numeric_cols] = df_normalized[numeric_cols].fillna(df_normalized[numeric_cols].median())

# Apply L2 normalization
from sklearn.preprocessing import Normalizer
normalizer = Normalizer(norm='l2')
df_normalized[numeric_cols] = normalizer.fit_transform(df_normalized[numeric_cols])

# Save normalized dataset
df_normalized.to_csv("life_expectancy_normalized.csv", index=False)

# Print first 5 rows
print("\nVector Normalized (L2) (first 5 rows):")
print(df_normalized[numeric_cols].head())



Vector Normalized (L2) (first 5 rows):
       Year  Life expectancy   Adult Mortality  infant deaths       Alcohol  \
0  0.000060          0.000002         0.000008       0.000002  2.964149e-10   
1  0.006148          0.000183         0.000827       0.000195  3.052603e-08   
2  0.000063          0.000002         0.000008       0.000002  3.151424e-10   
3  0.000544          0.000016         0.000074       0.000019  2.704925e-09   
4  0.000675          0.000020         0.000092       0.000024  3.357281e-09   

   percentage expenditure  Hepatitis B  Measles           BMI   \
0                0.000002     0.000002  0.000034  5.661525e-07   
1                0.000224     0.000189  0.001502  5.677841e-05   
2                0.000002     0.000002  0.000014  5.704077e-07   
3                0.000021     0.000018  0.000754  4.760669e-06   
4                0.000002     0.000023  0.001012  5.774523e-06   

   under-five deaths          Polio  Total expenditure  Diphtheria   \
0            0.00

In [35]:
# 4.4 Z-score Standardization
scaler = StandardScaler()
df_standardized = df.copy()
df_standardized[numeric_cols] = scaler.fit_transform(df_standardized[numeric_cols])
df_standardized.to_csv("life_expectancy_standardized.csv", index=False)
print("\nZ-score Standardized (first 5 rows):")
print(df_standardized[numeric_cols].head())




Z-score Standardized (first 5 rows):
       Year  Life expectancy   Adult Mortality  infant deaths   Alcohol  \
0  1.621762         -0.443691         0.790238       0.268824 -1.133571   
1  1.404986         -0.979279         0.854614       0.285786 -1.133571   
2  1.188210         -0.979279         0.830473       0.302749 -1.133571   
3  0.971434         -1.021286         0.862660       0.328193 -1.133571   
4  0.754658         -1.052791         0.886801       0.345155 -1.133571   

   percentage expenditure  Hepatitis B  Measles       BMI   \
0               -0.335570    -0.635971 -0.110384 -0.959116   
1               -0.334441    -0.755661 -0.168124 -0.984066   
2               -0.334594    -0.675868 -0.173531 -1.009015   
3               -0.332096    -0.556178  0.032045 -1.033964   
4               -0.367862    -0.516281  0.051757 -1.053924   

   under-five deaths      Polio  Total expenditure  Diphtheria    HIV/AIDS  \
0            0.255359 -3.268019           0.889486    -0.730

In [40]:
from sklearn.model_selection import train_test_split

# Make sure column names are stripped
df.columns = df.columns.str.strip()

# Target column
target = 'Life expectancy'

# Feature columns (example: all numeric columns except target)
numeric_cols = df.select_dtypes(include='number').columns.tolist()
features = [col for col in numeric_cols if col != target]

# Features (X) and target (y)
X = df[features]
y = df[[target]]

# Train/Test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Print shapes
print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)


Training set shape: (2350, 19) (2350, 1)
Testing set shape: (588, 19) (588, 1)
