# **Feature Engineering**

## **Loading Libraries**

In [65]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.feature_selection import SelectKBest, f_regression

### **Loading the Processed Dataset**

In [66]:
wpf = pd.read_csv("/workspaces/Global-Population-Growth-EDA-and-Prediction/data/processed/cleaned_population_growth.csv")

In [67]:
wpf.head()

Unnamed: 0,City,Country,Continent,Population (2024),Population (2023),Growth Rate,Population Change
0,Tokyo,Japan,Asia,1.0,1.0,0.102334,-79070
1,Delhi,India,Asia,0.909044,0.883378,0.612208,866094
2,Shanghai,China,Asia,0.800712,0.781078,0.543986,657110
3,Dhaka,Bangladesh,Asia,0.637581,0.616511,0.701975,726036
4,Sao Paulo,Brazil,South America,0.606536,0.600335,0.289048,186968


## **Feature Engineering**

### **Creating New Features**

In [68]:
wpf["Population Change"] = wpf["Population (2024)"] - wpf["Population (2023)"]
wpf["Growth Rate per 1000"] = wpf["Growth Rate"] * 1000

### **Encoding Categorical Variables**

In [71]:
X = wpf.drop(columns=['Population (2024)'])
y = wpf['Population (2024)']

categorical_columns = ['City', 'Country', 'Continent']

encoder = OneHotEncoder(sparse_output=False, drop='first')

X_encoded = encoder.fit_transform(X[categorical_columns])
X_encoded_df = pd.DataFrame(X_encoded, columns=encoder.get_feature_names_out(categorical_columns))

X_numeric = X.drop(columns=categorical_columns)
X_combined = pd.concat([X_numeric.reset_index(drop=True), X_encoded_df.reset_index(drop=True)], axis=1)

print("Encoded Features:")
print(X_combined.head())

Encoded Features:
   Population (2023)  Growth Rate  Population Change  Growth Rate per 1000  \
0           1.000000     0.102334           0.000000            102.333932   
1           0.883378     0.612208           0.025666            612.208259   
2           0.781078     0.543986           0.019633            543.985637   
3           0.616511     0.701975           0.021070            701.974865   
4           0.600335     0.289048           0.006201            289.048474   

   City_Abidjan  City_Abu Dhabi  City_Acapulco De Juarez  City_Accra  \
0           0.0             0.0                      0.0         0.0   
1           0.0             0.0                      0.0         0.0   
2           0.0             0.0                      0.0         0.0   
3           0.0             0.0                      0.0         0.0   
4           0.0             0.0                      0.0         0.0   

   City_Ad Dammam  City_Adana  ...  Country_Yemen  Country_Zambia  \
0          

### **Scaling Features**

In [72]:
scaler = MinMaxScaler()
wpf[['Population (2024)', 'Population (2023)', 'Growth Rate', 'Population Change']] = scaler.fit_transform(
    wpf[['Population (2024)', 'Population (2023)', 'Growth Rate', 'Population Change']])

## **Feature Selection**

### **Selecting the Best Features**

In [73]:
selector = SelectKBest(score_func=f_regression, k=5)

X_new = selector.fit_transform(X_combined, y)

selected_indices = selector.get_support(indices=True)

selected_features = X_combined.columns[selected_indices]

print("Selected Features:", selected_features)

Selected Features: Index(['Population (2023)', 'Population Change', 'City_Delhi', 'City_Shanghai',
       'City_Tokyo'],
      dtype='object')


### **Create a New DataFrame with Selected Features**

In [75]:
df_selected = X_combined[selected_features].copy()  
df_selected['Population (2024)'] = y

df_selected

Unnamed: 0,Population (2023),Population Change,City_Delhi,City_Shanghai,City_Tokyo,Population (2024)
0,1.000000,0.000000,0.0,0.0,1.0,1.000000
1,0.883378,0.025666,1.0,0.0,0.0,0.909044
2,0.781078,0.019633,0.0,1.0,0.0,0.800712
3,0.616511,0.021070,0.0,0.0,0.0,0.637581
4,0.600335,0.006201,0.0,0.0,0.0,0.606536
...,...,...,...,...,...,...
772,0.000260,-0.000220,0.0,0.0,0.0,0.000040
773,0.000500,-0.000465,0.0,0.0,0.0,0.000035
774,0.000000,0.000035,0.0,0.0,0.0,0.000035
775,0.000397,-0.000393,0.0,0.0,0.0,0.000004


## **Saving the Engineered Data**

### **Saving the Feature-Engineered Data**

In [76]:
df_selected.to_csv('/workspaces/Global-Population-Growth-EDA-and-Prediction/data/processed/final_features.csv', index=False)