In [None]:
# -------- INFO --------
"""
Repository: https://github.com/NLBrien/mod550-2025
Creation date: 2025-10-14
Author: Nathan L.Brien
Course: MOD550 - Machine Learning
Title: Semester project
Description:    Running Random Forest and Logistic Regression
                to determine heaviest feature from dataset.
                All information and code are from lectures and exercises from the 2nd part of the course.

Last modification date: 2025-10-14
"""


'\nRepository: https://github.com/NLBrien/mod550-2025\nCreation date: 2025-10-14\nAuthor: Nathan L.Brien\nCourse: MOD550 - Machine Learning\nTitle: Semester project\nDescription:    Running Random Forest and Logistic Regression\n                to determine heaviest feature from dataset\n\nLast modification date: 2025-10-14\n'

In [9]:
# -------- LIBRARIES --------

## BASIC LIBRARIES
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

## RANDOM FOREST (RF) IMPORT
from sklearn.ensemble import RandomForestClassifier

## LOGISTIC REGRESSION (LogReg) IMPORT
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import roc_curve, auc, RocCurveDisplay

## DATA IMPORT
df = pd.read_csv(r"C:\Users\natha\UiS - MOD550\mod550-2025\MOD550-P1-NLB-datasets\MOD550-P1-NLB-data_merged.csv")
### Preview data
df.head()


Unnamed: 0,Country Code,Countries/territories,Region,Year,Population group selected,Population analysed,% pop analysed of tot country pop,Population in Phase 3 or above #,Population in Phase 3 or above %,Population in Phase 1 #,...,Population in Phase 4 %,Population in Phase 5 #,Population in Phase 5 %,Major Food Crisis,Primary driver,Secondary driver,Tertiary driver,Total country population,Total country GDP (US$),GDP per capita (US$)
0,AFG,Afghanistan,ASIA,2016.0,Whole country,26490799.0,0.79,8458268.0,0.31,13280564.0,...,0.06,0.0,0.0,Y,Conflict/insecurity,,,34700612.0,18116570000.0,522.082216
1,AGO,Angola,CS AFRICA,2016.0,,12800000.0,0.455067,76000.0,0.005938,11968000.0,...,,,,N,Weather extremes,,,29183070.0,52761620000.0,1807.952941
2,ALB,Albania,MENA,2016.0,,,,,,,...,,,,,,,,2876101.0,11988670000.0,4168.375445
3,ARG,Argentina,LAC,2016.0,,,,,,,...,,,,,,,,43900313.0,557532300000.0,12699.962314
4,ARM,Armenia,MENA,2016.0,,,,,,,...,,,,,,,,2992300.0,10546140000.0,3524.424769


In [10]:
# -------- DATA REGULARIZATION --------

## Assign random constant for reproducibility
RSEED = 44
np.random.seed(RSEED)

## Define test size percentage (20%)
test_percent = 0.20

## Turn empty strings into NaN values
df = df.replace(r'^\s*$', np.nan, regex=True)

## Remove Empty rows from target value "Major Food Crisis"
df = df.dropna(subset=["Major Food Crisis"]).reset_index(drop=True)
### Preview 
print(df["Major Food Crisis"].unique())

## Turning into binary classification
"""
"Major Food Crisis" column values:
0 = No Crisis (N)
1 = Crisis (Y)
"""
df["Major Food Crisis"] = df["Major Food Crisis"].map({
    "N": 0,
    "Y": 1
})

## Force target value type
y_binary = df["Major Food Crisis"].astype(int)
### Preview overall target value counts
print(y_binary.value_counts())

## Define features matrix (drop target column)
X = pd.get_dummies(
    df.drop(columns = ["Major Food Crisis"]),
    drop_first = True
)

## Force numeric dtype (handles accidental string numerics)
X = X.apply(pd.to_numeric, errors = "coerce")

## Turn feature matrix empty values (NaN) into column mean
"""
Initial used function:
    df_filled = df.fillna(df.mean())
this function could not be run because the data is too sparse and inconsistant.

Use of AI (Microsoft Copilot version 1.25091.124.0) to impliment debugging solution from:
    https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html
"""
imputer = SimpleImputer(strategy = "mean")
X_imputed = imputer.fit_transform(X)

## Force dop any remaining NaN values to avoid error
X = X.dropna(axis=1)

## Create new clean data copy
df_clean = df.copy()
### Preview new data head
print(df_clean.head())

['Y' 'N']
Major Food Crisis
1    346
0    153
Name: count, dtype: int64
  Country Code Countries/territories       Region    Year  \
0          AFG           Afghanistan         ASIA  2016.0   
1          AGO                Angola    CS AFRICA  2016.0   
2          BDI               Burundi  EAST AFRICA  2016.0   
3          BFA          Burkina Faso  WEST AFRICA  2016.0   
4          BGD            Bangladesh         ASIA  2016.0   

  Population group selected  Population analysed  \
0             Whole country           26490799.0   
1                       NaN           12800000.0   
2             Whole country            9351450.0   
3             Whole country           18936014.0   
4                  Refugees            3700000.0   

   % pop analysed of tot country pop  Population in Phase 3 or above #  \
0                           0.790000                      8.458268e+06   
1                           0.455067                      7.600000e+04   
2                         

In [None]:
# -------- LOGISTIC REGRESSION (LogReg) --------

## Train-test split for binary classification
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_binary,
    test_size = test_percent,
    random_state = RSEED,
    stratify = y_binary
)
print(f"Training set size: {(1 - test_percent) * 100} %")
print(f"Train set distribution:\n{y_train.value_counts()}")
print(f"Test set size: {test_percent * 100} %")
print(f"Test set distribution:\n{y_test.value_counts()}")

## Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Initialize logistic regression model
LogReg = LogisticRegression(max_iter=1000, random_state=RSEED)

## Train the model
LogReg.fit(X_train_scaled, y_train)

## Model predictions
y_pred_logreg = LogReg.predict(X_test_scaled)
y_prob_logreg = LogReg.predict_proba(X_test_scaled)[:, 1]


Training set size: 80.0 %
Train set distribution:
Major Food Crisis
1    277
0    122
Name: count, dtype: int64
Test set size: 20.0 %
Test set distribution:
Major Food Crisis
1    69
0    31
Name: count, dtype: int64


In [12]:
# -------- METRICS & EVALUATION (LogReg) --------

## Accuracy
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
print("Logistic Regression Accuracy: {:.2f}%".format(accuracy_logreg * 100))

## Classification Report
print("Classification Report (LogReg):")
print(classification_report(
    y_test,
    y_pred_logreg,
    target_names=["No Food Crisis (0)", "Food Crisis (1)"]
))

# -------- LOG-ODDS COEFFICIENTS --------

## Coefficients represent log-odds contribution of each feature
coef_df = pd.DataFrame(LogReg.coef_, columns = X.columns, index=["Crisis"])
print("Logistic Regression Coefficients (log-odds):")
print(coef_df)

## Log-odds coefficients ranking
coef_ranking = coef_df.T.sort_values(by = "Crisis", ascending = False)
print(f"Log-Odds Coefficients Ranking (Most Positive to Most Negative): \n{coef_ranking}")


Logistic Regression Accuracy: 85.00%
Classification Report (LogReg):
                    precision    recall  f1-score   support

No Food Crisis (0)       0.81      0.68      0.74        31
   Food Crisis (1)       0.86      0.93      0.90        69

          accuracy                           0.85       100
         macro avg       0.84      0.80      0.82       100
      weighted avg       0.85      0.85      0.85       100

Logistic Regression Coefficients (log-odds):
            Year  Population analysed  Population in Phase 3 or above #  \
Crisis  0.256222             0.292867                          0.398037   

        Population in Phase 3 or above %  Country Code_AGO  Country Code_BDI  \
Crisis                          0.479741          0.023972          0.015685   

        Country Code_BEN  Country Code_BFA  Country Code_BGD  \
Crisis         -0.064729          0.018377          0.046335   

        Country Code_CAF  ...  Secondary driver_Crop pests  \
Crisis          0.08

In [None]:
# -------- RANDOM FOREST (RF) --------

# Train/test split (stratified to preserve class ratios)
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(
    X, y_binary, test_size=0.25, random_state = RSEED, stratify = y_binary
)

print(f"\nTrain size: {len(X_train_rf)}   Test size: {len(X_test_rf)}")
print(f"Training set %: {len(X_train_rf) / (len(X_train_rf) + len(X_test_rf)) * 100:.2f}")

# Model
clf = RandomForestClassifier(
    n_estimators = 100,
    random_state = RSEED,
    n_jobs = -1
)

# Fit & predict
clf.fit(X_train_rf, y_train_rf)
y_pred_rf = clf.predict(X_test_rf)



Train size: 374   Test size: 125
Training set %: 74.95


In [None]:
# -------- METRICS & EVALUATION (RF) --------

# Metrics
acc = accuracy_score(y_test_rf, y_pred_rf)
print(f"Random Forest Accuracy: {acc:.3f}")

# Feature importance (sorted)
fi = pd.Series(clf.feature_importances_, index=X.columns).sort_values(ascending=False)
print("Feature importances:")
print(fi)



Random Forest Accuracy: 0.888

Feature importances:
Population in Phase 3 or above #               0.193263
Population in Phase 3 or above %               0.124560
Population analysed                            0.068586
Year                                           0.030265
Region_WEST AFRICA                             0.029825
                                                 ...   
Population in Phase 1 %_0.4364796102421011     0.000000
Population in Phase 1 %_0.45617035223061864    0.000000
Population in Phase 1 %_0.45617178740637193    0.000000
Population in Phase 1 %_0.45803834582368497    0.000000
Population in Phase 1 %_0.35252728120892257    0.000000
Length: 1212, dtype: float64
