In [3]:
pip install pandas

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [6]:
import pandas as pd

# Load the Excel file
df = pd.read_csv('Global_Food_Security_Index.csv')

lpi_data = df[['Rank', 'Country']]
print(lpi_data)
lpi_data.to_csv("analyzed_food_security_by_country.csv", index=False)

       Rank       Country
0       1st       Finland
1       2nd       Ireland
2       3rd        Norway
3       4th        France
4       5th   Netherlands
..      ...           ...
108  =108th    Madagascar
109   110th  Sierra Leone
110   111th         Yemen
111   112th         Haiti
112   113th         Syria

[113 rows x 2 columns]


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns

# Load the CSV files
csi_df = pd.read_csv('analyzed_climate_stress_by_country.csv')
fseci_df = pd.read_csv('analyzed_FSECI_by_country.csv')
lpi_df = pd.read_csv('analyzedLPI_by_country.csv')
rank_df = pd.read_csv('analyzed_food_security_by_country.csv')

# Clean column names (remove extra spaces)
csi_df.columns = csi_df.columns.str.strip()
fseci_df.columns = fseci_df.columns.str.strip()
lpi_df.columns = lpi_df.columns.str.strip()
rank_df.columns = rank_df.columns.str.strip()

# Standardize country names for merging
def clean_country_name(name):
    return name.strip().lower()

csi_df['Country_clean'] = csi_df['Country'].apply(clean_country_name)
fseci_df['Country_clean'] = fseci_df['Country'].apply(clean_country_name)
rank_df['Country_clean'] = rank_df['Country'].apply(clean_country_name)

# For LPI, the country column might be named 'Economy'
if 'Economy' in lpi_df.columns:
    lpi_df['Country_clean'] = lpi_df['Economy'].apply(clean_country_name)
else:
    lpi_df['Country_clean'] = lpi_df['Country'].apply(clean_country_name)

# Merge all datasets
merged_df = rank_df[['Country', 'Country_clean', 'Rank']].copy()
merged_df = merged_df.merge(csi_df[['Country_clean', 'Climate Stress Index']], 
                             on='Country_clean', how='inner')
merged_df = merged_df.merge(fseci_df[['Country_clean', 'FSECI']], 
                             on='Country_clean', how='inner')
merged_df = merged_df.merge(lpi_df[['Country_clean', 'LPI Score']], 
                             on='Country_clean', how='inner')

# Remove the temporary clean column
merged_df = merged_df.drop('Country_clean', axis=1)

print(f"Total countries with complete data: {len(merged_df)}")
print("\nFirst few rows:")
print(merged_df.head())

# Check for missing values
print("\nMissing values:")
print(merged_df.isnull().sum())

# Remove any rows with missing values
merged_df = merged_df.dropna()

# Prepare features (X) and target (y)
X = merged_df[['LPI Score', 'FSECI', 'Climate Stress Index']]
y = merged_df['Rank']

print("\nFeature statistics:")
print(X.describe())

# Split data into train and test sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"\nTraining set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

# Create and train Random Forest model
rf_model = RandomForestRegressor(
    n_estimators=100,      # Number of trees
    max_depth=10,          # Maximum depth of trees
    min_samples_split=5,   # Minimum samples required to split
    min_samples_leaf=2,    # Minimum samples required at leaf node
    random_state=42,
    n_jobs=-1              # Use all CPU cores
)

print("\nTraining Random Forest model...")
rf_model.fit(X_train, y_train)

# Make predictions
y_train_pred = rf_model.predict(X_train)
y_test_pred = rf_model.predict(X_test)

# Calculate metrics
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
train_mae = mean_absolute_error(y_train, y_train_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)

print("\n" + "="*50)
print("MODEL PERFORMANCE")
print("="*50)
print(f"\nTraining Set:")
print(f"  R² Score: {train_r2:.4f}")
print(f"  RMSE: {train_rmse:.2f}")
print(f"  MAE: {train_mae:.2f}")

print(f"\nTest Set:")
print(f"  R² Score: {test_r2:.4f}")
print(f"  RMSE: {test_rmse:.2f}")
print(f"  MAE: {test_mae:.2f}")

# Cross-validation
cv_scores = cross_val_score(rf_model, X, y, cv=5, 
                            scoring='r2', n_jobs=-1)
print(f"\n5-Fold Cross-Validation R² Score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

# Feature importance
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf_model.feature_importances_
}).sort_values('Importance', ascending=False)

print("\n" + "="*50)
print("FEATURE IMPORTANCE")
print("="*50)
print(feature_importance)

# Create visualizations
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# 1. Feature Importance
axes[0, 0].barh(feature_importance['Feature'], feature_importance['Importance'])
axes[0, 0].set_xlabel('Importance')
axes[0, 0].set_title('Feature Importance')
axes[0, 0].invert_yaxis()

# 2. Actual vs Predicted (Test Set)
axes[0, 1].scatter(y_test, y_test_pred, alpha=0.6)
axes[0, 1].plot([y_test.min(), y_test.max()], 
                [y_test.min(), y_test.max()], 
                'r--', lw=2)
axes[0, 1].set_xlabel('Actual Rank')
axes[0, 1].set_ylabel('Predicted Rank')
axes[0, 1].set_title(f'Actual vs Predicted (Test Set)\nR² = {test_r2:.4f}')
axes[0, 1].grid(True, alpha=0.3)

# 3. Residuals Plot
residuals = y_test - y_test_pred
axes[1, 0].scatter(y_test_pred, residuals, alpha=0.6)
axes[1, 0].axhline(y=0, color='r', linestyle='--', lw=2)
axes[1, 0].set_xlabel('Predicted Rank')
axes[1, 0].set_ylabel('Residuals')
axes[1, 0].set_title('Residual Plot')
axes[1, 0].grid(True, alpha=0.3)

# 4. Error Distribution
axes[1, 1].hist(residuals, bins=20, edgecolor='black', alpha=0.7)
axes[1, 1].axvline(x=0, color='r', linestyle='--', lw=2)
axes[1, 1].set_xlabel('Prediction Error')
axes[1, 1].set_ylabel('Frequency')
axes[1, 1].set_title('Distribution of Prediction Errors')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('rf_model_results.png', dpi=300, bbox_inches='tight')
print("\nVisualization saved as 'rf_model_results.png'")

# Create detailed predictions dataframe
test_results = pd.DataFrame({
    'Country': merged_df.loc[y_test.index, 'Country'].values,
    'Actual_Rank': y_test.values,
    'Predicted_Rank': y_test_pred,
    'Error': np.abs(y_test.values - y_test_pred),
    'LPI_Score': X_test['LPI Score'].values,
    'FSECI': X_test['FSECI'].values,
    'Climate_Stress_Index': X_test['Climate Stress Index'].values
})

test_results = test_results.sort_values('Actual_Rank')

print("\n" + "="*50)
print("TEST SET PREDICTIONS (sorted by actual rank)")
print("="*50)
print(test_results.to_string(index=False))

# Save predictions to CSV
test_results.to_csv('predictions.csv', index=False)
print("\nPredictions saved to 'predictions.csv'")

# Correlation analysis
print("\n" + "="*50)
print("CORRELATION ANALYSIS")
print("="*50)
correlation_df = merged_df[['Rank', 'LPI Score', 'FSECI', 'Climate Stress Index']].corr()
print(correlation_df)

# Create correlation heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_df, annot=True, cmap='coolwarm', center=0, 
            square=True, linewidths=1, cbar_kws={"shrink": 0.8})
plt.title('Correlation Matrix')
plt.tight_layout()
plt.savefig('correlation_matrix.png', dpi=300, bbox_inches='tight')
print("\nCorrelation matrix saved as 'correlation_matrix.png'")

plt.show()

print("="*50)
print("ANALYSIS COMPLETE!")
print("="*50)

Total countries with complete data: 82

First few rows:
       Country Rank  Climate Stress Index      FSECI  LPI Score
0      Finland    1              0.448973  59.695265        4.2
1      Ireland    2              0.527122  58.139089        3.6
2       Norway    3              0.480954  57.615069        3.7
3       France    4              0.547296  61.050421        3.9
4  Netherlands    5              0.537096  58.373274        4.1

Missing values:
Country                 0
Rank                    0
Climate Stress Index    0
FSECI                   0
LPI Score               0
dtype: int64

Feature statistics:
       LPI Score      FSECI  Climate Stress Index
count  82.000000  82.000000             82.000000
mean    3.174390  56.748611              0.505519
std     0.590001   3.269983              0.027558
min     2.100000  41.669479              0.448973
25%     2.700000  55.134345              0.492047
50%     3.200000  57.068107              0.505130
75%     3.700000  58.370608  

ValueError: could not convert string to float: '=14'