In [1]:
"""
# Analyzing Poverty in Honduras

In this notebook, we will analyze the poverty levels in Honduras using multiple indicators such as economic growth, tourism, education spending, and more. We aim to develop a predictive model to estimate the "Share below $1 a day" based on these indicators.
"""

'\n# Analyzing Poverty in Honduras\n\nIn this notebook, we will analyze the poverty levels in Honduras using multiple indicators such as economic growth, tourism, education spending, and more. We aim to develop a predictive model to estimate the "Share below $1 a day" based on these indicators.\n'

In [2]:
import pandas as pd

# Load and merge datasets

datasets = {
    "poverty": r'C:\Users\JFB\Documents\Pythoncsv\Personal\populationinpoverty.csv',
    "economy": r'C:\Users\JFB\Documents\Pythoncsv\Personal\economicgrowth.csv',
    "tourism": r'C:\Users\JFB\Documents\Pythoncsv\Personal\tourismtrips.csv',
    "education": r'C:\Users\JFB\Documents\Pythoncsv\Personal\educationspending.csv',
    "homicides": r'C:\Users\JFB\Documents\Pythoncsv\Personal\homicides.csv',
    "electricity": r'C:\Users\JFB\Documents\Pythoncsv\Personal\electricityaccess.csv',
    "electricityaccess": r'C:\Users\JFB\Documents\Pythoncsv\Personal\accesstoelectricity.csv',
    "agricultural": r'C:\Users\JFB\Documents\Pythoncsv\Personal\agriculturalland.csv',
    "govspending": r'C:\Users\JFB\Documents\Pythoncsv\Personal\gdpgovexpenditure.csv',
    "healthspend": r'C:\Users\JFB\Documents\Pythoncsv\Personal\healthspending.csv',
    "internetuse": r'C:\Users\JFB\Documents\Pythoncsv\Personal\internetuse.csv',
    "undernourishment": r'C:\Users\JFB\Documents\Pythoncsv\Personal\prevalenceofundernourishment.csv',
    "renewables": r'C:\Users\JFB\Documents\Pythoncsv\Personal\renewableelectricity.csv',
    "socspending": r'C:\Users\JFB\Documents\Pythoncsv\Personal\socialspending.csv'
}

honduras_data = pd.read_csv(datasets["poverty"])
honduras_data = honduras_data[honduras_data['Entity'] == 'Honduras']

for key, csv_path in datasets.items():
    if key != "poverty":
        temp_data = pd.read_csv(csv_path)
        temp_data = temp_data[temp_data['Entity'] == 'Honduras']
        honduras_data = honduras_data.merge(temp_data, on=['Year', 'Entity'], how='outer')

honduras_data.head()

Unnamed: 0,Entity,Year,Share below $1 a day,Share below $2.15 a day,GDP per capita,tourists,education spending,homicides,urban access to electricity percentage,rural access to electricity percentage,access to electricity,Agricultural land hectares,Expense (% of GDP),health spending,Individuals using the Internet (% of population),Prevalence of undernourishment (% of population),Renewable electricity share,Social Expenditure GDP Percentage
0,Honduras,1989,16.559424,39.959561,3166.6462,,,,,,,3371000.0,,,,,,
1,Honduras,1990,18.560977,41.997452,3253.6936,,,495.5328,,,,3320000.0,,,0.0,,,
2,Honduras,1991,9.142681,30.774723,3317.5164,,,703.7932,88.70062,31.257088,54.781937,3342000.0,,,0.0,,,
3,Honduras,1992,6.505092,26.968343,3328.1123,,,955.10614,89.851265,37.0125,58.913113,3355000.0,,,0.0,,,
4,Honduras,1993,5.668213,21.429638,3499.39,,,1252.5227,92.08252,38.717102,61.100155,3548000.0,,,0.0,,,


In [3]:
from sklearn.model_selection import train_test_split

# Prepare the data
y = honduras_data['Share below $1 a day'] # change this to your target column name if different
X = honduras_data.drop(columns=['Share below $1 a day', 'Entity', 'Year']) # drop target and non-numeric columns

# Handle NaN values - fill with the mean
X.fillna(X.mean(), inplace=True)

X.head(), y.head()

(   Share below $2.15 a day  GDP per capita       tourists  education spending  \
 0                39.959561       3166.6462  644703.703704           20.538755   
 1                41.997452       3253.6936  644703.703704           20.538755   
 2                30.774723       3317.5164  644703.703704           20.538755   
 3                26.968343       3328.1123  644703.703704           20.538755   
 4                21.429638       3499.3900  644703.703704           20.538755   
 
      homicides  urban access to electricity percentage  \
 0  3764.324146                               96.105879   
 1   495.532800                               96.105879   
 2   703.793200                               88.700620   
 3   955.106140                               89.851265   
 4  1252.522700                               92.082520   
 
    rural access to electricity percentage  access to electricity  \
 0                               54.492805              74.276948   
 1          

In [7]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
honduras_data[factors] = imputer.fit_transform(honduras_data[factors])

NameError: name 'factors' is not defined

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict and calculate MSE
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

In [None]:
import matplotlib.pyplot as plt

# Visualizing the trends for each factor in Honduras
factors = list(X.columns)  # using the columns in X as the factors

for factor in factors:
    plt.figure(figsize=(10, 6))
    plt.plot(honduras_data['Year'], honduras_data[factor])
    plt.title(f'Trend of {factor} in Honduras over the years')
    plt.xlabel('Year')
    plt.ylabel(factor)
    plt.grid(True)
    plt.show()

In [None]:
comparison_countries = ["El Salvador", "Guatemala", "Nicaragua"]

# Load data for the countries
dataframes = {country: pd.read_csv(datasets["poverty"]) for country in comparison_countries}
for country, df in dataframes.items():
    for key, csv_path in datasets.items():
        if key != "poverty":
            temp_data = pd.read_csv(csv_path)
            temp_data = temp_data[temp_data['Entity'] == country]
            df = df.merge(temp_data, on=['Year', 'Entity'], how='outer')
    dataframes[country] = df

# Visualizing the 'Share below $1 a day' trend for comparison countries
plt.figure(figsize=(12, 8))
plt.plot(honduras_data['Year'], honduras_data['Share below $1 a day'], label='Honduras', linewidth=2)
for country, df in dataframes.items():
    plt.plot(df['Year'], df['Share below $1 a day'], label=country)
plt.title('Poverty Trend (Share below $1 a day) over the years in select countries')
plt.xlabel('Year')
plt.ylabel('Share below $1 a day (%)')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
correlations = honduras_data[factors + ['Share below $1 a day']].corr()
poverty_correlations = correlations['Share below $1 a day'].sort_values()

# Displaying the correlations
print(poverty_correlations)