In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
# Using the provided datasets dictionary
datasets = {
    "poverty": r'C:\Users\JFB\Documents\Pythoncsv\Personal\populationinpoverty.csv',
    "economy": r'C:\Users\JFB\Documents\Pythoncsv\Personal\economicgrowth.csv',
    "tourism": r'C:\Users\JFB\Documents\Pythoncsv\Personal\tourismtrips.csv',
    "education": r'C:\Users\JFB\Documents\Pythoncsv\Personal\educationspending.csv',
    "homicides": r'C:\Users\JFB\Documents\Pythoncsv\Personal\homicides.csv',
    "electricity": r'C:\Users\JFB\Documents\Pythoncsv\Personal\electricityaccess.csv',
    "electricityaccess": r'C:\Users\JFB\Documents\Pythoncsv\Personal\accesstoelectricity.csv',
    "agricultural": r'C:\Users\JFB\Documents\Pythoncsv\Personal\agriculturalland.csv',
    "govspending": r'C:\Users\JFB\Documents\Pythoncsv\Personal\gdpgovexpenditure.csv',
    "healthspend": r'C:\Users\JFB\Documents\Pythoncsv\Personal\healthspending.csv',
    "internetuse": r'C:\Users\JFB\Documents\Pythoncsv\Personal\internetuse.csv',
    "undernourishment": r'C:\Users\JFB\Documents\Pythoncsv\Personal\prevalenceofundernourishment.csv',
    "renewables": r'C:\Users\JFB\Documents\Pythoncsv\Personal\renewableelectricity.csv',
    "socspending": r'C:\Users\JFB\Documents\Pythoncsv\Personal\socialspending.csv'
}

# Load all datasets and filter for Honduras
honduras_datasets = {key: pd.read_csv(path).query("Entity == 'Honduras'") for key, path in datasets.items()}

# Merge all the datasets based on 'Entity' and 'Year'
merged_data = honduras_datasets["poverty"]
for key, df in honduras_datasets.items():
    if key != "poverty":  # since we initialized merged_data with "poverty"
        merged_data = pd.merge(merged_data, df, on=["Entity", "Year"], how="outer")

# Display first few rows of the merged data
merged_data.head()

Unnamed: 0,Entity,Year,Share below $1 a day,Share below $2.15 a day,GDP per capita,tourists,education spending,homicides,urban access to electricity percentage,rural access to electricity percentage,access to electricity,Agricultural land hectares,Expense (% of GDP),health spending,Individuals using the Internet (% of population),Prevalence of undernourishment (% of population),Renewable electricity share,Social Expenditure GDP Percentage
0,Honduras,1989,16.559424,39.959561,3166.6462,,,,,,,3371000.0,,,,,,
1,Honduras,1990,18.560977,41.997452,3253.6936,,,495.5328,,,,3320000.0,,,0.0,,,
2,Honduras,1991,9.142681,30.774723,3317.5164,,,703.7932,88.70062,31.257088,54.781937,3342000.0,,,0.0,,,
3,Honduras,1992,6.505092,26.968343,3328.1123,,,955.10614,89.851265,37.0125,58.913113,3355000.0,,,0.0,,,
4,Honduras,1993,5.668213,21.429638,3499.39,,,1252.5227,92.08252,38.717102,61.100155,3548000.0,,,0.0,,,


In [3]:
from sklearn.impute import SimpleImputer

# Impute with mean for all columns
imputer = SimpleImputer(strategy='mean')
honduras_data_imputed = pd.DataFrame(imputer.fit_transform(merged_data))
honduras_data_imputed.columns = honduras_data.columns
honduras_data = honduras_data_imputed

# Display shape after dropping missing values
honduras_data.shape

honduras_data.head()

ValueError: Cannot use mean strategy with non-numeric data:
could not convert string to float: 'Honduras'

In [4]:
plt.figure(figsize=(10, 6))
plt.plot(honduras_data['Year'], honduras_data['Share below $1 a day'], marker='o', linestyle='-')
plt.title('Poverty Rate Over Time in Honduras')
plt.xlabel('Year')
plt.ylabel('Share below $1 a day (%)')
plt.grid(True)
plt.show()

NameError: name 'honduras_data' is not defined

<Figure size 1000x600 with 0 Axes>

In [None]:
correlations = honduras_data.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(correlations, annot=True, cmap='coolwarm', center=0, linewidths=.5)
plt.title("Correlation Matrix")
plt.show()

In [None]:
factors = ['Year', 'Exports (% of GDP)', 'GDP per capita', 'Gross domestic savings (% of GDP)', 'Industry (% of GDP)', 'Tax revenue (% of GDP)']
X = honduras_data[factors]
y = honduras_data['Share below $1 a day']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

In [None]:
print(honduras_data.columns)