In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error

from sklearn.preprocessing import MinMaxScaler

# Directory where the CSV files are saved
csv_files_directory = './'  # Change this to the directory where the CSV files are saved

# Get a list of all CSV files that end with '_pop_data.csv'
csv_files = [file for file in os.listdir(csv_files_directory) if file.endswith('.csv')]

# Initialize a list to store DataFrames for each country
country_dataframes = []

# Loop through the CSV files and read them into DataFrames
for csv_file in csv_files:
    country_df = pd.read_csv(csv_file)
    # print(os.path.splitext(csv_file)[0])
    # country_df = country_df.sort_values(by='Year', ascending=True)
    country_dataframes.append(country_df)



# Initialize a list to store the MAE and MSE for each country
mae_results_rf = []
mse_results_rf = []
mae_results_lr = []
mse_results_lr = []
mape_results_lr = []
mape_results_rf = []

In [3]:
import matplotlib.pyplot as plt
from matplotlib import pyplot

# Loop through each DataFrame and perform the required operations
for df, csv_file in zip(country_dataframes, csv_files):
    # Extract the country name from the CSV file name
    country_name = os.path.splitext(csv_file)[0]
    print("Processing:", country_name)

    df = df.sort_values(by='Year', ascending=True)
    year = df['Year'].astype(str)
    df['Year'] = pd.to_datetime(year)
    # Set 'date' column as the index
    df.set_index('Year', inplace=True)


    df['Prediction']=df[['Population']].shift(-6)
    X = np.array(df.drop(labels=['Prediction'],axis=1))
    X=X[:-6]
    y = np.array(df['Prediction'])
    y=y[:-6]

    # Assuming the 'Year' column contains the year information and 'Population' is the target variable
    # X = df[['Year']].values
    # y = df['Population'].values

    # Normalize the data
    scaler = MinMaxScaler()
    # X_scaled = scaler.fit_transform(X)
    # y_scaled = scaler.fit_transform(y.reshape(-1, 1)).flatten()

    # Splitting the data into training and testing sets
    # print(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Creating the Random Forest model
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

    # Training the Random Forest model
    rf_model.fit(X_train, y_train)

    # Making predictions with the Random Forest model
    y_pred_rf = rf_model.predict(X_test)

    # Evaluating the Random Forest model
    mse_rf = mean_squared_error(y_test, y_pred_rf)
    mae_rf = mean_absolute_error(y_test, y_pred_rf)
    mape_rf =mean_absolute_percentage_error(y_test, y_pred_rf)
    # Creating the Linear Regression model
    lr_model = LinearRegression()

    # Training the Linear Regression model
    lr_model.fit(X_train, y_train)

    # Making predictions with the Linear Regression model
    y_pred_lr = lr_model.predict(X_test)

    # Evaluating the Linear Regression model
    mse_lr = mean_squared_error(y_test, y_pred_lr)
    mae_lr = mean_absolute_error(y_test, y_pred_lr)
    mape_lr=mean_absolute_percentage_error(y_test, y_pred_lr)


    # Save predictions and evaluation metrics for both models
    y_pred_rf_df = pd.DataFrame({'Year': X_test[:, 0], 'Population': y_pred_rf})
    # y_pred_rf_df.to_csv(f'{country_name}_y_pred_rf.csv', index=False)

    y_pred_lr_df = pd.DataFrame({'Year': X_test[:, 0], 'Population': y_pred_lr})
    # y_pred_lr_df.to_csv(f'{country_name}_y_pred_lr.csv', index=False)

    results_rf_df = pd.DataFrame({'MSE': [mse_rf], 'MAE': [mae_rf]})
    # results_rf_df.to_csv(f'{country_name}_results_rf.csv', index=False)

    results_lr_df = pd.DataFrame({'MSE': [mse_lr], 'MAE': [mae_lr]})
    # results_lr_df.to_csv(f'{country_name}_results_lr.csv', index=False)


    # Append MAE and MSE for both models to the corresponding lists
    mae_results_rf.append(mae_rf)
    mse_results_rf.append(mse_rf)
    mape_results_rf.append(mse_rf)

    mae_results_lr.append(mae_lr)
    mse_results_lr.append(mse_lr)

    mape_results_lr.append(mse_lr)

    # print(mape_results_lr)
    forecast=np.array(df.drop(labels=['Prediction'],axis=1))[-6:]

    pred=lr_model.predict(forecast)
    # Round the prediction values to integers (without decimal places)
    pred = pred.astype(int)

    # Creating a DataFrame for the forecast values with corresponding dates
    forecast_dates = pd.date_range(start=df.index[-1], periods=len(pred)+1, freq='A')[-len(pred):]
    forecast_df = pd.DataFrame({'Population': pred}, index=forecast_dates)

    # Merging the original DataFrame and the forecast DataFrame
    merged_df = pd.concat([df, forecast_df])

    merged_df.to_csv(f'pop_forecast_final_{country_name}.csv')# print(mae_results_rf)

    # Plot the predictions from both models
    plt.figure(figsize=(10, 6))
    pyplot.plot(y_test, color='b', label='True Data')
    pyplot.plot(y_pred_rf, color='r', label='Random Forest Predictions')
    pyplot.plot(y_pred_lr, color='g', label='Linear Regression Predictions')


    # Plot actual and predicted POP values

    plt.title(f'{country_name} Population Prediction')
    plt.legend()
    plt.savefig(f'{country_name}_predictions_plot.png')
    plt.close()


# Calculate the average MSE for both models
avg_mse_rf = np.mean(mse_results_rf)
avg_mse_lr = np.mean(mse_results_lr)
avg_mape_lr = np.mean(mape_results_lr)

print("Average MSE for Random Forest:", avg_mse_rf)
print("Average MSE for Linear Regression:", avg_mse_lr)
print("Average MSE for Linear Regression:", avg_mape_lr)

Processing: ethiopia
Processing: netherlands
Processing: iraq
Processing: bahamas
Processing: barbados
Processing: albania
Processing: lebanon
Processing: singapore
Processing: peru
Processing: eritrea
Processing: chad
Processing: denmark
Processing: palau
Processing: belarus
Processing: zimbabwe
Processing: italy
Processing: new-zealand
Processing: liechtenstein
Processing: romania
Processing: northern-mariana-islands
Processing: chile
Processing: lithuania
Processing: slovenia
Processing: norway
Processing: mexico
Processing: botswana
Processing: united-states
Processing: belize
Processing: japan
Processing: malta
Processing: uruguay
Processing: ireland
Processing: finland
Processing: croatia
Processing: malawi
Processing: fiji
Processing: jordan
Processing: niger
Processing: qatar
Processing: azerbaijan
Processing: germany
Processing: togo
Processing: gambia
Processing: hong-kong
Processing: bulgaria
Processing: canada
Processing: guyana
Processing: united-kingdom
Processing: yemen
