In [1]:
import warnings
warnings.filterwarnings('ignore')

# Import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

# Ensure inline plotting
%matplotlib inline

# Use 'Agg' backend for matplotlib
matplotlib.use('Agg')

In [2]:
file_path = r'C:\Users\dell\Desktop\MyDocs\Docs\MK\estat_nama_10_exi.csv'
df = pd.read_csv(file_path, delimiter=',', encoding='ascii')

# Display the first few rows of the dataset
df.head()

Unnamed: 0,"frq,unit,na_itm,go\TIM_RIOD",1975,1976,1977,1978,1979,1980,1981,1982,1983,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,"A,CLV05_MUR,6,AL",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3678.6 b,2658.7,4043.8,4733.0,5180.4
1,"A,CLV05_MUR,6,AT",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,157376.2,162139.3,166991.9,175173.8,184196.9,191629.7,171529.9,187785.2,206557.2,205686.1
2,"A,CLV05_MUR,6,B",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,281452.4,291781.0,309709.6,326794.0,331842.9,341587.6,321236.3,368303.2,389601.6,361811.8
3,"A,CLV05_MUR,6,BG",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,17163.7,18275.7,19849.7,20989.7,21353.5,22206.8,20073.0,22396.4,25098.6,25098.6
4,"A,CLV05_MUR,6,CH",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,260968.2,271469.0,287696.5,286612.0,296279.1,294269.7,278849.4,317178.4,336538.3,338915.7


In [3]:
df.isnull().sum()

frq,unit,na_itm,go\TIM_RIOD    0
1975                           0
1976                           0
1977                           0
1978                           0
1979                           0
1980                           0
1981                           0
1982                           0
1983                           0
1984                           0
1985                           0
1986                           0
1987                           0
1988                           0
1989                           0
1990                           0
1991                           0
1992                           0
1993                           0
1994                           0
1995                           0
1996                           0
1997                           0
1998                           0
1999                           0
2000                           0
2001                           0
2002                           0
2003                           0
2004      

In [4]:
for col in df.columns[1:]:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Impute missing values with the mean of each column
imputer = SimpleImputer(strategy='mean')
df.iloc[:, 1:] = imputer.fit_transform(df.iloc[:, 1:])

# Verify that there are no more missing values
df.isnull().sum().sum()

0

In [5]:
numeric_df = df.select_dtypes(include=[np.number])

# Plot a heatmap of the correlation matrix
plt.figure(figsize=(12, 8))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

In [6]:
X = numeric_df.drop(columns=['2023'])
y = numeric_df['2023']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

mse, r2

(280468224272.6808, 0.9850889034634739)