In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import joblib


In [3]:
df = pd.read_csv("/home/mpa/python_mihir/archive/global_inflation_countries.csv")
df.head()

Unnamed: 0,country_code,country_name,region,sub_region,intermediate_region,indicator_code,indicator_name,year,inflation_rate
0,AFG,AFGANISTAN,ASIA MERIDIONAL,SOUTHERN ASIA,,FP.CPI.TOTL.ZG,"INFLACION, PRECIOS AL CONSUMIDOR (% ANUAL)",1960,0.0
1,AFG,AFGANISTAN,ASIA MERIDIONAL,SOUTHERN ASIA,,FP.CPI.TOTL.ZG,"INFLACION, PRECIOS AL CONSUMIDOR (% ANUAL)",1961,0.0
2,AFG,AFGANISTAN,ASIA MERIDIONAL,SOUTHERN ASIA,,FP.CPI.TOTL.ZG,"INFLACION, PRECIOS AL CONSUMIDOR (% ANUAL)",1962,0.0
3,AFG,AFGANISTAN,ASIA MERIDIONAL,SOUTHERN ASIA,,FP.CPI.TOTL.ZG,"INFLACION, PRECIOS AL CONSUMIDOR (% ANUAL)",1963,0.0
4,AFG,AFGANISTAN,ASIA MERIDIONAL,SOUTHERN ASIA,,FP.CPI.TOTL.ZG,"INFLACION, PRECIOS AL CONSUMIDOR (% ANUAL)",1964,0.0


In [4]:
df.columns.tolist()
expected_columns = {'region', 'intermediate_region', 'country_name'}


In [5]:
df['region'].fillna('Unknow', inplace=True)
df['intermediate_region'].fillna('Unknow', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['region'].fillna('Unknow', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['intermediate_region'].fillna('Unknow', inplace=True)


In [6]:
df.info()
df.head()
expected_columns = {'region', 'intermediate_region', 'country_name'}

# 🔹 Find missing columns
missing_columns = expected_columns - set(df.columns)

if missing_columns:
    print(f"\n⚠️ Warning: Missing columns - {missing_columns}")
else:
    print("\n✅ All expected columns are present!")

# 🔹 Fill missing values ONLY if the columns exist
for col in expected_columns.intersection(df.columns):
    df[col].fillna('Unknown', inplace=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13760 entries, 0 to 13759
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   country_code         13760 non-null  object 
 1   country_name         13760 non-null  object 
 2   region               13760 non-null  object 
 3   sub_region           13760 non-null  object 
 4   intermediate_region  13760 non-null  object 
 5   indicator_code       13760 non-null  object 
 6   indicator_name       13760 non-null  object 
 7   year                 13760 non-null  int64  
 8   inflation_rate       13760 non-null  float64
dtypes: float64(1), int64(1), object(7)
memory usage: 967.6+ KB

✅ All expected columns are present!


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna('Unknown', inplace=True)


In [7]:
df = pd.get_dummies(df, columns=['region','intermediate_region','country_name'], drop_first=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13760 entries, 0 to 13759
Columns: 233 entries, country_code to country_name_ZIMBABWE
dtypes: bool(227), float64(1), int64(1), object(4)
memory usage: 3.6+ MB


In [8]:
X = df.drop(columns=['inflation_rate'], errors='ignore')  # Drop target safely
Y = df['inflation_rate']

X = X.select_dtypes(include=[np.number])  # Keep only numeric columns


In [9]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

df.head()

Unnamed: 0,country_code,sub_region,indicator_code,indicator_name,year,inflation_rate,region_ASIA MERIDIONAL,region_ASIA ORIENTAL Y EL PACÍFICO (EXCLUIDO ALTOS INGRESOS),region_EUROPA Y ASIA CENTRAL (EXCLUIDO ALTOS INGRESOS),region_ORIENTE MEDIO Y NORTE DE ÁFRICA (EXCLUIDO ALTOS INGRESOS),...,country_name_UCRANIA,country_name_UGANDA,country_name_URUGUAY,country_name_UZBEKISTAN,country_name_VANUATU,country_name_VENEZUELA,country_name_VIET NAM,"country_name_YEMEN, REP. DEL",country_name_ZAMBIA,country_name_ZIMBABWE
0,AFG,SOUTHERN ASIA,FP.CPI.TOTL.ZG,"INFLACION, PRECIOS AL CONSUMIDOR (% ANUAL)",1960,0.0,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,AFG,SOUTHERN ASIA,FP.CPI.TOTL.ZG,"INFLACION, PRECIOS AL CONSUMIDOR (% ANUAL)",1961,0.0,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,AFG,SOUTHERN ASIA,FP.CPI.TOTL.ZG,"INFLACION, PRECIOS AL CONSUMIDOR (% ANUAL)",1962,0.0,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,AFG,SOUTHERN ASIA,FP.CPI.TOTL.ZG,"INFLACION, PRECIOS AL CONSUMIDOR (% ANUAL)",1963,0.0,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,AFG,SOUTHERN ASIA,FP.CPI.TOTL.ZG,"INFLACION, PRECIOS AL CONSUMIDOR (% ANUAL)",1964,0.0,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [10]:
X_scaled = scaler.fit_transform(X)

In [11]:
x_train, x_test, y_train, y_test = train_test_split(X_scaled, Y, test_size=0.2, random_state=42)


In [12]:
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(x_train, y_train)


In [13]:
y_pread = model.predict(x_test)

In [14]:
mae = mean_absolute_error(y_test, y_pread)
mse = mean_squared_error(y_test, y_pread)
print(f"\n📈 MAE : {mae}")
print(f"📉 MSE : {mse}")


📈 MAE : 18.389753351497404
📉 MSE : 7755.154054788699


In [15]:
joblib.dump(model, "inflation_model.pkl")


['inflation_model.pkl']

In [16]:
joblib.dump(scaler, "scaler.pkl")

['scaler.pkl']