# India Inflation Analysis - Trying to predict the unpredictable

This notebook attempts to model India's inflation using a few basic economic indicators. A linear regression model (probably too simple, but let's see) is fitted to data from 2010-2023, and forecasts out to 2030 are then examined.

**Warning:** The 2020-2021 period is going to mess with our model big time due to COVID disruptions.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import numpy as np  # might need this for some calculations
# from sklearn.preprocessing import StandardScaler  # keeping this handy in case

# Global plot styling for every figure below.
# Order matters: the matplotlib style is applied first, then seaborn's "husl"
# palette is set on top of it as the default colour cycle.
plt.style.use('default')
sns.set_palette("husl")

In [None]:
def _read_dated_excel(path):
    """Read an Excel sheet and parse its 'Date' column into datetimes."""
    frame = pd.read_excel(path)
    frame['Date'] = pd.to_datetime(frame['Date'])
    return frame


# Historical observations used throughout the notebook.
india_inflation_data = _read_dated_excel('raw_data_2010_2023.xlsx')

# Forecast sheet; these values are taken as given, so treat them with caution.
future_predictions = _read_dated_excel('forecast_data_2025_2030.xlsx')

In [None]:
# Quick sanity check on the historical data: dimensions, total count of
# missing cells across all columns, and a preview of the first rows.
rows_and_cols = india_inflation_data.shape
missing_cells = india_inflation_data.isna().sum().sum()

print(f"Historical data shape: {rows_and_cols}")
print(f"Any missing values? {missing_cells}")
print("\nFirst few rows:")
# Kept as the last expression so the notebook renders the preview table.
india_inflation_data.head()

In [None]:
# Summarise the spread of the observed inflation series.
inflation_series = india_inflation_data['Inflation (%)']
summary_stats = (
    ("Min", inflation_series.min()),
    ("Max", inflation_series.max()),
    ("Average", inflation_series.mean()),
    ("Std Dev", inflation_series.std()),
)

print("Inflation stats (2010-2023):")
for stat_label, stat_value in summary_stats:
    print(f"{stat_label}: {stat_value:.2f}%")

## How well did our model do? (Actual vs Predicted 2010-2023)

In [None]:
# Compare observed inflation with the 'Predicted Inflation (%)' column of the
# historical dataset, and shade the 2020-2021 COVID window for reference.
fig, ax = plt.subplots(figsize=(14, 6))

# Plot actual inflation with a thicker line since it's the real deal
ax.plot(india_inflation_data['Date'], india_inflation_data['Inflation (%)'],
        label='Actual Inflation', color='darkred', linewidth=2.5)

# Model predictions are drawn dashed and slightly transparent since they're
# just estimates
ax.plot(india_inflation_data['Date'], india_inflation_data['Predicted Inflation (%)'],
        label='Model Predictions', linestyle='--', color='steelblue', linewidth=2, alpha=0.8)

# Highlight the COVID period
ax.axvspan(pd.to_datetime('2020-01-01'), pd.to_datetime('2021-12-31'),
           alpha=0.2, color='orange', label='COVID Period')

ax.set_title('Actual vs Predicted Inflation (2010-2023) - Linear Regression Model', fontsize=14)
ax.set_xlabel('Year')
ax.set_ylabel('Inflation Rate (%)')
ax.grid(True, alpha=0.3)

# Build the legend only after every labelled artist exists; a legend created
# earlier would not pick up the 'COVID Period' span added afterwards.
ax.legend()

plt.tight_layout()
plt.show()

## What we think might happen (2025-2030 Forecast)

Take these predictions with a huge grain of salt - economic forecasting is notoriously difficult, especially 5+ years out.

In [None]:
# Plot the 2025-2030 forecast with a fixed +/-0.5 band around it.
forecast_dates = future_predictions['Date']
forecast_values = future_predictions['Predicted Inflation (%)']

plt.figure(figsize=(12, 6))

# Markers make it clear these are discrete forecast points, not continuous data
plt.plot(
    forecast_dates,
    forecast_values,
    marker='o',
    markersize=6,
    label='Forecasted Inflation',
    color='forestgreen',
    linewidth=2,
    linestyle='-',
)

# Shaded band: a constant offset above and below the forecast, not a
# statistically derived interval
band_lower = forecast_values - 0.5
band_upper = forecast_values + 0.5
plt.fill_between(
    forecast_dates,
    band_lower,
    band_upper,
    alpha=0.2,
    color='forestgreen',
    label='Uncertainty Range (±0.5%)',
)

plt.xlabel('Year')
plt.ylabel('Predicted Inflation (%)')
plt.title('Inflation Forecast 2025-2030 (Proceed with Caution)', fontsize=14)
plt.grid(True, alpha=0.3)
plt.legend()
plt.tight_layout()
plt.show()

average_forecast = forecast_values.mean()
print(f"Average predicted inflation (2025-2030): {average_forecast:.2f}%")

## How do our economic indicators relate to each other?

In [None]:
# Keep only the numeric columns (this leaves out the datetime 'Date' column)
# and compute their pairwise correlations.
numeric_data = india_inflation_data.select_dtypes(include=[np.number])
correlation_matrix = numeric_data.corr()

# Tried coolwarm but YlOrRd looks better for this data
heatmap_options = {
    'annot': True,
    'cmap': 'YlOrRd',
    'fmt': '.2f',
    'square': True,
    'linewidths': 0.5,
}

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix - How Economic Indicators Move Together', fontsize=14)
plt.tight_layout()
plt.show()

# Rank every numeric column by the magnitude of its correlation with inflation
inflation_corr = correlation_matrix['Inflation (%)'].sort_values(key=abs, ascending=False)

print("\nCorrelation with Inflation (strongest to weakest):")
for factor, corr in inflation_corr.items():
    # Skip inflation's correlation with itself
    if factor == 'Inflation (%)':
        continue
    print(f"{factor}: {corr:.3f}")

## Economic Indicators Over Time - The Story Behind the Numbers

In [None]:
# Draw the three indicators plus inflation itself on a 2x2 grid of panels.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Key Economic Indicators (2010-2023)', fontsize=16)

# One entry per panel, in row-major order:
# (data column, panel title, y-axis label, line colour)
panel_specs = [
    # Oil prices - always volatile
    ('Crude Oil Price (USD)', 'Crude Oil Price (USD)', 'Price (USD)', 'black'),
    # Food prices - critical for India
    ('Food Price Index', 'Food Price Index', 'Index Value', 'brown'),
    # Exchange rate - rupee weakness = higher inflation usually
    ('INR/USD Exchange Rate', 'INR/USD Exchange Rate', 'INR per USD', 'orange'),
    # Inflation itself
    ('Inflation (%)', 'Inflation Rate', 'Inflation (%)', 'red'),
]

for position, (panel, spec) in enumerate(zip(axes.flat, panel_specs)):
    column, panel_title, y_label, line_color = spec
    panel.plot(india_inflation_data['Date'], india_inflation_data[column],
               color=line_color, linewidth=2)
    panel.set_title(panel_title)
    panel.grid(True, alpha=0.3)
    panel.set_ylabel(y_label)
    # Only the bottom row (positions 2 and 3) carries the x-axis label
    if position >= 2:
        panel.set_xlabel('Year')

plt.tight_layout()
plt.show()

## Model Performance Check - How good is our simple linear regression?

Spoiler alert: probably not great, but let's quantify it.

In [None]:
# Predictors and target for the regression
feature_columns = ['Crude Oil Price (USD)', 'Food Price Index', 'INR/USD Exchange Rate']
X = india_inflation_data[feature_columns]
y = india_inflation_data['Inflation (%)']

# Fit an ordinary least-squares model and predict on the same rows it was
# fitted on, so every metric below is an in-sample figure.
lr_model = LinearRegression().fit(X, y)
predictions = lr_model.predict(X)

# Performance metrics, all derived from the same error vector
fit_errors = y - predictions
r2 = r2_score(y, predictions)
mse = np.mean(fit_errors ** 2)
rmse = np.sqrt(mse)
mae = np.mean(np.abs(fit_errors))

print("Model Performance:")
print(f"R² Score: {r2:.3f} ({r2*100:.1f}% of variance explained)")
print(f"RMSE: {rmse:.3f} percentage points")
print(f"Mean Absolute Error: {mae:.3f} percentage points")

# Raw coefficients on unscaled features, paired with their column names
print("\nFeature Importance (coefficients):")
coefficient_by_feature = dict(zip(feature_columns, lr_model.coef_))
for feature, coef in coefficient_by_feature.items():
    print(f"{feature}: {coef:.4f}")

print(f"\nIntercept: {lr_model.intercept_:.4f}")

In [None]:
# Look at prediction errors (actual minus predicted) over time.
residuals = y - predictions
abs_residuals = np.abs(residuals)

# Errors larger than this many percentage points are highlighted in the plot
# and counted in the summary; defined once so the mask, legend label and
# printed count cannot drift apart.
LARGE_ERROR_THRESHOLD_PP = 2

# Scale marker area with the absolute error so the title's
# "Bigger dots = bigger errors" claim actually holds. A point sitting exactly
# at the threshold gets area 100.
dot_sizes = 20 + 40 * abs_residuals

plt.figure(figsize=(12, 5))
plt.scatter(india_inflation_data['Date'], residuals, s=dot_sizes,
            alpha=0.6, color='purple')
plt.axhline(y=0, color='red', linestyle='--', alpha=0.7)
plt.title('Model Residuals Over Time (Bigger dots = bigger errors)', fontsize=14)
plt.xlabel('Year')
plt.ylabel('Prediction Error (Actual - Predicted)')
plt.grid(True, alpha=0.3)

# Highlight periods with large errors by redrawing those points in red
large_errors = abs_residuals > LARGE_ERROR_THRESHOLD_PP
if large_errors.any():
    error_dates = india_inflation_data['Date'][large_errors]
    error_values = residuals[large_errors]
    plt.scatter(error_dates, error_values, color='red',
                s=dot_sizes[large_errors], alpha=0.8,
                label=f'Large Errors (>{LARGE_ERROR_THRESHOLD_PP}pp)')
    plt.legend()

plt.tight_layout()
plt.show()

print(f"Largest prediction error: {np.max(abs_residuals):.2f} percentage points")
print(f"Number of predictions off by >{LARGE_ERROR_THRESHOLD_PP}pp: {np.sum(large_errors)}")

## Key Takeaways and Issues

**What worked:**
- The model captures general trends reasonably well
- Strong correlation between some economic indicators and inflation

**What didn't work:**
- Linear regression is probably too simple for this complex economic relationship
- Missing crucial factors: monetary policy, GDP growth, supply chain disruptions
- 2020-2021 COVID period messed up our predictions badly
- Exchange rate and oil prices might have non-linear effects

**Bottom line:** This model gives us a starting point, but I am not confident in these 2030 predictions!