# In [None]:
# 📓 Notebook: Population Prediction Model

# === 1. Import libraries ===
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# === 2. Load the dataset ===
data = pd.read_csv("dane_testowe.csv")
data = data.drop_duplicates()
# Rows with a missing province, population or area are dropped rather than
# imputed.  A 0 placeholder would put an integer into the province column
# (presumably text, which LabelEncoder rejects when mixed with numbers),
# invent a population of zero for the prediction target, and turn a missing
# area into a zero divisor when the population density is computed.
data = data.dropna(subset=['wojewodztwo', 'ludnosc', 'powierzchnia_km2'])
# Gaps in any other column keep the original 0 placeholder.
data = data.fillna(0)

# === 3. Encode categorical variable: province ===
# Turn each province name into an integer code; the fitted encoder stays
# available as `le`.
le = LabelEncoder()
province_codes = le.fit_transform(data['wojewodztwo'])
data['province_encoded'] = province_codes

# === 4. Add new column: population density ===
# Population divided by area (km²), element-wise per row.
data['population_density'] = data['ludnosc'].div(data['powierzchnia_km2'])

# === 5. Data exploration ===
# Histogram of the population column split into 10 bins.
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(data['ludnosc'], bins=10, edgecolor='black')
ax.set_title('Distribution of Population Size')
ax.set_xlabel('Population')
ax.set_ylabel('Number of Cities')
fig.tight_layout()
plt.show()

# Scatter plot: area on the x-axis against population on the y-axis.
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(data['powierzchnia_km2'], data['ludnosc'])
ax.set_title('City Area vs Population')
ax.set_xlabel('Area (km²)')
ax.set_ylabel('Population')
ax.grid(True)
fig.tight_layout()
plt.show()

# Annotated heatmap of the pairwise correlations between the numeric columns.
plt.figure(figsize=(6, 4))
correlations = data.corr(numeric_only=True)
heatmap_ax = sns.heatmap(correlations, annot=True, cmap="coolwarm")
heatmap_ax.set_title("Correlation Matrix")
plt.tight_layout()
plt.show()

# === 6. Ranking provinces by average population ===
# Mean population per province, listed from the highest mean to the lowest.
print("Ranking by average population:")
mean_population = data.groupby('wojewodztwo')['ludnosc'].mean()
print(mean_population.sort_values(ascending=False))

# === 7. Prepare data for machine learning ===
# `population_density` is deliberately NOT a feature: it is computed as
# ludnosc / powierzchnia_km2, so together with the area it reproduces the
# target exactly (target leakage).  A model given it would report an
# optimistic MAE that says nothing about predicting an unknown population.
X = data[['powierzchnia_km2', 'province_encoded']]
y = data['ludnosc']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# === 8. Train and evaluate models ===
# Linear regression: fit on the training split, predict X_test, and report
# the mean absolute error rounded to a whole number.
model_linear = LinearRegression().fit(X_train, y_train)
y_pred_linear = model_linear.predict(X_test)
mae_linear = mean_absolute_error(y_test, y_pred_linear)
print(f"LinearRegression MAE: {round(mae_linear)}")

# Random forest with a fixed seed: same fit / predict / MAE report as above
# steps, evaluated against y_test.
model_rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
print(f"RandomForest MAE: {round(mae_rf)}")