In [None]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Load the data
# Read the CSV into a DataFrame and print a quick overview of its contents.
df = pd.read_csv('data.csv')
print("Dataset shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nDataset info:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line to the output.
df.info()
print("\nBasic statistics:")
print(df.describe())

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Prepare data for regression
# Response variable: the Score column
y = df['Score']

# Predictors - rank, country, and score are deliberately left out
X = df[[
    'GDP per capita',
    'Social support',
    'Healthy life expectancy',
    'Freedom to make life choices',
    'Generosity',
    'Perceptions of corruption',
]]

# Report which columns feed the model and how much data there is
print("Features used in the model:", X.columns.tolist(), "\nTarget variable: Score", sep="\n")
print(f"Number of samples: {len(X)}", f"Number of features: {X.shape[1]}", sep="\n")

# Per-column count of missing entries, then the same for the target
print("\nMissing values:", X.isna().sum(), sep="\n")
print(f"Score missing values: {y.isnull().sum()}")

In [None]:
# Manual train/test split (no sklearn)
def train_test_split_manual(X, y, test_size=0.2, random_state=42):
    """Randomly split features and target into train and test subsets.

    Rows are shuffled with a seeded ``np.random.RandomState`` so the same
    ``random_state`` always yields the same split. The first
    ``int(n * test_size)`` shuffled positions form the test set and the
    remainder the training set.

    Args:
        X: pandas object holding the features (positional ``.iloc`` is used).
        y: pandas object holding the target, with the same length as ``X``.
        test_size: Fraction of rows to place in the test set, within [0, 1].
        random_state: Seed for the shuffling.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.

    Raises:
        ValueError: If ``X`` and ``y`` differ in length or ``test_size`` is
            outside [0, 1].
    """
    n = len(X)
    # Mismatched lengths would otherwise surface as an IndexError from .iloc
    # (or silently ignore trailing rows of y), which hides the real problem.
    if len(y) != n:
        raise ValueError(
            f"X and y must have the same length (got {n} and {len(y)})"
        )
    # A fraction outside [0, 1] would produce an empty or wrapped-around slice
    # instead of an error, so reject it explicitly.
    if not 0.0 <= test_size <= 1.0:
        raise ValueError(f"test_size must be within [0, 1] (got {test_size!r})")

    rnd = np.random.RandomState(random_state)
    perm = rnd.permutation(n)
    test_n = int(n * test_size)  # floor: never takes more than the requested share
    test_idx = perm[:test_n]
    train_idx = perm[test_n:]
    return X.iloc[train_idx], X.iloc[test_idx], y.iloc[train_idx], y.iloc[test_idx]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split_manual(
    X, y, test_size=0.2, random_state=42
)

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")

# Design matrices: a leading column of ones carries the intercept term
X_train_mat = np.column_stack([np.ones(X_train.shape[0]), X_train.values])
X_test_mat = np.column_stack([np.ones(X_test.shape[0]), X_test.values])

# Least-squares coefficients from the Moore-Penrose pseudo-inverse;
# theta[0] is the intercept, theta[1:] line up with the feature columns
theta = np.linalg.pinv(X_train_mat) @ y_train.values

# Fitted values for both subsets
y_train_pred = X_train_mat @ theta
y_test_pred = X_test_mat @ theta

banner = "=" * 50
print("\n" + banner, "MULTIPLE REGRESSION MODEL RESULTS (Manual)", banner, sep="\n")

# One line per feature weight, intercept last
print("\nModel Coefficients:")
for feature, coef in zip(X.columns, theta[1:]):
    print(f"  {feature}: {coef:.6f}")
print(f"  Intercept: {theta[0]:.6f}")

In [None]:
# Manual evaluation metrics - Training Set
def mse(y_true, y_pred):
    """Return the mean squared error between observed and predicted values.

    Both arguments are converted to float NumPy arrays first, so plain lists
    and tuples (which do not support ``-``) are accepted alongside arrays.
    pandas Series are therefore compared by position, not by index label.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values.

    Returns:
        Mean of the squared element-wise differences.
    """
    errors = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    return np.mean(errors ** 2)

def rmse(y_true, y_pred):
    """Return the root mean squared error: the square root of ``mse``.

    Args:
        y_true: Observed values, passed through to ``mse``.
        y_pred: Predicted values, passed through to ``mse``.

    Returns:
        ``math.sqrt`` of the mean squared error, as a Python float.
    """
    mean_squared = mse(y_true, y_pred)
    return math.sqrt(mean_squared)

def mae(y_true, y_pred):
    """Return the mean absolute error between observed and predicted values.

    Both arguments are converted to float NumPy arrays first, so plain lists
    and tuples (which do not support ``-``) are accepted alongside arrays.
    pandas Series are therefore compared by position, not by index label.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values.

    Returns:
        Mean of the absolute element-wise differences.
    """
    errors = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    return np.mean(np.abs(errors))

def r2_score_manual(y_true, y_pred):
    """Return the coefficient of determination (R²) of the predictions.

    Computed as ``1 - SS_res / SS_tot``, where ``SS_res`` is the sum of
    squared residuals and ``SS_tot`` the sum of squared deviations of
    ``y_true`` from its mean. Inputs are converted to float NumPy arrays
    first, so plain lists and tuples are accepted alongside arrays.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values.

    Returns:
        The R² score, or NaN when ``y_true`` has zero total variance
        (the ratio is undefined in that case).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    # Guard the division: a constant target makes the denominator zero.
    if ss_tot == 0:
        return float('nan')
    return 1 - (ss_res / ss_tot)

# Score the fit on the training rows with every metric, in a fixed order
train_mse, train_rmse, train_mae, train_r2 = [
    metric(y_train.values, y_train_pred)
    for metric in (mse, rmse, mae, r2_score_manual)
]

print(
    "\nTraining Set Metrics:",
    f"  Mean Squared Error (MSE): {train_mse:.6f}",
    f"  Root Mean Squared Error (RMSE): {train_rmse:.6f}",
    f"  Mean Absolute Error (MAE): {train_mae:.6f}",
    f"  R² Score: {train_r2:.6f}",
    sep="\n",
)

# Same metrics on the held-out rows
test_mse, test_rmse, test_mae, test_r2 = [
    metric(y_test.values, y_test_pred)
    for metric in (mse, rmse, mae, r2_score_manual)
]

print(
    "\nTesting Set Metrics:",
    f"  Mean Squared Error (MSE): {test_mse:.6f}",
    f"  Root Mean Squared Error (RMSE): {test_rmse:.6f}",
    f"  Mean Absolute Error (MAE): {test_mae:.6f}",
    f"  R² Score: {test_r2:.6f}",
    sep="\n",
)

In [None]:
# Visualization 1: Actual vs Predicted values for Test Set
fig, ax = plt.subplots(figsize=(10, 6))

# Each held-out row becomes one point: actual on x, prediction on y
ax.scatter(y_test, y_test_pred, s=100, alpha=0.6, edgecolors='k')

# Dashed diagonal across the actual-score range marks where prediction == actual
ax.plot(
    [y_test.min(), y_test.max()],
    [y_test.min(), y_test.max()],
    color='r',
    linestyle='--',
    linewidth=2,
    label='Perfect Prediction',
)

ax.set_title('Actual vs Predicted Score (Test Set)', fontsize=14, fontweight='bold')
ax.set_xlabel('Actual Score', fontsize=12)
ax.set_ylabel('Predicted Score', fontsize=12)
ax.grid(True, alpha=0.3)
ax.legend()

plt.tight_layout()
plt.show()

print("Visualization 1: Actual vs Predicted values created")

In [None]:
# Visualization 2: Feature Importance (Coefficient Magnitudes) - using manual theta
# Pair each feature with its weight (theta[0] is the intercept, so skip it),
# then order rows by absolute weight via a temporary helper column.
coefficients_df = (
    pd.DataFrame({'Feature': X.columns, 'Coefficient': theta[1:]})
    .assign(Abs=lambda frame: frame['Coefficient'].abs())
    .sort_values('Abs', ascending=False)
    .drop(columns=['Abs'])
)

# Positive weights are drawn green, everything else red
colors = [
    'red' if not (value > 0) else 'green'
    for value in coefficients_df['Coefficient']
]

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(
    coefficients_df['Feature'],
    coefficients_df['Coefficient'],
    color=colors,
    alpha=0.7,
    edgecolor='k',
)
ax.set_title('Feature Coefficients in Multiple Regression Model (Manual)', fontsize=14, fontweight='bold')
ax.set_xlabel('Coefficient Value', fontsize=12)
ax.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

print("\nVisualization 2: Feature Coefficients created")

In [None]:
# Visualization 3: Residuals Analysis
# Residual = actual minus predicted, for the held-out rows
residuals_test = y_test - y_test_pred

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))

# Left panel: residuals against predictions, with a dashed zero reference line
axes[0].scatter(y_test_pred, residuals_test, s=100, alpha=0.6, edgecolors='k')
axes[0].axhline(y=0, color='r', linestyle='--', linewidth=2)
axes[0].set_title('Residual Plot', fontsize=12, fontweight='bold')
axes[0].set_xlabel('Predicted Score', fontsize=12)
axes[0].set_ylabel('Residuals', fontsize=12)
axes[0].grid(True, alpha=0.3)

# Right panel: histogram of the same residuals
axes[1].hist(residuals_test, bins=15, color='skyblue', alpha=0.7, edgecolor='k')
axes[1].set_title('Distribution of Residuals', fontsize=12, fontweight='bold')
axes[1].set_xlabel('Residuals', fontsize=12)
axes[1].set_ylabel('Frequency', fontsize=12)
axes[1].grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

print("Visualization 3: Residuals Analysis created")

In [None]:
# Visualization 4: Correlation heatmap
fig, ax = plt.subplots(figsize=(10, 8))

# Put the target next to the features so its correlations appear in the same matrix
data_for_corr = pd.concat([X, y], axis=1)
correlation_matrix = data_for_corr.corr()

sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
    ax=ax,
)
ax.set_title('Correlation Matrix - Features and Score', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

print("Visualization 4: Correlation heatmap created")

In [None]:
# Model Equation (manual)
print("\n" + "="*50)
print("REGRESSION EQUATION")
print("="*50)

# Start from the intercept (theta[0]) and append one signed term per feature;
# the sign is printed separately so negative weights read "- 0.1" not "+ -0.1".
equation = f"Score = {theta[0]:.6f}"
for feature, coef in zip(X.columns, theta[1:]):
    sign = "+" if coef >= 0 else "-"
    equation += f" {sign} {abs(coef):.6f} * {feature}"

print(f"\n{equation}")

# Summary statistics
print("\n" + "="*50)
print("MODEL SUMMARY")
print("="*50)
print(f"\nNumber of features: {len(theta) - 1}")
print(f"Number of training samples: {len(X_train)}")
print(f"Number of testing samples: {len(X_test)}")
print(f"\nTraining R² Score: {train_r2:.6f}")
print(f"Testing R² Score: {test_r2:.6f}")
# Use the signed train-minus-test gap rather than its absolute value: only a
# training score clearly above the test score points at overfitting. With
# abs(), a model that scored *better* on the test set was flagged as well.
print(f"\nOverfitting indicator: {train_r2 - test_r2:.6f}")
if train_r2 - test_r2 < 0.05:
    print("Status: Model shows good generalization (low overfitting)")
else:
    print("Status: Model may be overfitting")