In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the export into a DataFrame. header=1 takes the column names from
# row index 1 (0-based) of the file, so the line before it is skipped.
df = pd.read_csv("export-APS-2025-10-07_02-14-41.csv",header = 1)
# Bare last expression: the notebook renders the frame as this cell's output.
df

In [None]:
# Preview the first rows (pandas default: 5) to check the file parsed as expected.
df.head()

In [None]:
# Preview the last rows (pandas default: 5), e.g. to spot trailing junk lines.
df.tail()

In [None]:
# Print per-column dtype and non-null count, plus memory usage.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# List the column labels exactly as parsed; later cells reference them by name.
df.columns

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Total number of cells (rows x columns).
df.size

In [None]:
# Count missing values per column in the freshly loaded frame.
df.isnull().sum()

In [None]:
# Number of rows that exactly repeat an earlier row (all columns equal).
df.duplicated().sum()

In [None]:
# Remove four indicator columns before modelling
# (presumably because they are too sparse to impute -- confirm against the
# null counts shown by the previous cell).
#
# The result is assigned back instead of using inplace=True, and
# errors='ignore' is passed, so the cell can be re-executed safely: with the
# in-place form a second run raised KeyError because the columns were already
# gone. Trade-off: a misspelled column name is now skipped silently, so
# re-check the null counts in the next cell after editing this list.
df = df.drop(
    columns=['Entrepreneurial Employee Activity', 'Motivational Index',
             'Female/Male Opportunity-Driven TEA', 'Innovation'],
    errors='ignore',
)

In [None]:
# Re-count missing values per column now that columns have been dropped.
df.isnull().sum()

In [None]:
# Fill missing values in these two columns with the column mean.
#
# The filled Series is assigned back to the frame. The previous form,
# df[col].fillna(..., inplace=True), mutates an intermediate Series obtained
# through chained indexing: pandas 2.x emits a FutureWarning for it, and with
# Copy-on-Write (the default from pandas 3.0) it leaves `df` unchanged.
#
# NOTE(review): the mean is computed over all rows, i.e. before the
# train/test split further down, so test rows influence the imputed value.
for col in ('High Status to Successful Entrepreneurs',
            'Entrepreneurship as a Good Career Choice'):
    df[col] = df[col].fillna(df[col].mean())

In [None]:
# Verify the imputation: count missing values per column again.
df.isnull().sum()

In [None]:
# Dtypes and non-null counts of the cleaned frame.
df.info()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart with one bar per distinct value of the target column.
# NOTE(review): the histogram cell below labels this column as a percentage,
# so it looks continuous; a count plot may then show mostly height-1 bars --
# confirm this chart is still wanted.
fig, ax = plt.subplots()
sns.countplot(x='Entrepreneurial intentions', data=df, ax=ax)
ax.set_title("Distribution of Entrepreneurship Intention")
plt.show()

In [None]:
# Target: the column the regressors are trained to predict.
y = df['Entrepreneurial intentions']

# Features: every remaining column except 'code' and the target, with
# 'economy' expanded into one-hot indicator columns.
feature_frame = df.drop(columns=['code', 'Entrepreneurial intentions'])
X = pd.get_dummies(feature_frame, columns=['economy'])

In [None]:
# Histogram (20 bins) of the target with a kernel-density overlay.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(y, bins=20, kde=True, ax=ax)
ax.set_title("Distribution of Entrepreneurship Intention")
ax.set_xlabel("Entrepreneurial Intentions (%)")
ax.set_ylabel("Count")
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise features to zero mean / unit variance. The scaler is fitted on
# the training rows only and then applied to both splits, so no test-set
# statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# -------------------------------
# 1️⃣ Import models and metrics
# -------------------------------
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# -------------------------------
# 2️⃣ Train Regression Models
# -------------------------------

# Each regressor is fitted on the scaled training split and its test-set
# predictions are kept for the comparison below. `fit` returns the estimator
# itself, so construction and fitting are chained.

# k-nearest neighbours, k = 5
knn = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

# Support vector regression with an RBF kernel
svr = SVR(kernel='rbf').fit(X_train, y_train)
y_pred_svr = svr.predict(X_test)

# Single decision tree (seeded)
dt = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

# Random forest of 100 trees (seeded)
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# ANN
# Imported here so this section carries everything it adds; both names come
# from the TensorFlow/Keras package this cell already imports from.
from tensorflow.keras import Input
from tensorflow.keras.utils import set_random_seed

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation and
# batch shuffling are repeatable, in line with random_state=42 used for the
# other models. Without this the ANN scores change on every run.
set_random_seed(42)

ann = Sequential([
    # Explicit Input layer (one input per feature column) instead of passing
    # input_shape= to the first Dense layer, which newer Keras warns about.
    Input(shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='linear')  # Linear output for regression
])
ann.compile(optimizer='adam', loss='mse', metrics=['mae'])
# validation_split=0.1 sets aside the last 10% of the training rows for
# per-epoch validation; those rows are not used for weight updates.
ann.fit(X_train, y_train, epochs=100, batch_size=8, verbose=0, validation_split=0.1)
# predict() returns shape (n_samples, 1); flatten to 1-D to match the
# predictions of the scikit-learn models.
y_pred_ann = ann.predict(X_test).flatten()

# -------------------------------
# 3️⃣ Evaluate Models
# -------------------------------
def evaluate_model(y_true, y_pred, name):
    """Print the mean squared error and R² of one model's predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted values, in the same order as ``y_true``.
        name: Label placed at the start of the printed line.

    Returns:
        None. The metrics are only printed, rounded to two decimals.
    """
    mse, r2 = mean_squared_error(y_true, y_pred), r2_score(y_true, y_pred)
    print(f"{name} -> MSE: {mse:.2f}, R²: {r2:.2f}")

# Display name -> test-set predictions. Despite its name, `models` holds the
# prediction arrays, not the fitted estimators.
models = dict(
    zip(
        ("KNN", "SVR", "Decision Tree", "Random Forest", "ANN"),
        (y_pred_knn, y_pred_svr, y_pred_dt, y_pred_rf, y_pred_ann),
    )
)

# Print MSE and R² for every model against the held-out targets.
for name, pred in models.items():
    evaluate_model(y_test, pred, name)

# -------------------------------
# 4️⃣ Pseudo-Accuracy (Optional)
# -------------------------------
def pseudo_accuracy(y_true, y_pred, tol=5):
    """Return the fraction of predictions within ``tol`` of the true value.

    Both inputs are converted to flat float arrays first, so values are
    always paired by position. This avoids two silent failure modes of a
    direct ``y_true - y_pred``:

    * two pandas Series with different indexes are aligned by label, which
      produces NaN differences that count as misses;
    * a column vector of shape (n, 1) against a vector of shape (n,)
      broadcasts to an (n, n) matrix of differences.

    Args:
        y_true: Observed values (list, array or Series).
        y_pred: Predicted values in the same order; shape (n,) or (n, 1).
        tol: Largest absolute error, in target units, that still counts as
            correct. The comparison is inclusive.

    Returns:
        Share of positions with ``|y_true - y_pred| <= tol``, between 0 and 1
        (NaN for empty input).

    Raises:
        ValueError: From NumPy, if the flattened inputs have incompatible
            lengths.
    """
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    within_tol = np.abs(actual - predicted) <= tol
    return np.mean(within_tol)

# Per model: share of test predictions that fall within the default tolerance.
for name in models:
    acc = pseudo_accuracy(y_test, models[name])
    print(f"{name} pseudo-accuracy (±5%): {acc*100:.2f}%")

# -------------------------------
# 5️⃣ Visual Comparison of R² Scores
# -------------------------------
import matplotlib.pyplot as plt

# Compute each model's R² once; the same values drive the bars and the axis range.
r2_by_model = {name: r2_score(y_test, pred) for name, pred in models.items()}

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(list(r2_by_model.keys()), list(r2_by_model.values()), color='skyblue')
ax.axhline(0, color='gray', linewidth=0.8)  # baseline: R² of predicting the mean
ax.set_ylabel("R² Score")
ax.set_title("Comparison of Regression Models")

# R² is negative when a model does worse than predicting the mean. A fixed
# (0, 1) range would clip those bars out of view, so the lower limit is
# extended (with a 10% margin) whenever any score is below zero.
lowest_r2 = min(r2_by_model.values(), default=0.0)
ax.set_ylim(min(0.0, lowest_r2 * 1.1), 1)
plt.show()
