In [7]:
import pickle

# Load preprocessed data
with open('preprocessed_data.pkl', 'rb') as f:
    data = pickle.load(f)
    
X_train = data['X_train']
X_test = data['X_test']
y_train = data['y_train']
sub_df_train = data['sub_df_train']
sub_df_test = data['sub_df_test']

# Verify shapes
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")


X_train shape: (29793, 17)
X_test shape: (94056, 17)
y_train shape: (29793,)

X_train head:
   views  remote_allowed  formatted_work_type_Full-time  \
0   20.0               0                           True   
1    1.0               0                           True   
2    8.0               0                           True   
3   16.0               0                           True   
4    3.0               0                           True   

   formatted_work_type_Internship  formatted_work_type_Other  \
0                           False                      False   
1                           False                      False   
2                           False                      False   
3                           False                      False   
4                           False                      False   

   formatted_work_type_Part-time  formatted_work_type_Temporary  \
0                          False                          False   
1                          False  

In [11]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate


# 1. Set up evaluation metrics (no y_test needed)
scoring = {
    'mse': make_scorer(mean_squared_error),
    'r2': make_scorer(r2_score),
    'neg_mae': 'neg_mean_absolute_error'
}

# 2. Linear Regression
print("\n=== Linear Regression (Cross-Validation) ===")
lr = LinearRegression()
lr_scores = cross_validate(lr, X_train, y_train, 
                         scoring=scoring, cv=5)
print(f"Avg MSE: {np.mean(lr_scores['test_mse']):.2f}")
print(f"Avg R²: {np.mean(lr_scores['test_r2']):.2f}")

# 3. Decision Tree Regressor
print("\n=== Decision Tree (Cross-Validation) ===")
dt = DecisionTreeRegressor(
    max_depth=5,        # Prevent overfitting
    min_samples_split=5,
    random_state=42
)
dt_scores = cross_validate(dt, X_train, y_train,
                          scoring=scoring, cv=5)
print(f"Avg MSE: {np.mean(dt_scores['test_mse']):.2f}")
print(f"Avg R²: {np.mean(dt_scores['test_r2']):.2f}")

# 4. Random Forest Regressor
print("\n=== Random Forest (Cross-Validation) ===")
rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=5,
    min_samples_split=5,
    random_state=42
)
rf_scores = cross_validate(rf, X_train, y_train,
                          scoring=scoring, cv=5)
print(f"Avg MSE: {np.mean(rf_scores['test_mse']):.2f}")
print(f"Avg R²: {np.mean(rf_scores['test_r2']):.2f}")

# 5. K-Nearest Neighbors (with scaling)
print("\n=== KNN (Cross-Validation with Scaling) ===")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

knn = KNeighborsRegressor(
    n_neighbors=5,
    weights='uniform'
)
knn_scores = cross_validate(knn, X_train_scaled, y_train,
                          scoring=scoring, cv=5)
print(f"Avg MSE: {np.mean(knn_scores['test_mse']):.2f}")
print(f"Avg R²: {np.mean(knn_scores['test_r2']):.2f}")

# 6. Support Vector Regressor (with scaling)
print("\n=== SVR (Cross-Validation with Scaling) ===")
svr = SVR(
    kernel='rbf',
    C=1.0,
    epsilon=0.1
)
svr_scores = cross_validate(svr, X_train_scaled, y_train,
                          scoring=scoring, cv=5)
print(f"Avg MSE: {np.mean(svr_scores['test_mse']):.2f}")
print(f"Avg R²: {np.mean(svr_scores['test_r2']):.2f}")

# 7. Compile all results
results = pd.DataFrame({
    'Model': ['Linear Regression', 'Decision Tree', 'Random Forest', 'KNN', 'SVR'],
    'Avg MSE': [
        np.mean(lr_scores['test_mse']),
        np.mean(dt_scores['test_mse']),
        np.mean(rf_scores['test_mse']),
        np.mean(knn_scores['test_mse']),
        np.mean(svr_scores['test_mse'])
    ],
    'Avg R²': [
        -np.mean(lr_scores['test_r2']),
        -np.mean(dt_scores['test_r2']),
        -np.mean(rf_scores['test_r2']),
        -np.mean(knn_scores['test_r2']),
        -np.mean(svr_scores['test_r2'])
    ]
})

print("\n=== Final Comparative Results (Cross-Validated) ===")
print(results.sort_values(by='Avg R²', ascending=False))


=== Linear Regression (Cross-Validation) ===
Avg MSE: 8597419.25
Avg R²: -0.00

=== Decision Tree (Cross-Validation) ===
Avg MSE: 8698060.52
Avg R²: -0.07

=== Random Forest (Cross-Validation) ===
Avg MSE: 8709135.26
Avg R²: -0.06

=== KNN (Cross-Validation with Scaling) ===
Avg MSE: 10174063.88
Avg R²: -0.45

=== SVR (Cross-Validation with Scaling) ===
Avg MSE: 8596672.55
Avg R²: -0.00

=== Final Comparative Results (Cross-Validated) ===
               Model       Avg MSE    Avg R²
3                KNN  1.017406e+07  0.449717
1      Decision Tree  8.698061e+06  0.066139
2      Random Forest  8.709135e+06  0.056550
0  Linear Regression  8.597419e+06  0.004694
4                SVR  8.596673e+06  0.000882


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.preprocessing import StandardScaler

# Prepare classification data (using your preprocessed variables)
y_class = sub_df_train['remote_allowed']
X_class = sub_df_train.drop(['max_salary', 'remote_allowed'], axis=1)

# Split into train/test (80/20)
X_class_train, X_class_test, y_class_train, y_class_test = train_test_split(
    X_class, y_class, test_size=0.2, random_state=42)

# Scale features for SVC (Naive Bayes doesn't need scaling)
scaler = StandardScaler()
X_class_train_scaled = scaler.fit_transform(X_class_train)
X_class_test_scaled = scaler.transform(X_class_test)

# Initialize models
models = {
    "Naive Bayes": GaussianNB(),
    "Support Vector Classifier": SVC(kernel='linear', C=1.0, random_state=42)
}

# Train and evaluate
print("=== Classification Results ===")
for name, model in models.items():
    if name == "Support Vector Classifier":
        model.fit(X_class_train_scaled, y_class_train)
        y_pred = model.predict(X_class_test_scaled)
    else:
        model.fit(X_class_train, y_class_train)
        y_pred = model.predict(X_class_test)
    
    accuracy = accuracy_score(y_class_test, y_pred)
    f1 = f1_score(y_class_test, y_pred)
    
    print(f"\n**{name}**")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("Classification Report:")
    print(classification_report(y_class_test, y_pred))

=== Classification Results ===

**Naive Bayes**
Accuracy: 0.5469
F1 Score: 0.3038
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.52      0.66      5096
           1       0.20      0.68      0.30       863

    accuracy                           0.55      5959
   macro avg       0.55      0.60      0.48      5959
weighted avg       0.80      0.55      0.61      5959


**Support Vector Classifier**
Accuracy: 0.8552
F1 Score: 0.0000
Classification Report:
              precision    recall  f1-score   support

           0       0.86      1.00      0.92      5096
           1       0.00      0.00      0.00       863

    accuracy                           0.86      5959
   macro avg       0.43      0.50      0.46      5959
weighted avg       0.73      0.86      0.79      5959



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
