In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import SelectKBest, f_regression

# Step 1: Simulate a small tabular regression dataset (100 rows).
# NOTE: 'Target' is drawn independently of every feature column, so there is
# no signal to learn from this frame; an R² at or below zero is expected.
np.random.seed(42)  # legacy global seed, kept so the simulated values are unchanged
df = pd.DataFrame({
    'Age': np.random.randint(20, 60, 100),
    # Created as float because NaN is assigned into this column below;
    # writing NaN into an int64 column is a deprecated silent upcast in pandas.
    'Salary': np.random.randint(30000, 120000, 100).astype(float),
    'Experience': np.random.randint(0, 20, 100),
    'Education': np.random.choice(['High School', 'Bachelors', 'Masters', 'PhD'], 100),
    'City': np.random.choice(['NY', 'LA', 'SF', 'Chicago'], 100),
    'Target': np.random.uniform(50000, 150000, 100)  # Regression target
})

# Add some missing values and outliers
df.loc[5:10, 'Salary'] = np.nan  # .loc slices are label-based and inclusive: rows 5..10 (6 rows)
df.loc[3, 'Age'] = 150  # outlier

# Step 2: Column groups, each routed to its own preprocessing branch
categorical_features = ['Education', 'City']
numerical_features = ['Age', 'Salary', 'Experience']

# Step 3: Per-type preprocessing
# Numeric branch: fill gaps with the column median, then standardize.
numerical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Step 4: Apply each branch to its column group
preprocessor = ColumnTransformer([
    ("num", numerical_pipeline, numerical_features),
    ("cat", categorical_pipeline, categorical_features),
])

# Step 5: Preprocessing -> feature selection -> linear model as one estimator.
# k='all' keeps every feature; use e.g. k=5 to keep only the top 5.
model_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("feature_selection", SelectKBest(score_func=f_regression, k='all')),
    ("regressor", LinearRegression()),
])

# Step 6: Separate the target from the features and hold out 20% for testing
y = df["Target"]
X = df.drop(columns=["Target"])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 7 + 8: Fit on the training split, then score the held-out split
y_pred = model_pipeline.fit(X_train, y_train).predict(X_test)
print("R² Score:", r2_score(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))

# Optional: 5-fold cross-validation over the full dataset
cv_scores = cross_val_score(model_pipeline, X, y, scoring='r2', cv=5)
print("Cross-validated R² scores:", cv_scores)
print("Average R²:", cv_scores.mean())


R² Score: -0.5080175709423214
MSE: 921557814.7863477
Cross-validated R² scores: [-0.48910974 -0.11530741 -0.3261574  -0.39049531 -0.12709978]
Average R²: -0.28963392957926126


In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import SelectKBest, f_regression


# Step 1: Simulate a small tabular regression dataset (100 rows).
# NOTE: 'Target' is drawn independently of every feature column, so there is
# no signal to learn from this frame; an R² at or below zero is expected.
np.random.seed(42)  # legacy global seed, kept so the simulated values are unchanged
df = pd.DataFrame({
    'Age': np.random.randint(20, 60, 100),
    # Created as float because NaN is assigned into this column below;
    # writing NaN into an int64 column is a deprecated silent upcast in pandas.
    'Salary': np.random.randint(30000, 120000, 100).astype(float),
    'Experience': np.random.randint(0, 20, 100),
    'Education': np.random.choice(['High School', 'Bachelors', 'Masters', 'PhD'], 100),
    'City': np.random.choice(['NY', 'LA', 'SF', 'Chicago'], 100),
    'Target': np.random.uniform(50000, 150000, 100)  # Regression target
})

# Add some missing values and outliers
df.loc[5:10, 'Salary'] = np.nan  # .loc slices are label-based and inclusive: rows 5..10 (6 rows)
df.loc[3, 'Age'] = 150  # outlier

# Step 2: Define feature groups
numerical_features = ['Age', 'Salary', 'Experience']
categorical_features = ['Education', 'City']

# Step 3: Create preprocessors
# Numeric branch: median-impute missing values, then standardize.
numerical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),        # Handle missing
    ("scaler", StandardScaler())                          # Scale
])

# Categorical branch: impute with the most frequent value, then one-hot encode.
categorical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))    # Encode
])

# Step 4: Combine with ColumnTransformer
preprocessor = ColumnTransformer(transformers=[
    ("num", numerical_pipeline, numerical_features),
    ("cat", categorical_pipeline, categorical_features)
])

# Step 5: Build full pipeline with model
model_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("feature_selection", SelectKBest(score_func=f_regression, k='all')),  # You can set k=5 for top 5
    # Seed the forest explicitly: with random_state=None it draws from the
    # global NumPy RNG, so its result would depend on how many random draws
    # happened earlier in the session.
    ("regressor", RandomForestRegressor(random_state=42))
])

# Step 6: Split data (80% train / 20% test)
X = df.drop("Target", axis=1)
y = df["Target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: Train the model
model_pipeline.fit(X_train, y_train)

# Step 8: Evaluate on the held-out split
y_pred = model_pipeline.predict(X_test)
print("R² Score:", r2_score(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))

# Optional: 5-fold cross-validation over the full dataset
cv_scores = cross_val_score(model_pipeline, X, y, cv=5, scoring='r2')
print("Cross-validated R² scores:", cv_scores)
print("Average R²:", np.mean(cv_scores))

R² Score: -0.6039556737677059
MSE: 980186116.0065136
Cross-validated R² scores: [-0.09356489 -0.35854617 -0.327535   -0.13176058 -0.18850697]
Average R²: -0.21998272010979747


In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import SelectKBest, f_regression
from xgboost import XGBRegressor


# Step 1: Simulate a small tabular regression dataset (100 rows).
# NOTE: 'Target' is drawn independently of every feature column, so there is
# no signal to learn from this frame; an R² at or below zero is expected.
np.random.seed(42)  # legacy global seed, kept so the simulated values are unchanged
df = pd.DataFrame({
    'Age': np.random.randint(20, 60, 100),
    # Created as float because NaN is assigned into this column below;
    # writing NaN into an int64 column is a deprecated silent upcast in pandas.
    'Salary': np.random.randint(30000, 120000, 100).astype(float),
    'Experience': np.random.randint(0, 20, 100),
    'Education': np.random.choice(['High School', 'Bachelors', 'Masters', 'PhD'], 100),
    'City': np.random.choice(['NY', 'LA', 'SF', 'Chicago'], 100),
    'Target': np.random.uniform(50000, 150000, 100)  # Regression target
})

# Add some missing values and outliers
df.loc[5:10, 'Salary'] = np.nan  # .loc slices are label-based and inclusive: rows 5..10 (6 rows)
df.loc[3, 'Age'] = 150  # outlier

# Step 2: Column groups, each routed to its own preprocessing branch
categorical_features = ['Education', 'City']
numerical_features = ['Age', 'Salary', 'Experience']

# Step 3: Per-type preprocessing
# Numeric branch: fill gaps with the column median, then standardize.
numerical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Step 4: Apply each branch to its column group
preprocessor = ColumnTransformer([
    ("num", numerical_pipeline, numerical_features),
    ("cat", categorical_pipeline, categorical_features),
])

# Step 5: Preprocessing -> feature selection -> XGBoost with default settings.
# k='all' keeps every feature; use e.g. k=5 to keep only the top 5.
model_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("feature_selection", SelectKBest(score_func=f_regression, k='all')),
    ("regressor", XGBRegressor()),
])

# Step 6: Separate the target from the features and hold out 20% for testing
y = df["Target"]
X = df.drop(columns=["Target"])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 7 + 8: Fit on the training split, then score the held-out split
y_pred = model_pipeline.fit(X_train, y_train).predict(X_test)
print("R² Score:", r2_score(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))

# Optional: 5-fold cross-validation over the full dataset
cv_scores = cross_val_score(model_pipeline, X, y, scoring='r2', cv=5)
print("Cross-validated R² scores:", cv_scores)
print("Average R²:", cv_scores.mean())

R² Score: -1.464935256431548
MSE: 1506335464.7037907
Cross-validated R² scores: [-0.27920506 -1.02019806 -0.71152508 -0.22242395 -0.10840132]
Average R²: -0.4683506942119656


In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.feature_selection import SelectKBest, f_regression
from xgboost import XGBRegressor

# Step 1: Simulate a small tabular regression dataset (100 rows).
# NOTE: 'Target' is drawn independently of every feature column, so there is
# no signal to learn from this frame; an R² at or below zero is expected.
np.random.seed(42)  # legacy global seed, kept so the simulated values are unchanged
df = pd.DataFrame({
    'Age': np.random.randint(20, 60, 100),
    # Created as float because NaN is assigned into this column below;
    # writing NaN into an int64 column is a deprecated silent upcast in pandas.
    'Salary': np.random.randint(30000, 120000, 100).astype(float),
    'Experience': np.random.randint(0, 20, 100),
    'Education': np.random.choice(['High School', 'Bachelors', 'Masters', 'PhD'], 100),
    'City': np.random.choice(['NY', 'LA', 'SF', 'Chicago'], 100),
    'Target': np.random.uniform(50000, 150000, 100)
})

# Add missing values and outliers
df.loc[5:10, 'Salary'] = np.nan  # .loc slices are label-based and inclusive: rows 5..10 (6 rows)
df.loc[3, 'Age'] = 150  # outlier

# Step 2: Column groups, each routed to its own preprocessing branch
categorical_features = ['Education', 'City']
numerical_features = ['Age', 'Salary', 'Experience']

# Step 3: Per-type preprocessing branches and the transformer that routes
# each column group through its branch
numerical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),   # fill gaps with the median
    ("scaler", StandardScaler()),                    # then standardize
])
categorical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),   # fill gaps with the mode
    ("onehot", OneHotEncoder(handle_unknown="ignore")),     # then one-hot encode
])
preprocessor = ColumnTransformer(transformers=[
    ("num", numerical_pipeline, numerical_features),
    ("cat", categorical_pipeline, categorical_features),
])

# Step 4: Preprocessing -> feature selection (k='all' keeps everything) -> XGBoost
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("feature_selection", SelectKBest(score_func=f_regression, k='all')),
    ("regressor", XGBRegressor(objective='reg:squarederror', random_state=42)),
])

# Step 5: Hyperparameter grid; the 'regressor__' prefix targets the XGBoost
# step of the pipeline (2 x 2 x 2 x 2 = 16 combinations)
param_grid = {
    'regressor__n_estimators': [50, 100],
    'regressor__max_depth': [3, 5],
    'regressor__learning_rate': [0.01, 0.1],
    'regressor__subsample': [0.8, 1.0],
}

# Step 6: Hold out 20% for testing, then grid-search on the training split
y = df["Target"]
X = df.drop(columns=["Target"])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Step 7: Score the best pipeline found by the search on the held-out split
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print("Best Parameters:", grid_search.best_params_)
print("R² Score:", r2_score(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))


Best Parameters: {'regressor__learning_rate': 0.01, 'regressor__max_depth': 5, 'regressor__n_estimators': 50, 'regressor__subsample': 0.8}
R² Score: -0.3114067037373993
MSE: 801407834.682736


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV,RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.feature_selection import SelectKBest, f_regression
from xgboost import XGBRegressor

# Step 1: Simulate a small tabular regression dataset (100 rows).
# NOTE: 'Target' is drawn independently of every feature column, so there is
# no signal to learn from this frame; an R² at or below zero is expected.
np.random.seed(42)  # legacy global seed, kept so the simulated values are unchanged
df = pd.DataFrame({
    'Age': np.random.randint(20, 60, 100),
    # Created as float because NaN is assigned into this column below;
    # writing NaN into an int64 column is a deprecated silent upcast in pandas.
    'Salary': np.random.randint(30000, 120000, 100).astype(float),
    'Experience': np.random.randint(0, 20, 100),
    'Education': np.random.choice(['High School', 'Bachelors', 'Masters', 'PhD'], 100),
    'City': np.random.choice(['NY', 'LA', 'SF', 'Chicago'], 100),
    'Target': np.random.uniform(50000, 150000, 100)
})

# Add missing values and outliers
df.loc[5:10, 'Salary'] = np.nan  # .loc slices are label-based and inclusive: rows 5..10 (6 rows)
df.loc[3, 'Age'] = 150  # outlier

# Step 2: Define features
numerical_features = ['Age', 'Salary', 'Experience']
categorical_features = ['Education', 'City']

# Step 3: Pipelines
# Numeric branch: median-impute missing values, then standardize.
numerical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])
# Categorical branch: impute with the most frequent value, then one-hot encode.
categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])
preprocessor = ColumnTransformer([
    ("num", numerical_pipeline, numerical_features),
    ("cat", categorical_pipeline, categorical_features)
])

# Step 4: Full pipeline (k='all' keeps every feature)
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("feature_selection", SelectKBest(score_func=f_regression, k='all')),
    ("regressor", XGBRegressor(objective='reg:squarederror', random_state=42))
])

# Step 5: Parameter space for XGBoost (2 x 2 x 2 x 2 = 16 combinations);
# the 'regressor__' prefix targets the XGBoost step of the pipeline
param_grid = {
    'regressor__n_estimators': [50, 100],
    'regressor__max_depth': [3, 5],
    'regressor__learning_rate': [0.01, 0.1],
    'regressor__subsample': [0.8, 1.0]
}

# Step 6: RandomizedSearchCV
X = df.drop("Target", axis=1)
y = df["Target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The search evaluates only n_iter (default 10) sampled combinations, not the
# full grid. random_state pins which ones are sampled, so the reported best
# parameters do not depend on the state of the global NumPy RNG.
Random_search = RandomizedSearchCV(
    pipeline, param_grid, cv=5, scoring='r2', n_jobs=-1, random_state=42
)
Random_search.fit(X_train, y_train)  # 5-fold CV on the training split only

# Step 7: Evaluation
best_model = Random_search.best_estimator_  # best pipeline found by the search
y_pred = best_model.predict(X_test)

print("Best Parameters:", Random_search.best_params_)
print("R² Score:", r2_score(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))

Best Parameters: {'regressor__subsample': 0.8, 'regressor__n_estimators': 50, 'regressor__max_depth': 5, 'regressor__learning_rate': 0.01}
R² Score: -0.3114067037373993
MSE: 801407834.682736


In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV,RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.feature_selection import SelectKBest, f_regression
from xgboost import XGBRegressor

# Step 1: Simulate a dataset whose target is a noisy linear function of the
# numeric features

np.random.seed(42)
df = pd.DataFrame({
    'Age': np.random.randint(20, 60, 100),
    'Salary': np.random.randint(30000, 120000, 100),
    'Experience': np.random.randint(0, 20, 100),
    'Education': np.random.choice(['High School', 'Bachelors', 'Masters', 'PhD'], 100),
    'City': np.random.choice(['NY', 'LA', 'SF', 'Chicago'], 100)
}).assign(
    # Logical target: 0.6*Salary + 2000*Experience + 100*Age, plus Gaussian
    # noise (mean 0, sd 5000) drawn after all feature columns.
    Target=lambda frame: (
        frame['Salary'] * 0.6
        + frame['Experience'] * 2000
        + frame['Age'] * 100
        + np.random.normal(0, 5000, 100)
    )
)
# Step 2: Define features
numerical_features = ['Age', 'Salary', 'Experience']
categorical_features = ['Education', 'City']

# Step 3: Pipelines
# Numeric branch: median-impute missing values, then standardize.
numerical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])
# Categorical branch: impute with the most frequent value, then one-hot encode.
categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])
preprocessor = ColumnTransformer([
    ("num", numerical_pipeline, numerical_features),
    ("cat", categorical_pipeline, categorical_features)
])

# Step 4: Full pipeline (k='all' keeps every feature)
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("feature_selection", SelectKBest(score_func=f_regression, k='all')),
    ("regressor", XGBRegressor(objective='reg:squarederror', random_state=42))
])

# Step 5: Parameter space for XGBoost (2 x 2 x 2 x 2 = 16 combinations);
# the 'regressor__' prefix targets the XGBoost step of the pipeline
param_grid = {
    'regressor__n_estimators': [50, 100],
    'regressor__max_depth': [3, 5],
    'regressor__learning_rate': [0.01, 0.1],
    'regressor__subsample': [0.8, 1.0]
}

# Step 6: RandomizedSearchCV
X = df.drop("Target", axis=1)
y = df["Target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The search evaluates only n_iter (default 10) sampled combinations, not the
# full grid. random_state pins which ones are sampled, so the reported best
# parameters do not depend on the state of the global NumPy RNG.
Random_search = RandomizedSearchCV(
    pipeline, param_grid, cv=5, scoring='r2', n_jobs=-1, random_state=42
)
Random_search.fit(X_train, y_train)  # 5-fold CV on the training split only

# Step 7: Evaluation
best_model = Random_search.best_estimator_  # best pipeline found by the search
y_pred = best_model.predict(X_test)

print("Best Parameters:", Random_search.best_params_)
print("R² Score:", r2_score(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))

Best Parameters: {'regressor__subsample': 0.8, 'regressor__n_estimators': 100, 'regressor__max_depth': 3, 'regressor__learning_rate': 0.1}
R² Score: 0.8172038433670109
MSE: 87575446.31760874
