In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn import metrics


# Load the raw employee records.
# NOTE(review): absolute Colab-style path -- adjust (or move to a config constant)
# when running outside that environment.
df = pd.read_csv('/content/MFGEmployees4.csv')

# Predictor columns shared by every model in this notebook.
selected_features = ["Gender", "City", "JobTitle", "LengthService", "DepartmentName", "Division", "Age", "BusinessUnit"]
df_selected = df[selected_features]

# Feature matrix and regression target.
X = df_selected
y = df["AbsentHours"]

# Partition the predictors by dtype for the column transformers used below.
# "number" matches every numeric dtype (not only int64/float64), and the
# categorical group is defined as the complement, so each selected column lands
# in exactly one group and none is silently discarded by a later
# remainder="drop" transformer.
numeric_features = X.select_dtypes(include="number").columns
categorical_features = X.select_dtypes(exclude="number").columns

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=7
)

# Preprocessing: scale the numeric columns and one-hot encode the categorical
# ones (handle_unknown='ignore' so unseen categories do not raise at predict
# time); columns in neither group are dropped
ct = make_column_transformer(
    (StandardScaler(), numeric_features),
    (OneHotEncoder(handle_unknown='ignore'), categorical_features),
    remainder="drop",
)

# Baseline model: preprocessing followed by multiple linear regression
pipeline = make_pipeline(ct, LinearRegression())

# Fit on the training split only, then predict the held-out rows
pipeline.fit(X_train, y_train)
y_pred_mlr = pipeline.predict(X_test)

# Evaluate on the test split; the MSE is computed once and reused for the RMSE
mae_mlr = metrics.mean_absolute_error(y_test, y_pred_mlr)
mse_mlr = metrics.mean_squared_error(y_test, y_pred_mlr)
rmse_mlr = np.sqrt(mse_mlr)
r2_mlr = r2_score(y_test, y_pred_mlr)

print('Mean Absolute Error of mlr model:', mae_mlr)
print('Mean Squared Error of mlr model:', mse_mlr)
print('Root Mean Squared Error of mlr model:', rmse_mlr)
print('R2 score of mlr model:', r2_mlr)

Mean Absolute Error of mlr model: 20.767119902476562
Mean Squared Error of mlr model: 693.3532058105454
Root Mean Squared Error of mlr model: 26.331600897221296
R2 score of mlr model: 0.7127034160664812


In [None]:
# Same 70/30 seeded split as the linear-regression cell
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=7
)

# Preprocessing: scaled numeric columns + one-hot encoded categorical columns;
# anything else is dropped
ct = make_column_transformer(
    (StandardScaler(), numeric_features),
    (OneHotEncoder(handle_unknown='ignore'), categorical_features),
    remainder="drop",
)

# Small random forest (10 trees) with a fixed seed so the fit is repeatable
forest = RandomForestRegressor(n_estimators=10, random_state=0)
pipeline = make_pipeline(ct, forest)

# Pipeline.fit returns the fitted pipeline, so training on the training split
# and predicting the held-out rows can be chained
y_pred_rfr = pipeline.fit(X_train, y_train).predict(X_test)

# Evaluate on the test split; the MSE is computed once and reused for the RMSE
mae_rfr = metrics.mean_absolute_error(y_test, y_pred_rfr)
mse_rfr = metrics.mean_squared_error(y_test, y_pred_rfr)
print('Mean Absolute Error of rfr model:', mae_rfr)
print('Mean Squared Error of rfr model:', mse_rfr)
print('Root Mean Squared Error of rfr model:', np.sqrt(mse_rfr))
print('R2 score of rfr model:', r2_score(y_test, y_pred_rfr))

Mean Absolute Error of rfr model: 19.56915553754678
Mean Squared Error of rfr model: 661.9906872742285
Root Mean Squared Error of rfr model: 25.729179685217883
R2 score of rfr model: 0.7256987326865323


In [None]:
# Split the data here as well (same seed and ratio as the other model cells) so
# this cell does not depend on X_train/X_test left over from an earlier cell
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=7)

# PCA needs a dense matrix. OneHotEncoder's `sparse=False` argument was
# deprecated in scikit-learn 1.2 and removed in 1.4, so instead of asking the
# encoder for dense output we set sparse_threshold=0 on the column transformer,
# which makes it always return a dense array and works on old and new versions.
ct = make_column_transformer(
    (StandardScaler(), numeric_features),
    (OneHotEncoder(handle_unknown='ignore'), categorical_features),
    remainder="passthrough",  # Keep any remaining columns as they are
    sparse_threshold=0,
)

# Create a PCA transformer (the number of components can be tuned).
# random_state makes the result repeatable if the randomized SVD solver is selected.
pca = PCA(n_components=7, random_state=0)

# Create a pipeline: preprocessing -> PCA -> multiple linear regression
pipeline = make_pipeline(
    ct,
    pca,
    LinearRegression()
)

# Fit the pipeline on the training split
pipeline.fit(X_train, y_train)

# Make predictions on the test set and report the evaluation metrics
y_pred_mlrpca = pipeline.predict(X_test)
mse_mlrpca = metrics.mean_squared_error(y_test, y_pred_mlrpca)
print('Mean Absolute Error of mlr with pca model:', metrics.mean_absolute_error(y_test, y_pred_mlrpca))
print('Mean Squared Error of mlr with pca model:', mse_mlrpca)
print('Root Mean Squared Error of mlr with pca model:', np.sqrt(mse_mlrpca))
print('R2 score of mlr with pca model:', r2_score(y_test, y_pred_mlrpca))




Mean Absolute Error of mlr with pca model: 20.2422848933139
Mean Squared Error of mlr with pca model: 657.3262822338991
Root Mean Squared Error of mlr with pca model: 25.638375187088183
R2 score of mlr with pca model: 0.7276314671470334


In [None]:
# Create a column transformer.
# PCA needs a dense matrix. OneHotEncoder's `sparse=False` argument was
# deprecated in scikit-learn 1.2 and removed in 1.4, so instead of asking the
# encoder for dense output we set sparse_threshold=0 on the column transformer,
# which makes it always return a dense array and works on old and new versions.
ct = make_column_transformer(
    (StandardScaler(), numeric_features),
    (OneHotEncoder(handle_unknown='ignore'), categorical_features),
    remainder="passthrough",
    sparse_threshold=0,
)

# Create a PCA transformer.
# random_state makes the result repeatable if the randomized SVD solver is selected.
pca = PCA(n_components=8, random_state=0)

# Create a pipeline with the column transformer, PCA, and the regressor
pipeline = make_pipeline(
    ct,
    pca,
    RandomForestRegressor(n_estimators=10, random_state=0)
)

# Split the data into training and testing sets (70/30, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=7)

# Fit the pipeline on the training split
pipeline.fit(X_train, y_train)
# Make predictions on the test set
y_pred_rfrpca = pipeline.predict(X_test)

# Report the evaluation metrics; the MSE is computed once and reused for the RMSE
mse_rfrpca = metrics.mean_squared_error(y_test, y_pred_rfrpca)
print('Mean Absolute Error of rfr with pca model:', metrics.mean_absolute_error(y_test, y_pred_rfrpca))
print('Mean Squared Error of rfr with pca model:', mse_rfrpca)
print('Root Mean Squared Error of rfr with pca model:', np.sqrt(mse_rfrpca))
print('R2 score of rfr with pca model:', r2_score(y_test, y_pred_rfrpca))



Mean Absolute Error of rfr with pca model: 20.01116773832743
Mean Squared Error of rfr with pca model: 692.1667307870007
Root Mean Squared Error of rfr with pca model: 26.309061761815084
R2 score of rfr with pca model: 0.7131950417174917
