In [24]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectPercentile, chi2

df = pd.read_csv('dataset/Salary Data.csv')

In [25]:
df.isna().sum()

Age                    2
Gender                 2
Education Level        2
Job Title              2
Years of Experience    2
Salary                 2
dtype: int64

In [26]:
df = df.dropna()

In [27]:
y = df['Salary']
X = df.drop('Salary', axis=1)

In [28]:
numeric_features = ["Age", "Years of Experience"]
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

categorical_features = ["Gender", "Education Level", "Job Title"]
categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
        ("selector", SelectPercentile(chi2, percentile=50)),
    ]
)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

In [30]:
clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("regression", LinearRegression())]
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(f'Mean squared error: {mean_squared_error(y_test, y_pred)}')
print(f'r2_score: {r2_score(y_test, y_pred)}')

Mean squared error: 197218198.16340378
r2_score: 0.9029595065829987
