In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Load Dataset
file_path = 'Salary.csv'  # Replace with your file path
data = pd.read_csv('Salary.csv')

# Check data structure
print("Dataset Preview:\n", data.head())

# Features and Target
X = data[['experience', 'qualifications', 'industry', 'location', 'company_size']]  # Added 'company_size'
y = data['salary']  # Target variable

# Preprocessing: One-Hot Encoding for Categorical Features
categorical_features = ['qualifications', 'industry', 'location', 'company_size']  # Added 'company_size'
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first'), categorical_features)],
    remainder='passthrough')

X = preprocessor.fit_transform(X)

# Splitting Data into Train and Test Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and Train Linear Regression Model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on Test Set
y_pred = model.predict(X_test)

# Model Evaluation
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared Score:", r2)


Dataset Preview:
    experience qualifications   industry  location company_size  salary
0           5      Bachelors         IT  New York       Medium   70000
1           3        Masters    Finance    London        Small   60000
2          10           Ph D         IT  New York        Large  100000
3           2      Bachelors    Finance        US        Small  100000
4           4        Masters  Marketing  New York       Medium   30000
Mean Squared Error: 15006673999999.996
R-squared Score: -3061.586530612244
