In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

: 

In [None]:
# Load the salary dataset from a CSV in the working directory and preview
# its first rows (head() defaults to five).
df = pd.read_csv('salary_prediction_data.csv')
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Count of fully duplicated rows, as a built-in int.
int(df.duplicated().sum())

In [None]:
# Summary statistics produced by pandas' describe() with its default settings.
df.describe()

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Pairwise correlation between the two numeric features and the target.
df.loc[:, ['Experience', 'Age', 'Salary']].corr()

In [None]:
# The most frequent Experience values (value_counts sorts by count, head keeps the first five).
df['Experience'].value_counts().head()

In [None]:
# Bar chart of how many rows carry each distinct Experience value,
# drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 3))
df['Experience'].value_counts().plot.bar(ax=ax)

In [None]:
# Row count per Education value, most common first.
df['Education'].value_counts()

In [None]:
# Mean salary per Gender as a bar chart on an explicitly created axes.
fig, ax = plt.subplots(figsize=(5, 2.5))
df.groupby('Gender')['Salary'].mean().plot.bar(ax=ax)

In [None]:
# Average age across all rows, as a built-in float.
float(df['Age'].mean())

In [None]:
# Row count per Job_Title value, most common first.
df['Job_Title'].value_counts()

In [None]:
# Mean salary per Job_Title as a bar chart on an explicitly created axes.
fig, ax = plt.subplots(figsize=(5, 2.5))
df.groupby('Job_Title')['Salary'].mean().plot.bar(ax=ax)

In [None]:
# Peek at one randomly drawn row; no random_state is passed, so the row
# differs between runs.
df.sample()

In [None]:
# Row count per Location value, most common first.
df['Location'].value_counts()

In [None]:
# Same dtype / non-null overview as shown earlier in the notebook, repeated here.
df.info()

In [None]:
# Row count per Gender value, most common first.
df['Gender'].value_counts()

In [None]:
# Box plot of Age, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(5, 2.5))
df['Age'].plot.box(ax=ax)

In [None]:
# Box plot of Salary, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(5, 2.5))
df['Salary'].plot.box(ax=ax)

In [None]:
# Largest Salary value in the frame.
df['Salary'].max()

In [None]:
# Job titles among rows whose Salary exceeds 150000, counted per title.
# A single .loc selection replaces the chained row-then-column indexing.
df.loc[df['Salary'] > 150000, 'Job_Title'].value_counts()

In [None]:
# Another single random row; unseeded, so it changes on every run.
df.sample()

In [None]:

# Separate the regression target from the feature columns.
y = df["Salary"]
X = df.drop(columns="Salary")


In [None]:

# Feature columns grouped by type.
# Categorical features: these are one-hot encoded before modelling.
categorical_cols = [
    'Education',
    'Location',
    'Job_Title',
    'Gender',
]
# Numeric features.
numeric_cols = [
    'Experience',
    'Age',
]



In [None]:
# One-hot encode the categorical features and forward the numeric ones
# unchanged. The numeric columns are listed explicitly rather than relying
# on remainder='passthrough', so a column that is not a declared feature
# is dropped instead of silently reaching the regressor.
# handle_unknown='ignore' keeps prediction from failing on a category that
# was not present when the encoder was fitted.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', 'passthrough', numeric_cols),
    ],
    remainder='drop'
)



In [None]:
# A Pipeline chains the preprocessing and the regressor into one estimator,
# so fit/predict run every step in order and the steps don't have to be
# applied separately.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



In [None]:
# Fit the whole pipeline (preprocessing + regressor) on the training split only.
model.fit(X_train, y_train)


In [None]:
# Evaluate the fitted pipeline on the held-out split.
y_pred = model.predict(X_test)

# NOTE: despite its name, `accuracy` holds the R^2 score of the regression,
# not a classification accuracy.
accuracy = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print("R2 Score:", accuracy)
print("MSE:", mse)

In [None]:
def predict_salary(education, experience, location, job_title, age, gender, estimator=None):
    """Predict the salary for a single profile.

    Parameters
    ----------
    education, location, job_title, gender
        Values for the categorical feature columns.
    experience, age
        Values for the numeric feature columns.
    estimator : object with a ``predict`` method, optional
        Fitted estimator to query. When omitted, the notebook-level
        ``model`` is used.

    Returns
    -------
    The first element of the estimator's prediction, converted to a
    built-in Python scalar via ``.item()``.
    """
    # Resolve the default at call time so a `model` re-fitted in a later
    # cell is picked up without redefining this function.
    if estimator is None:
        estimator = model

    # Single-row frame keyed by the feature column names.
    input_data = pd.DataFrame([{
        'Education': education,
        'Experience': experience,
        'Location': location,
        'Job_Title': job_title,
        'Age': age,
        'Gender': gender
    }])

    # Predict exactly once. The result is indexed and `.item()` is called on
    # its first element, so it is presumably a NumPy array.
    predicted_salary = estimator.predict(input_data)
    return predicted_salary[0].item()

In [None]:
# Example query: one hand-written profile passed to predict_salary.
example_profile = {
    "education": "Bachelor",
    "experience": 0,
    "location": "Urban",
    "job_title": "Engineer",
    "age": 30,
    "gender": "Male",
}
predict_salary(**example_profile)


In [None]:
import pickle

In [None]:
# with open('prediction_model.pkl','wb') as f1:
#     pickle.dump(model, f1)

# with open('accuary_model.pkl','wb') as f2:
#     pickle.dump(accuracy, f2)
