In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import joblib
import os


# Global plot styling. Order matters: plt.style.use() runs after
# sns.set_style(), so wherever both touch the same rcParams the
# "fivethirtyeight" values are the ones left in effect.
sns.set_style("whitegrid")
plt.style.use("fivethirtyeight")
# IPython/Jupyter line magic (not plain Python syntax): render figures inline
# in the notebook output. A plain `python` interpreter cannot parse this line.
%matplotlib inline


# Output folders for saved figures and the serialized model.
# exist_ok=True makes re-running the notebook safe when they already exist.
for out_dir in ('visualizations', 'models'):
    os.makedirs(out_dir, exist_ok=True)


# Load the salary dataset; the path is relative to the current working directory.
df = pd.read_csv("data/Salary_Data.csv")
print("Data shape:", df.shape)
# A bare `df.head()` is rendered only when it is the last expression of a
# notebook cell; anywhere else its result is silently discarded. Wrapping it
# in display() shows the preview regardless of its position in the cell.
display(df.head())


# Column dtypes and non-null counts (df.info() prints directly to stdout).
print("\nData Info:")
df.info()

# Descriptive statistics (count, mean, std, quartiles, ...) rendered as a table.
print("\nStatistical Summary:")
display(df.describe())


# Distribution of the target variable: 8-bin histogram with a KDE overlay,
# drawn on an explicit Figure/Axes pair instead of the implicit pyplot state.
hist_fig, hist_ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['Salary'], kde=True, bins=8, color='green', ax=hist_ax)
hist_ax.set_title('Salary Distribution')
# Write the PNG before show() so the file is saved while the figure is still open.
hist_fig.savefig('visualizations/salary_dist.png')
plt.show()


# Scatter of years of experience (x) against salary (y), one marker per row,
# drawn on an explicit Figure/Axes pair.
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='YearsExperience',
    y='Salary',
    s=100,  # marker size
    ax=scatter_ax,
)
scatter_ax.set_title('Experience vs Salary')
# Write the PNG before show() so the file is saved while the figure is still open.
scatter_fig.savefig('visualizations/exp_vs_salary.png')
plt.show()


# Pairwise correlation of the DataFrame's columns, shown as an annotated heatmap.
# NOTE(review): df.corr() raises on non-numeric columns in pandas >= 2.0; this
# is fine as long as the CSV holds only numeric columns - confirm if it grows.
corr_matrix = df.corr()
heat_fig, heat_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=heat_ax)
heat_ax.set_title('Correlation Matrix')
# Write the PNG before show() so the file is saved while the figure is still open.
heat_fig.savefig('visualizations/correlation.png')
plt.show()


# Features are selected with a list of column names so X stays a 2-D
# DataFrame (the shape scikit-learn estimators expect); the target is a Series.
X = df[['YearsExperience']]
y = df['Salary']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training can be chained.
model = LinearRegression().fit(X_train, y_train)



# With a single feature, the fitted model is a line: one slope plus an intercept.
slope = model.coef_[0]
intercept = model.intercept_
print(f"\nModel Equation: Salary = {slope:.2f}*Experience + {intercept:.2f}")

# Predict on the held-out rows only.
y_pred = model.predict(X_test)

# Actual vs. predicted salaries side by side; rows keep y_test's index.
results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
display(results)

# Test-set error metrics: mean absolute error and coefficient of determination.
print("\nMetrics:")
mae = metrics.mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae:.2f}")
r2 = metrics.r2_score(y_test, y_pred)
print(f"R²: {r2:.2f}")


# Persist the fitted estimator to disk so it can be reloaded with joblib.load().
# joblib files are pickle-based: only load them from trusted sources.
model_path = 'models/salary_predictor.pkl'
joblib.dump(model, model_path)
print("Model saved!")