In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
# NOTE(review): this silences *every* warning for the whole kernel session,
# including deprecation notices from seaborn and input-validation warnings
# from scikit-learn. Consider narrowing it (e.g. by category) once the
# notebook runs cleanly.
warnings.filterwarnings('ignore')

In [None]:
# Load dataset.
# Candidate locations are tried in order so the notebook is not tied to a
# single machine: first a path relative to the working directory (portable),
# then the original absolute location as a fallback.
# NOTE(review): assumes the CSV normally sits next to the notebook — confirm.
from pathlib import Path

DATA_CANDIDATES = [
    Path('Salary_Data.csv'),
    Path(r'D:\Machine Learning\Machine-Learning\Regression\Salary_Data.csv'),
]
# Fall back to the first candidate so a missing file produces a clear
# FileNotFoundError that names the portable path.
DATA_PATH = next((path for path in DATA_CANDIDATES if path.exists()), DATA_CANDIDATES[0])

df = pd.read_csv(DATA_PATH)

# Fail fast if the columns used by the rest of the notebook are absent.
REQUIRED_COLUMNS = {'YearsExperience', 'Salary'}
missing_columns = REQUIRED_COLUMNS - set(df.columns)
assert not missing_columns, f"{DATA_PATH} is missing columns: {sorted(missing_columns)}"

df.head()

In [None]:
# Dataset dimensions as a (rows, columns) tuple
df.shape

In [None]:
# Describe the dataset: summary statistics per column
df.describe()

In [None]:
# Basic info: prints a concise structural summary of the DataFrame
df.info()


In [None]:
# Count missing values per column (isna is the pandas alias of isnull)
df.isna().sum()

In [None]:
# Visual check for missing values: each cell of the boolean mask is coloured
missing_mask = df.isnull()
heatmap_ax = sns.heatmap(missing_mask, cmap='viridis')
heatmap_ax.set_title('Missing Values Heatmap')
plt.show()

In [None]:
# Box plot of YearsExperience to look for outliers
sns.boxplot(data=df, x='YearsExperience')
plt.show()

In [None]:
# Box plot of Salary to look for outliers
sns.boxplot(data=df, x='Salary')
plt.show()

In [None]:
# Scatter plot to check whether experience and salary are linearly related
scatter_ax = sns.scatterplot(data=df, x='YearsExperience', y='Salary')

# Title and axis labels are set on the returned Axes in one call
scatter_ax.set(
    title='Scatter Plot of Years Experience and Salaries',
    xlabel='Years Experience',
    ylabel='Salary',
)

# Render the figure
plt.show()

In [None]:
# Pairwise correlation between columns (Pearson is the pandas default)
df.corr(method='pearson')

In [None]:
# Correlation matrix of the dataset's columns
correlation_matrix = df.corr()

# Draw it as an annotated heatmap on an explicitly created 8x6 figure
corr_fig, corr_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=corr_ax)
corr_ax.set_title('Correlation Heatmap of Years of Experience and Salaries')
plt.show()


In [None]:
# Pair plot: pairwise scatter plots plus per-column distributions.
sns.pairplot(df)
# End with plt.show() like the other plotting cells, so the cell renders the
# figure instead of also echoing the PairGrid object's repr.
plt.show()

In [None]:
# Distribution of each column, side by side: density histogram + KDE curve.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is the
# documented replacement (requires seaborn >= 0.11).
dist_fig, dist_axes = plt.subplots(1, 2, figsize=(12, 4))
for dist_ax, column in zip(dist_axes, ['YearsExperience', 'Salary']):
    sns.histplot(df[column], kde=True, stat='density', ax=dist_ax)
    dist_ax.set_title(f'Distribution of {column}')

# Keep the two panels from overlapping, then render without repr noise.
dist_fig.tight_layout()
plt.show()

# Linear Regression

In [None]:
# Separate the feature matrix from the target.
# The double-bracket/list selection keeps X two-dimensional (a DataFrame),
# while y is a one-dimensional Series.
FEATURE_COLUMNS = ['YearsExperience']
TARGET_COLUMN = 'Salary'

X = df[FEATURE_COLUMNS]
y = df[TARGET_COLUMN]

In [None]:
# Preview the first five feature rows
X.head(n=5)

In [None]:
# Preview the first five target values
y.head(n=5)

In [None]:
# Import Libraries
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
TEST_FRACTION = 0.2
SPLIT_SEED = 42

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [None]:
# Create an (as yet unfitted) linear regression estimator with default settings
model = LinearRegression()
# Bare expression so the notebook displays the estimator
model

In [None]:
# Fit on the training split. fit() trains in place and returns the estimator
# itself, so the call doubles as the cell's displayed value.
model.fit(X_train, y_train)

In [None]:
# Plot the training observations together with the fitted regression line.

plt.scatter(X_train, y_train, color='red', label='Training data')
plt.plot(X_train, model.predict(X_train), color='blue', label='Fitted line')
plt.title('Salary vs Experience (Training set)')
plt.xlabel('Years of Experience')
plt.ylabel('Salary')
plt.legend()
# Finish with plt.show() like the other plotting cells; otherwise the cell
# echoes the Text object returned by the last pyplot call.
plt.show()

In [None]:
# Evaluate the model: score() on each split (R^2 for a regressor)
train = model.score(X_train, y_train)
test = model.score(X_test, y_test)

# Report the held-out score first, then the training score
print("Test score :", test)
print("Train score :", train)

In [None]:
# Show the first five rows of the dataset again for reference
df.head(n=5)

In [80]:
# Predict the salary for one unseen value: 5 years of experience.
# The model was fitted on a DataFrame with a 'YearsExperience' column, so the
# query is wrapped in a DataFrame with the same column. A bare nested list
# makes scikit-learn warn that X has no valid feature names (the warning is
# currently hidden by the global warnings filter). Predictions are unchanged.
query_1 = pd.DataFrame({'YearsExperience': [5]})
unknown_pred_1 = model.predict(query_1)
unknown_pred_1

array([72440.65962693])

In [81]:
# Predict salaries for several unseen experience values at once.
# Passing a DataFrame with the training column name keeps scikit-learn's
# feature-name check satisfied; a bare nested list would trigger a warning.
query_2 = pd.DataFrame({'YearsExperience': [2.4, 3, 6.7, 8.9]})
unknown_pred_2 = model.predict(query_2)
unknown_pred_2

array([ 47938.73978705,  53593.02898087,  88461.14567608, 109193.53938675])

In [83]:
# Despite its name, unknown_pred_3 holds the *inputs* (years of experience);
# the predictions are stored in unknown_pred. Names are kept for compatibility.
unknown_pred_3 = [[10], [20], [30], [40]]

# NOTE(review): 20-40 years presumably lies outside the range seen in training
# (check df.describe()), so these are linear extrapolations — treat with care.
# The inputs are wrapped in a DataFrame carrying the training column name so
# scikit-learn's feature-name check passes without a warning.
unknown_pred = model.predict(pd.DataFrame(unknown_pred_3, columns=['YearsExperience']))
unknown_pred

array([119559.73624209, 213797.8894724 , 308036.04270271, 402274.19593302])

In [84]:
# Model predictions for the training and test splits, in that order
y_pred_train, y_pred_test = (
    model.predict(features) for features in (X_train, X_test)
)


In [85]:
# Mean squared error on each split.
# mean_squared_error is provided by sklearn.metrics, imported together with
# the other scikit-learn imports; without that import this cell raises
# NameError on a fresh kernel.
mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)

# A noticeably larger test error than training error hints at overfitting
# or simply at a small test set.
print("Train MSE:", mse_train)
print("Test MSE:", mse_test)

Train MSE: 27102249.73126139
Test MSE: 49830096.85590839


In [86]:
# Calculate additional metrics (MSE, RMSE, MAE, R2) for both splits.
# The metric functions come from sklearn.metrics, imported together with the
# other scikit-learn imports.

def regression_metrics(y_true, y_pred):
    """Return the regression metrics for one set of predictions.

    Parameters:
        y_true: observed target values.
        y_pred: predicted target values, aligned with ``y_true``.

    Returns:
        dict mapping the metric label ("MSE", "RMSE", "MAE", "R2 Score")
        to its value, in that order.
    """
    mse = mean_squared_error(y_true, y_pred)
    return {
        "MSE": mse,
        "RMSE": np.sqrt(mse),  # same units as the target
        "MAE": mean_absolute_error(y_true, y_pred),
        "R2 Score": r2_score(y_true, y_pred),
    }


# Recompute the predictions so this cell does not depend on earlier metric cells.
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

train_metrics = regression_metrics(y_train, y_pred_train)
test_metrics = regression_metrics(y_test, y_pred_test)

# Keep the individual variables available under their original names.
mse_train, rmse_train, mae_train, r2_train = train_metrics.values()
mse_test, rmse_test, mae_test, r2_test = test_metrics.values()

# Print train/test pairs metric by metric (same order and labels as before).
for metric_name in train_metrics:
    print(f"Train {metric_name}:", train_metrics[metric_name])
    print(f"Test {metric_name}:", test_metrics[metric_name])

Train MSE: 27102249.73126139
Test MSE: 49830096.85590839
Train RMSE: 5205.982110155719
Test RMSE: 7059.04362190151
Train MAE: 4221.046734449738
Test MAE: 6286.453830757749
Train R2 Score: 0.9645401573418146
Test R2 Score: 0.9024461774180497
