#### Applying Linear Regression Model without using python libraries

    This project tries to apply the linear regression model and prediction using the 'Salary' without using the sklearn library or numpy library

*For reading the data file pandas library is used.*

In [1]:
import pandas as pd

In [2]:
#reading the data 
data=pd.read_csv("D:\Beinex\Python\Dataset-Kaggle\Task_26-06\Salary_Data.csv")
data

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,Master's,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0
4,52.0,Male,Master's,Director,20.0,200000.0
...,...,...,...,...,...,...
6699,49.0,Female,PhD,Director of Marketing,20.0,200000.0
6700,32.0,Male,High School,Sales Associate,3.0,50000.0
6701,30.0,Female,Bachelor's Degree,Financial Manager,4.0,55000.0
6702,46.0,Male,Master's Degree,Marketing Manager,14.0,140000.0


##### Understanding data and its distribution

In [3]:
data.describe()

Unnamed: 0,Age,Years of Experience,Salary
count,6702.0,6701.0,6699.0
mean,33.620859,8.094687,115326.964771
std,7.614633,6.059003,52786.183911
min,21.0,0.0,350.0
25%,28.0,3.0,70000.0
50%,32.0,7.0,115000.0
75%,38.0,12.0,160000.0
max,62.0,34.0,250000.0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6704 entries, 0 to 6703
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  6702 non-null   float64
 1   Gender               6702 non-null   object 
 2   Education Level      6701 non-null   object 
 3   Job Title            6702 non-null   object 
 4   Years of Experience  6701 non-null   float64
 5   Salary               6699 non-null   float64
dtypes: float64(3), object(3)
memory usage: 314.4+ KB


##### Data Cleaning and feature Engineering

In [5]:
#checking whether there is any na values in the data
data.isna().sum()

Age                    2
Gender                 2
Education Level        3
Job Title              2
Years of Experience    3
Salary                 5
dtype: int64

In [6]:
#drop rows where all values are na
data=data.dropna(how='all',axis=0)
#checking the na values in data after dropping 
data.isna().sum()

Age                    0
Gender                 0
Education Level        1
Job Title              0
Years of Experience    1
Salary                 3
dtype: int64

In [7]:
df=data.copy()

In [8]:
#found multiple category labels, combined and mapped them
map_var={"High School":1,"Bachelor's Degree":2,"Bachelor's":2,"Master's Degree":3,"Master's":3,"PhD":4,"phD":4}
df['Education Level'] = df['Education Level'].map(map_var)
df['Education Level'].unique()
df.head()

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,2.0,Software Engineer,5.0,90000.0
1,28.0,Female,3.0,Data Analyst,3.0,65000.0
2,45.0,Male,4.0,Senior Manager,15.0,150000.0
3,36.0,Female,2.0,Sales Associate,7.0,60000.0
4,52.0,Male,3.0,Director,20.0,200000.0


In [9]:
#dropping the rows where still na values are present 
df.dropna(axis=0,inplace=True)
df.isna().sum()

Age                    0
Gender                 0
Education Level        0
Job Title              0
Years of Experience    0
Salary                 0
dtype: int64

In [10]:
#Using map function inspite of label encoding 
#Encode the Gender column and Job Title column
df['Gender'] = df['Gender'].map({'Male':1,'Female':2,'Others':3})

#Encode the Job Title column
df['Job Title'] = df['Job Title'].map({title: i for i, title in enumerate(df['Job Title'].unique())})
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6698 entries, 0 to 6703
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  6698 non-null   float64
 1   Gender               6684 non-null   float64
 2   Education Level      6698 non-null   float64
 3   Job Title            6698 non-null   int64  
 4   Years of Experience  6698 non-null   float64
 5   Salary               6698 non-null   float64
dtypes: float64(5), int64(1)
memory usage: 366.3 KB


In [11]:
from sklearn.preprocessing import MinMaxScaler
X=['Age','Job Title','Years of Experience']
def scale(df,cols):
    df[cols] = MinMaxScaler().fit_transform(df[cols])

#Scale the selected columns in the original DataFrame
scale(df,X)
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 6698 entries, 0 to 6703
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  6698 non-null   float64
 1   Gender               6684 non-null   float64
 2   Education Level      6698 non-null   float64
 3   Job Title            6698 non-null   float64
 4   Years of Experience  6698 non-null   float64
 5   Salary               6698 non-null   float64
dtypes: float64(6)
memory usage: 366.3 KB


##### Applying LR Model

In [12]:
#Assigning the dependent and independent variables 
X=df[['Age','Education Level','Job Title','Years of Experience']]
Y=df['Salary']

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test=train_test_split(X, Y, train_size=0.7,test_size=0.3, random_state=33)

In [13]:
#Y_train = Y_train.values.reshape(-1, 1)
#Y_test = Y_test.values.reshape(-1, 1)

In [14]:
print("Shape of X_train :", X_train.shape)
print("Shape of Y_train :", Y_train.shape)
print("Shape of X_test :", X_test.shape)
print("Shape of Y_test :", Y_test.shape)

Shape of X_train : (4688, 4)
Shape of Y_train : (4688,)
Shape of X_test : (2010, 4)
Shape of Y_test : (2010,)


In [15]:
# Assigning the dependent and independent variables
X_train = X_train.values.tolist()
Y_train = Y_train.values.tolist()
X_test = X_test.values.tolist()

In [16]:
def regression(X_train, Y_train, X_test):
    #Calculate the means of the independent variables
    mean_X_train=[sum(x)/len(x) for x in zip(*X_train)]
    
    #Calculate the mean of the dependent variable
    mean_Y_train=sum(Y_train)/len(Y_train)
    
    #Calculate the differences from the means
    diff_X_train=[[x[i]-mean_X_train[i] for i in range(len(x))] for x in X_train]
    diff_Y_train=[y-mean_Y_train for y in Y_train]
    
    # Calculate the sums of the products
    diff_X_train_Y_train=[sum(diff_X_train[i][j] * diff_Y_train[i] for i in range(len(X_train))) for j in range(len(mean_X_train))]
    diff_X_train_X_train=[[sum(diff_X_train[i][j] * diff_X_train[i][k] for i in range(len(X_train))) for j in range(len(mean_X_train))] 
                              for k in range(len(mean_X_train))]
    
    #Calculate the coefficients
    num_samples=len(X_train)
    coefficients=[0.0]*(len(mean_X_train) + 1)
    
    for i in range(len(mean_X_train)):
        coefficients[i+1] = (diff_X_train_Y_train[i] - coefficients[0] * diff_X_train_X_train[i][0]) / diff_X_train_X_train[i][i]
    
    coefficients[0] = mean_Y_train - sum(coefficients[i+1] * mean_X_train[i] for i in range(len(mean_X_train)))
    
    #Make predictions on the test data
    Y_pred = [coefficients[0] + sum(coefficients[i+1] * X_test[j][i] for i in range(len(mean_X_train))) for j in range(len(X_test))]
    
    return Y_pred

# Call the regression function
Y_pred=regression(X_train, Y_train, X_test)


In [17]:
print(Y_pred)
print(Y_test)

[30519.913004901493, 113380.29849119266, 35692.971100875235, 52526.9160501837, 25098.11530387675, 134796.63795939888, 80108.08731130959, 314540.23766598536, 2566.1234295770846, -25974.518068160993, 87290.46366432408, 81797.93809861006, 73886.79031306738, 49347.08929988707, 35692.971100875235, 324851.33993898553, -25620.318839434578, 52526.9160501837, 271347.56044351874, 21011.96916142611, 123738.56030223524, 92210.84086297828, 90439.34742554824, 108779.48488682281, 186617.822320698, 241406.18605749006, -50123.67897546866, 113380.29849119266, 90439.34742554824, 40366.08817503232, 35692.971100875235, 14781.462620447885, 19839.851550010135, 323777.578820467, 281945.9792482806, 134740.17275407634, 80117.29685404562, 165533.94863488176, 9114.488322135483, 145967.05260886558, 43353.04710725846, 201774.81867081166, 274879.8811798377, 141103.27025314022, 311562.23962959624, 80117.29685404562, 105378.29851278439, 110347.7891390842, 85362.78805741286, 190546.08105096332, 39193.97056361631, 87038

In [18]:
def calculate_mae(Y_test, Y_pred):
    n = len(Y_test)
    mae = sum(abs(Y_test[i] - Y_pred[i]) for i in range(n)) / n
    return mae

Y_test = list(Y_test)
Y_pred = list(Y_pred)

mae = calculate_mae(Y_test, Y_pred)
mae_percentage = (mae / max(Y_test)) * 100
print("Mean Absolute Error Percentage:", mae_percentage)
print("The accuracy in prediction: ", 100-mae_percentage)

Mean Absolute Error Percentage: 22.517318843633515
The accuracy in prediction:  77.48268115636648
