In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

def linear_regression(X, y, iters=1000, learning_rate=0.01):
    """Fit a linear model by batch gradient descent on the mean squared error.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features) or (n_samples,)
        Feature values. A 1-D input is treated as a single feature column.
        A column of ones is prepended internally for the intercept, so the
        caller must not add one.
    y : array-like containing n_samples values
        Target values; flattened into a column vector.
    iters : int, default 1000
        Number of gradient-descent steps to run.
    learning_rate : float, default 0.01
        Step size applied to the gradient on every iteration.

    Returns
    -------
    numpy.ndarray of shape (n_features + 1, 1)
        Fitted coefficients; row 0 is the intercept, the remaining rows
        follow the column order of ``X``.

    Raises
    ------
    ValueError
        If ``X`` is not 1-D or 2-D, has no samples, or if ``y`` does not
        contain exactly one value per sample.
    """
    # Coerce to float ndarrays so lists, pandas objects and integer/object
    # arrays are all accepted (the bare .shape/.reshape calls required
    # ndarrays).
    X = np.asarray(X, dtype=float)
    if X.ndim == 1:
        X = X.reshape(-1, 1)
    elif X.ndim != 2:
        raise ValueError(f"X must be 1-D or 2-D, got {X.ndim} dimensions")

    n_samples = X.shape[0]
    if n_samples == 0:
        raise ValueError("X must contain at least one sample")

    y = np.asarray(y, dtype=float).reshape(-1, 1)
    if y.shape[0] != n_samples:
        # Without this check a length-1 y would silently broadcast against
        # every sample instead of failing.
        raise ValueError(
            f"X has {n_samples} samples but y has {y.shape[0]} values"
        )

    X = np.hstack((np.ones((n_samples, 1)), X))  # Add intercept term
    theta = np.zeros((X.shape[1], 1))

    for _ in range(iters):
        predictions = X.dot(theta)
        errors = predictions - y
        # Gradient of the (halved) mean squared error with respect to theta.
        gradient = (1 / n_samples) * X.T.dot(errors)
        theta -= learning_rate * gradient

    return theta

# Load the dataset; the first row of the CSV supplies the column names.
data = pd.read_csv('50_Startups.csv', header=0)

# Every column except the last is a feature; the last column is the target.
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

# One-hot encode the categorical column at position 3 (the state name in the
# example row below) and pass the remaining columns through unchanged.
# sparse_threshold=0 forces a dense ndarray: the mean-centering StandardScaler
# and the np.hstack call inside linear_regression cannot work on a scipy
# sparse matrix, which ColumnTransformer may otherwise return.
ct = ColumnTransformer(transformers=[
    ('encoder', OneHotEncoder(), [3])
], remainder='passthrough', sparse_threshold=0)

X = ct.fit_transform(X)

y = y.astype(float)

# Standardize the features so a single learning rate suits every column.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

theta = linear_regression(X_scaled, y, iters=1000, learning_rate=0.01)

# Example new data. dtype=object keeps the numbers as floats next to the
# string; without it NumPy would coerce every element to a string.
new_data = np.array([[165349.2, 136897.8, 471784.1, 'New York']], dtype=object)
new_data_scaled = scaler.transform(ct.transform(new_data))

# Prepend the intercept input (1) to the single scaled row before applying
# the fitted coefficients.
new_prediction = np.dot(np.append(1, new_data_scaled), theta)

print(f"Predicted value: {new_prediction[0]}")
data.head()


Predicted value: 193075.97426510364


Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94
