#### Import Libraries and Load the Dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression


In [2]:
# Load the startup dataset from a CSV file in the current working directory.
data = pd.read_csv('Startups.csv')

In [3]:
# Preview only the first rows instead of rendering the entire DataFrame.
data.head(10)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,California,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,California,166187.94
5,131876.9,99814.71,362861.36,New York,156991.12
6,134615.46,147198.87,127716.82,California,156122.51
7,130298.13,145530.06,323876.68,New York,155752.6
8,120542.52,148718.95,311613.29,New York,152211.77
9,123334.88,108679.17,304981.62,California,149759.96


#### Preprocess the Data

In [4]:
# Encode the categorical 'State' column as integer codes.
# pd.factorize returns (codes, uniques); only the codes are kept. One call is
# enough -- the encoding does not need to be repeated per DataFrame column.
# NOTE(review): the codes are arbitrary labels, not an ordered quantity; confirm
# that using 'State' as a numeric regressor in later cells is intended.
data['State'] = pd.factorize(data['State'])[0]

# View the converted data
data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,0,192261.83
1,162597.7,151377.59,443898.53,1,191792.06
2,153441.51,101145.55,407934.54,1,191050.39
3,144372.41,118671.85,383199.62,0,182901.99
4,142107.34,91391.77,366168.42,1,166187.94


#### Simple Linear Regression

In [5]:
# Fit one single-predictor OLS model of Profit for each remaining column and
# print its summary table.
y = data['Profit']  # dependent variable, identical for every fit
for column in data.columns[:-1]:  # skip the last column (the dependent variable)
    x = data[[column]]
    design = sm.add_constant(x)  # prepend the intercept term
    model = sm.OLS(y, design).fit()
    print(f"Summary for {column}:\n", model.summary())


Summary for R&D Spend:
                             OLS Regression Results                            
Dep. Variable:                 Profit   R-squared:                       0.947
Model:                            OLS   Adj. R-squared:                  0.945
Method:                 Least Squares   F-statistic:                     849.8
Date:                Wed, 28 Aug 2024   Prob (F-statistic):           3.50e-32
Time:                        20:39:53   Log-Likelihood:                -527.44
No. Observations:                  50   AIC:                             1059.
Df Residuals:                      48   BIC:                             1063.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       4.903e+04   2537

#### Multiple Linear Regression

In [6]:
# Fit a single OLS model of Profit on the three spend columns together.
feature_columns = ['R&D Spend', 'Administration', 'Marketing Spend']
x = data[feature_columns]
y = data['Profit']
# x1 is the design matrix with an intercept column; the prediction cell below
# reuses both x1 and model.
x1 = sm.add_constant(x)
model = sm.OLS(endog=y, exog=x1).fit()
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:                 Profit   R-squared:                       0.951
Model:                            OLS   Adj. R-squared:                  0.948
Method:                 Least Squares   F-statistic:                     296.0
Date:                Wed, 28 Aug 2024   Prob (F-statistic):           4.53e-30
Time:                        20:41:40   Log-Likelihood:                -525.39
No. Observations:                  50   AIC:                             1059.
Df Residuals:                      46   BIC:                             1066.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const            5.012e+04   6572.353     

#### Make Predictions

In [7]:
# In-sample predictions: x1 is the design matrix built in the multiple-regression
# cell, and model is the OLS fit from that same cell, so run that cell first.
predictions = model.predict(x1)
