In [19]:
# TP3.2: LINEAR REGRESSION - Startup Profit Prediction (Simplified Version)
# Purpose: Same as TP3.1 but with cleaner code structure
# Method: Multiple Linear Regression with preprocessing
# Key difference: More concise, no outlier visualization

# Import necessary libraries
import pandas as pd  # For data manipulation
from sklearn.metrics import r2_score  # For model evaluation

# STEP 1: Load the dataset
dataset = pd.read_csv("C:/Users/ASUS/Downloads/Startups.csv")
print(dataset)  # Display dataset


    R&D Spend  Administration  Marketing Spend       State     Profit
0   165349.20       136897.80        471784.10    New York  192261.83
1   162597.70       151377.59        443898.53  California  191792.06
2   153441.51       101145.55        407934.54     Florida  191050.39
3   144372.41       118671.85        383199.62    New York  182901.99
4   142107.34        91391.77        366168.42     Florida  166187.94
5   131876.90        99814.71        362861.36    New York  156991.12
6   134615.46       147198.87        127716.82  California  156122.51
7   130298.13       145530.06        323876.68     Florida  155752.60
8   120542.52       148718.95        311613.29    New York  152211.77
9   123334.88       108679.17        304981.62  California  149759.96
10  101913.08       110594.11        229160.95     Florida  146121.95
11  100671.96        91790.61        249744.55  California  144259.40
12   93863.75       127320.38        249839.44     Florida  141585.52
13   91992.39       

In [18]:
# STEP 2: Prepare features (X) and target (y)
X = dataset.iloc[:, :-1]  # All columns except last (features: R&D, Admin, Marketing, State)
y = dataset.iloc[:, -1]   # Last column (target: Profit)

# STEP 3: Encode categorical variable (State)
# get_dummies: converts categorical 'State' to numerical dummy variables
# drop_first=True: avoids multicollinearity (drops one category as reference)
# dtype=int: ensures dummy variables are integers (0 or 1)
X = pd.get_dummies(X, drop_first=True, dtype=int)
X.head()  # Display first 5 rows


Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_Florida,State_New York
0,165349.2,136897.8,471784.1,0,1
1,162597.7,151377.59,443898.53,0,0
2,153441.51,101145.55,407934.54,1,0
3,144372.41,118671.85,383199.62,0,1
4,142107.34,91391.77,366168.42,1,0


In [5]:
# STEP 4: Split data into training and testing sets
from sklearn.model_selection import train_test_split

# test_size=0.2: 20% for testing, 80% for training
# random_state=0: ensures reproducible results
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# STEP 5: Feature scaling (standardization)
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()  # Create scaler object
cols = ['R&D Spend', 'Administration', 'Marketing Spend']  # Columns to scale

# Important: fit only on training data to avoid data leakage
sc = sc.fit(X_train[cols])  # Calculate mean and std from training data
X_train[cols] = sc.transform(X_train[cols])  # Apply scaling to training data
X_test[cols] = sc.transform(X_test[cols])    # Apply same scaling to test data
X_train.head()  # Display scaled training data


Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_Florida,State_New York
33,-0.350065,-0.785471,0.101197,1,0
35,-0.555303,-1.481174,0.02735,0,1
26,0.079358,0.801334,-0.551521,1,0
34,-0.546382,1.325058,0.070117,0,0
18,0.434854,-0.355987,0.751485,1,0


In [6]:
# STEP 6: Verify standardization
# Mean should be ≈ 0 and standard deviation should be ≈ 1
print(X_train[cols].mean())  # Check means (should be close to 0)
print(X_train[cols].std())   # Check standard deviations (should be close to 1)


R&D Spend         -1.665335e-16
Administration    -2.220446e-17
Marketing Spend    4.440892e-17
dtype: float64
R&D Spend          1.012739
Administration     1.012739
Marketing Spend    1.012739
dtype: float64


In [10]:
# STEP 7: Train Linear Regression model
from sklearn.linear_model import LinearRegression

regressor = LinearRegression()  # Create model
regressor.fit(X_train, y_train)  # Train model on training data

# Display model parameters
print(regressor.coef_)  # Coefficients (weights for each feature)
print(regressor.intercept_)  # Intercept (bias term)
print(regressor.score(X_train, y_train))  # R² score on training data


[35726.28774249   851.30163448  4519.88277698  -959.28416006
   699.36905252]
109441.48912163253
0.9501847627493607


In [20]:
# STEP 8: Evaluate model on test data
y_pred = regressor.predict(X_test)  # Make predictions on test data
r2_score(y_test, y_pred)  # R² score on test data (measures prediction accuracy)


0.9347068473282423

In [22]:
# STEP 9: Make prediction for new startup
# Example: R&D=140000, Admin=10000, Marketing=35000, State=California (0,0)
df = pd.DataFrame(data=[[140000, 10000, 35000, 0, 0]], columns=X.columns)
df[cols] = sc.transform(df[cols])  # Apply same scaling as training data
print(regressor.predict(df))  # Predict profit for new startup


[152449.77148092]
