In [1]:
# Predictive Modeling for Startup Profitability using Multiple Linear Regression
# - Develops a multiple linear regression model that predicts startup profitability from key features:
#   R&D spending, administration costs, marketing expenses, and state location.
# - Uses Pandas, NumPy, scikit-learn, and Matplotlib.
# - Applies one-hot encoding to the categorical features and achieves accurate profit predictions,
#   demonstrating proficiency in machine learning and data analysis.

# Multiple Linear Regression

## Importing the libraries

In [2]:
import numpy as np  # Import the NumPy library for numerical operations.
import matplotlib.pyplot as plt  # Import the Matplotlib library for data visualization.
import pandas as pd  # Import the Pandas library for data manipulation and analysis.

## Importing the dataset

In [3]:
# Load the startup data. Every column except the last is a feature; the last column is the target.
# NOTE(review): the file name contains a space before '.csv' -- confirm it matches the file on disk.
dataset = pd.read_csv('Startup_Profitability_data .csv')
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values

In [4]:
# Show the raw feature matrix: three numeric columns followed by the state name.
print(X)

[[165349.2 136897.8 471784.1 'New York']
 [162597.7 151377.59 443898.53 'California']
 [153441.51 101145.55 407934.54 'Florida']
 [144372.41 118671.85 383199.62 'New York']
 [142107.34 91391.77 366168.42 'Florida']
 [131876.9 99814.71 362861.36 'New York']
 [134615.46 147198.87 127716.82 'California']
 [130298.13 145530.06 323876.68 'Florida']
 [120542.52 148718.95 311613.29 'New York']
 [123334.88 108679.17 304981.62 'California']
 [101913.08 110594.11 229160.95 'Florida']
 [100671.96 91790.61 249744.55 'California']
 [93863.75 127320.38 249839.44 'Florida']
 [91992.39 135495.07 252664.93 'California']
 [119943.24 156547.42 256512.92 'Florida']
 [114523.61 122616.84 261776.23 'New York']
 [78013.11 121597.55 264346.06 'California']
 [94657.16 145077.58 282574.31 'New York']
 [91749.16 114175.79 294919.57 'Florida']
 [86419.7 153514.11 0.0 'New York']
 [76253.86 113867.3 298664.47 'California']
 [78389.47 153773.43 299737.29 'New York']
 [73994.56 122782.75 303319.26 'Florida']
 [67532

## Encoding categorical data

In [5]:
from sklearn.compose import ColumnTransformer  # Applies transformers to selected columns.
from sklearn.preprocessing import OneHotEncoder  # Turns a categorical column into 0/1 indicator columns.

# One-hot encode the state column (index 3) and pass the remaining numeric columns through unchanged.
# The indicator columns come first in the result, followed by the passthrough columns.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [3])],
    remainder='passthrough',
    # OneHotEncoder emits a sparse matrix by default. With the default sparse_threshold (0.3) the
    # stacked result can itself be sparse, and np.array() on a sparse matrix does not produce a
    # usable 2-D array. A threshold of 0 makes the transformer always return a dense array.
    sparse_threshold=0,
)

# NOTE: this overwrites X, so the cell cannot be re-run on its own -- once it has run, column 3
# holds a numeric feature instead of the state. Re-run the dataset-loading cell first.
X = np.array(ct.fit_transform(X))


In [6]:
# Show the encoded feature matrix: the state now appears as three 0/1 indicator columns placed first.
print(X)

[[0.0 0.0 1.0 165349.2 136897.8 471784.1]
 [1.0 0.0 0.0 162597.7 151377.59 443898.53]
 [0.0 1.0 0.0 153441.51 101145.55 407934.54]
 [0.0 0.0 1.0 144372.41 118671.85 383199.62]
 [0.0 1.0 0.0 142107.34 91391.77 366168.42]
 [0.0 0.0 1.0 131876.9 99814.71 362861.36]
 [1.0 0.0 0.0 134615.46 147198.87 127716.82]
 [0.0 1.0 0.0 130298.13 145530.06 323876.68]
 [0.0 0.0 1.0 120542.52 148718.95 311613.29]
 [1.0 0.0 0.0 123334.88 108679.17 304981.62]
 [0.0 1.0 0.0 101913.08 110594.11 229160.95]
 [1.0 0.0 0.0 100671.96 91790.61 249744.55]
 [0.0 1.0 0.0 93863.75 127320.38 249839.44]
 [1.0 0.0 0.0 91992.39 135495.07 252664.93]
 [0.0 1.0 0.0 119943.24 156547.42 256512.92]
 [0.0 0.0 1.0 114523.61 122616.84 261776.23]
 [1.0 0.0 0.0 78013.11 121597.55 264346.06]
 [0.0 0.0 1.0 94657.16 145077.58 282574.31]
 [0.0 1.0 0.0 91749.16 114175.79 294919.57]
 [0.0 0.0 1.0 86419.7 153514.11 0.0]
 [1.0 0.0 0.0 76253.86 113867.3 298664.47]
 [0.0 0.0 1.0 78389.47 153773.43 299737.29]
 [0.0 1.0 0.0 73994.56 122782.75 3

## Splitting the dataset into the Training set and Test set

In [7]:
from sklearn.model_selection import train_test_split  # Randomly partitions arrays into train/test subsets.

# Hold out 20% of the rows for testing and train on the remaining 80%.
# A fixed random_state makes the shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


## Training the Multiple Linear Regression model on the Training set

In [8]:
from sklearn.linear_model import LinearRegression  # Ordinary least squares linear model.

# Build the model, then learn its coefficients from the training split.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)


## Predicting the Test set results

In [9]:
# Run the fitted model on the held-out features.
y_pred = regressor.predict(X_test)

# Print floating-point values with two decimal places.
np.set_printoptions(precision=2)

# Put the predictions (left column) side by side with the actual values (right column).
comparison_result = np.column_stack((y_pred, y_test))

print(comparison_result)


[[103015.2  103282.38]
 [132582.28 144259.4 ]
 [132447.74 146121.95]
 [ 71976.1   77798.83]
 [178537.48 191050.39]
 [116161.24 105008.31]
 [ 67851.69  81229.06]
 [ 98791.73  97483.56]
 [113969.44 110352.25]
 [167921.07 166187.94]]
