# 多元线性回归

## Step 1: 数据预处理

### 导入库

In [34]:
import pandas as pd
import numpy as np
import warnings

### 导入数据集

In [35]:
# Load the startups dataset; the path is relative to the notebook's working directory.
dataset = pd.read_csv('data/50_Startups.csv')
# Features: every column except the last one.
X = dataset.iloc[:, :-1].values
# Target: the last column, i.e. exactly the column X leaves out
# (previously a hard-coded positional index 4).
Y = dataset.iloc[:, -1].values

### 编码数据

In [36]:
from sklearn.compose import ColumnTransformer
# LabelEncoder is no longer used in this cell; the import is kept in case
# other cells rely on it.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# One-hot encode the categorical column at index 3 and pass every other
# column through unchanged. ColumnTransformer replaces the
# `OneHotEncoder(categorical_features=[3])` form, which was deprecated in
# scikit-learn 0.20 and removed in 0.22. OneHotEncoder accepts string
# categories directly, so a separate LabelEncoder pass is not required, and
# no blanket warning filter is needed any more.
column_transformer = ColumnTransformer(
    transformers=[('onehot', OneHotEncoder(), [3])],
    remainder='passthrough',  # keep the non-categorical columns
    sparse_threshold=0,       # always return a dense array
)
# The encoded dummy columns come first, followed by the passthrough columns.
# The stacked result has object dtype because X does, so cast it to float.
X = np.asarray(column_transformer.fit_transform(X), dtype=float)

### 避免虚拟变量陷阱
**虚拟变量陷阱**指当原特征有 m 个类别时，如果将其转换成 m 个虚拟变量，就会导致变量间出现完全共线性的情况。

因此只需保留 m-1 个虚拟变量：m 个虚拟变量之和恒为 1，与截距项完全共线，所以第 m 个虚拟变量是多余的，不提供任何新信息。

In [37]:
# Shape before removing the redundant dummy column.
print(X.shape)
# Remove column 0 (the first dummy variable) to avoid the dummy variable trap.
X = np.delete(X, 0, axis=1)
# Shape after removal: one column fewer.
print(X.shape)

(50, 6)
(50, 5)


### 将数据集拆分为训练集和测试集

In [38]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

## Step 2: 将多元线性回归拟合到训练集

In [39]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares model with scikit-learn's default settings.
regressor = LinearRegression()
# Fit on the training split; as the last expression of the cell, the fitted
# estimator is also displayed as the cell output.
regressor.fit(X=X_train, y=Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

## Step 3: 预测测试集结果

In [40]:
# Predict the target for every row of the held-out test features.
y_pred = regressor.predict(X=X_test)

In [41]:
# Show the predictions first and the true test targets second, one array
# after the other, for a quick visual comparison.
print(y_pred, Y_test, sep='\n')

[103015.20159796 132582.27760816 132447.73845175  71976.09851259
 178537.48221054 116161.24230163  67851.69209676  98791.73374688
 113969.43533012 167921.0656955 ]
[103282.38 144259.4  146121.95  77798.83 191050.39 105008.31  81229.06
  97483.56 110352.25 166187.94]
