# Basics of Linear Regression

In [7]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import numpy as np

#### pandas: a library for data manipulation and analysis.
#### sklearn.linear_model: the scikit-learn module that provides linear models such as LinearRegression.
#### sklearn.pipeline: the scikit-learn module for chaining preprocessing steps and a final estimator into a single Pipeline.
#### sklearn.impute: the scikit-learn module for imputing (filling in) missing values.
#### numpy: a library for numerical computing; used here for np.nan, which marks a missing value.

In [8]:
# Toy housing table, written one house per row.
# The third house (index 2) has no bedroom count, represented as NaN.
df = pd.DataFrame(
    [
        (2600, 3, 20, 550),
        (3000, 4, 15, 565),
        (3200, np.nan, 18, 610),
        (2600, 3, 30, 595),
        (4000, 5, 8, 760),
    ],
    columns=["area", "bedrooms", "age", "price"],
)

#### Creates a DataFrame called df with the following columns:
#### area: The area of the house.
#### bedrooms: The number of bedrooms in the house (one value is missing, i.e. NaN).
#### age: The age of the house in years.
#### price: The price of the house (the target we want to predict).

In [9]:
# Show the whole frame (only 5 rows) so the NaN in "bedrooms" is visible.
df

Unnamed: 0,area,bedrooms,age,price
0,2600,3.0,20,550
1,3000,4.0,15,565
2,3200,,18,610
3,2600,3.0,30,595
4,4000,5.0,8,760


#### Display the DataFrame.

In [10]:
# One-step pipeline that only fills missing values.
# strategy="mean" is SimpleImputer's default, written out here for clarity.
pipeline = Pipeline(
    steps=[("fill_nan_values", SimpleImputer(strategy="mean"))],
)

#### Creates a Pipeline object called pipeline with a single step: a SimpleImputer, which fills in missing values in the data. SimpleImputer is a transformer (a preprocessing step), not a predictive model. Its strategy argument specifies how missing values are imputed; the default is "mean", so each missing value is replaced with the mean of the non-missing values in its column. For bedrooms the mean is (3 + 4 + 3 + 5) / 4 = 3.75, so the NaN is replaced by 3.75.

In [11]:
# fit_transform returns a plain array, so re-attach the original column names.
imputed_values = pipeline.fit_transform(df)
pd.DataFrame(imputed_values, columns=df.columns)

Unnamed: 0,area,bedrooms,age,price
0,2600.0,3.0,20.0,550.0
1,3000.0,4.0,15.0,565.0
2,3200.0,3.75,18.0,610.0
3,2600.0,3.0,30.0,595.0
4,4000.0,5.0,8.0,760.0


#### Creates a DataFrame that contains the imputed data.
#### The call pipeline.fit_transform(df) imputes the missing values in the DataFrame df using the SimpleImputer step of the pipeline and returns the result as a NumPy array.
#### The argument columns=df.columns gives the new DataFrame the same column names as the original DataFrame df. Note that every value is now a float (e.g. 2600.0), because the imputer returns a single floating-point array.

In [12]:
# Two-step pipeline: fill missing values first, then fit a linear regression
# on the completed features. This rebinds `pipeline`, replacing the
# imputer-only pipeline defined earlier.
impute_then_regress = [
    ("fill_nan_values", SimpleImputer(strategy="mean")),
    ("ML_model", LinearRegression()),
]
pipeline = Pipeline(steps=impute_then_regress)

#### Creates a new Pipeline object called pipeline with two steps, replacing the one-step pipeline above. The first step, named fill_nan_values, is a SimpleImputer. The second step, named ML_model, is a LinearRegression.

In [13]:
# Separate the predictors from the target, then fit both pipeline steps in one call.
features = df.drop(columns="price")
target = df["price"]
pipeline.fit(features, target)

Pipeline(steps=[('fill_nan_values', SimpleImputer()),
                ('ML_model', LinearRegression())])

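### The call pipeline.fit(df.drop("price", axis=1), df["price"]) fits the pipeline to the data in the DataFrame df: the imputer is fitted on the features, and the linear regression is then trained on the imputed features. The expression df.drop("price", axis=1) returns a copy of df without the price column (df itself is not modified); these are the features. The expression df["price"] is the target variable.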

In [14]:
# In-sample predictions: the same feature rows the pipeline was fitted on.
pipeline.predict(df.drop(columns="price"))

array([534.21572424, 573.30751356, 643.23005422, 585.86173509,
       743.38497289])

### The call pipeline.predict(df.drop("price", axis=1)) predicts the prices of the houses in the DataFrame df. The expression df.drop("price", axis=1) excludes the price column, because price is the target variable we want to predict. Note that these are predictions for the same rows the model was trained on, so they show how closely the model fits the training data, not how it would perform on unseen houses.

In [13]:
# Fill value the fitted imputer learned for each feature column (the column means here).
pipeline["fill_nan_values"].statistics_

array([3080.  ,    3.75,   18.2 ])

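### Returns a NumPy array (not a dictionary) holding the fill value the imputer learned for each feature column during fit. With the default strategy "mean", these are the column means: 3080 for area, 3.75 for bedrooms, and 18.2 for age. The array does not contain standard deviations or medians. Only bedrooms had a missing value, so only 3.75 was actually used to fill anything in.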