# Linear Regression 

In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error as mse, make_scorer
from sklearn.linear_model import LinearRegression
import numpy as np
import matplotlib.pyplot as plt
#import libraries

## Train Model

In [2]:
# Load the training set and the test set from the local input/ directory.
df_train, df_test = map(pd.read_csv, ("input/train.csv", "input/test.csv"))

In [3]:
# Separate the response ("price") from the predictor columns.
y = df_train["price"]
X = df_train.drop(columns=["price"])

In [4]:
# Hold out 20% of the rows for evaluation (80% train / 20% test).
# random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=444,
    test_size=0.2,
)

In [5]:
# Chain feature standardization with a linear regression so both steps are
# fitted and applied together as a single estimator.
pipe = Pipeline(
    steps=[("scaler", StandardScaler()), ("linear", LinearRegression())]
)

In [6]:
# Fit the scaler and the regression on the training split only.
pipe.fit(X=X_train, y=y_train)

Pipeline(steps=[('scaler', StandardScaler()), ('linear', LinearRegression())])

In [7]:
# Predict on both splits so training and hold-out error can be compared.
y_pred_train, y_pred_test = (pipe.predict(part) for part in (X_train, X_test))

In [8]:
# Mean squared error on the training split and on the held-out split.
mse_train = mse(y_true=y_train, y_pred=y_pred_train)
mse_test = mse(y_true=y_test, y_pred=y_pred_test)

In [9]:
# Put actual and predicted training prices side by side for inspection.
# mse_train is a single scalar, so it is broadcast to every row: the "mse"
# column is constant and is NOT a per-row error.
df_pred = y_train.to_frame().assign(y_pred=y_pred_train, mse=mse_train)

In [10]:
# Preview the first five rows of the actual-vs-predicted table.
df_pred.head(n=5)

Unnamed: 0,price,y_pred,mse
29816,9.395,9.416554,0.044017
14427,7.21,7.163416,0.044017
39546,9.321,9.113831,0.044017
7814,6.964,6.98122,0.044017
13427,7.671,7.544851,0.044017


In [11]:
# Display the mean squared error measured on the held-out 20% split.
mse_test

0.038032302480964494

In [12]:
# 10-fold cross-validation over the full training data.
#
# The estimator is the scaler + regression pipeline (`pipe`), not a bare
# LinearRegression, so the scores describe the model that is actually fitted
# and used for prediction, and the scaler is refitted inside every fold.
# cross_validate clones the estimator per fold, so `pipe` itself is not
# refitted by this cell.
#
# make_scorer(mse) reports the plain (positive) MSE per fold in "test_score";
# lower is better.
# NOTE(review): an integer cv with a regressor uses unshuffled KFold, so the
# fold scores depend on the row order of train.csv — confirm that is intended.
score_mse = make_scorer(mse)
cross_validate(estimator=pipe, X=X, y=y, scoring=score_mse, cv=10)

{'fit_time': array([0.01695538, 0.0159595 , 0.01196527, 0.01296926, 0.01296687,
        0.01396203, 0.0129652 , 0.01396227, 0.00997257, 0.01197028]),
 'score_time': array([0.00398874, 0.00399375, 0.00199437, 0.0039866 , 0.0029912 ,
        0.00199437, 0.00199533, 0.00299144, 0.00299215, 0.00299096]),
 'test_score': array([0.0310712 , 0.07243709, 0.03219357, 0.0299587 , 0.03208138,
        0.03085788, 0.06228444, 0.03323898, 0.03147637, 0.08168097])}

# Predict on the Test Set

In [13]:
# Refit the pipeline on all of the training data (no hold-out). This replaces
# the fit obtained earlier from the 80% training split.
pipe.fit(X=X, y=y)

Pipeline(steps=[('scaler', StandardScaler()), ('linear', LinearRegression())])

In [14]:
# Preview the first five rows of the test set.
df_test.head(n=5)

Unnamed: 0,carat,cut,color,clarity,table,x
0,0.51,4,1,2,61.9,5.19
1,1.02,4,3,2,58.0,6.37
2,0.59,3,2,4,56.0,5.39
3,0.9,4,2,4,56.0,6.14
4,2.01,5,7,5,61.0,8.23


In [15]:
# Predict the response for every row of the test set with the refitted pipeline.
test_pred = pipe.predict(X=df_test)

In [16]:
# Build the output table: the test set's row index as "id" plus the predicted price.
test = pd.DataFrame(df_test.index, columns=["id"]).assign(price=test_pred)

In [17]:
# Preview the first five rows of the output table.
test.head(n=5)

Unnamed: 0,id,price
0,0,7.207961
1,1,8.22071
2,2,7.609115
3,3,8.323555
4,4,9.925072


In [18]:
# Write the predictions to CSV with a header row and without the DataFrame index.
test.to_csv(
    "linearregression.csv",
    header=True,
    index=False,
)