# Linear Regression 

In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error as mse, make_scorer
from sklearn.linear_model import LinearRegression
import numpy as np
import matplotlib.pyplot as plt
#import libraries

## Train Model

In [2]:
# Load the cleaned train and test tables from the data/ folder.
df_train, df_test = map(
    pd.read_csv,
    ("data/train_clean.csv", "data/test_clean.csv"),
)

In [3]:
# Separate the response column ("price") from the predictor columns.
y = df_train["price"]
X = df_train.drop(columns=["price"])

In [4]:
# Hold out 20% of the samples for evaluation (80% train / 20% test);
# the fixed random_state keeps the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=444,
)

In [5]:
# Chain feature standardization and linear regression into a single estimator.
steps = [
    ("scaler", StandardScaler()),
    ("linear", LinearRegression()),
]
pipe = Pipeline(steps=steps)

In [6]:
# Fit the scaler and the regression on the training split only.
pipe.fit(X=X_train, y=y_train)

Pipeline(steps=[('scaler', StandardScaler()), ('linear', LinearRegression())])

In [7]:
# Predict the response for both the training split and the hold-out split.
y_pred_train, y_pred_test = (
    pipe.predict(features) for features in (X_train, X_test)
)

In [8]:
# Mean squared error on the training split and on the hold-out split.
mse_train = mse(y_true=y_train, y_pred=y_pred_train)
mse_test = mse(y_true=y_test, y_pred=y_pred_test)

In [9]:
# Put the true training prices next to their predictions; the scalar
# training MSE is broadcast to every row of the "mse" column.
df_pred = y_train.to_frame().assign(
    y_pred=y_pred_train,
    mse=mse_train,
)

In [10]:
# Preview the first five rows of the prediction table.
df_pred.head(n=5)

Unnamed: 0,price,y_pred,mse
29816,9.395,9.362677,0.040597
14427,7.21,7.126221,0.040597
39546,9.321,9.216545,0.040597
7814,6.964,7.071335,0.040597
13427,7.671,7.609855,0.040597


In [11]:
# Show the mean squared error measured on the 20% hold-out split.
mse_test

0.034406289459589665

In [12]:
# 10-fold cross-validation of the same scaler + regression pipeline that is
# fitted elsewhere in this notebook (instead of a bare LinearRegression), so
# the fold scores describe the estimator that is actually used for prediction.
# cross_validate works on clones of the estimator, so `pipe` itself is not
# refitted by this cell.
# make_scorer is used with its defaults, so 'test_score' holds the plain
# (positive) mean squared error of each fold: lower is better.
score_mse = make_scorer(mse)
cross_validate(estimator=pipe, X=X, y=y, scoring=score_mse, cv=10)

{'fit_time': array([0.02493834, 0.01196647, 0.01196575, 0.00997448, 0.00997663,
        0.00997543, 0.01296878, 0.00997591, 0.01200032, 0.01196885]),
 'score_time': array([0.00298691, 0.00301933, 0.00199413, 0.00199366, 0.00199413,
        0.00298977, 0.00199056, 0.00298977, 0.00295758, 0.00299191]),
 'test_score': array([0.02665961, 0.0719899 , 0.03206491, 0.02523621, 0.36610949,
        0.02673133, 0.06010793, 0.0280371 , 0.02697135, 0.08241436])}

## Test

In [13]:
# Refit the same pipeline on the full labelled data (X, y); this replaces
# the fit obtained earlier from the training split alone.
pipe.fit(X=X, y=y)

Pipeline(steps=[('scaler', StandardScaler()), ('linear', LinearRegression())])

In [14]:
# Preview the first five rows of the unlabelled test table.
df_test.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,x,y,z
0,0.51,4,1,2,58.3,5.19,5.2,3.04
1,1.02,4,3,2,63.0,6.37,6.43,4.03
2,0.59,3,2,4,61.9,5.39,5.34,3.32
3,0.9,4,2,4,62.3,6.14,6.18,3.84
4,2.01,5,7,5,60.2,8.23,8.16,4.93


In [15]:
# Predict the response for every row of the test table.
test_pred = pipe.predict(X=df_test)

In [16]:
# NOTE(review): y_test / y_pred_test come from the earlier hold-out split and
# were predicted before the pipeline was refitted on all of X, so this value
# is the same as mse_test. It is not an error measured on df_test.
# TODO confirm what this metric was meant to capture.
mse_test_t = mse(y_true=y_test, y_pred=y_pred_test)

In [17]:
# Build the output table: the test row index as "id" plus the predicted price.
test = pd.DataFrame(df_test.index, columns=["id"]).assign(price=test_pred)

In [18]:
# Preview the first five rows of the output table.
test.head(n=5)

Unnamed: 0,id,price
0,0,7.09624
1,1,8.28721
2,2,7.604308
3,3,8.355495
4,4,9.875326


In [19]:
# Write the predictions to disk with a header row and without the index column.
output_path = "output/Linearregression.csv"
test.to_csv(output_path, header=True, index=False)