# Linear Regression 

In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error as mse, make_scorer
from sklearn.linear_model import LinearRegression
import numpy as np
import matplotlib.pyplot as plt
#import libraries

## Train Model

In [2]:
# Load the cleaned training and test sets; both paths are relative to the
# directory the notebook is run from.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("data/train_clean.csv", "data/test_clean.csv")
)

In [3]:
# Response variable: the "price" column of the training frame.
y = df_train.loc[:, "price"]
# Predictors: every remaining column of the training frame.
X = df_train.drop(columns="price")

In [4]:
# Hold out 20% of the rows for evaluation (80% train / 20% test).
# The fixed random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=444,
)

In [5]:
# Chain standardization and ordinary least-squares regression into a single
# estimator: the scaler is applied first, then the linear model.
pipe = Pipeline(
    steps=[("scaler", StandardScaler()), ("linear", LinearRegression())]
)

In [6]:
# Fit the scaler + regression pipeline on the training split only.
pipe.fit(X=X_train, y=y_train)

Pipeline(steps=[('scaler', StandardScaler()), ('linear', LinearRegression())])

In [7]:
# Predict the response for both the training split and the held-out split.
y_pred_train, y_pred_test = pipe.predict(X_train), pipe.predict(X_test)

In [8]:
# Mean squared error on the training split and on the held-out split.
mse_train = mse(y_true=y_train, y_pred=y_pred_train)
mse_test = mse(y_true=y_test, y_pred=y_pred_test)

In [9]:
# Side-by-side view of actual vs. predicted training values.
# "mse" is the single training-set MSE repeated on every row, not a per-row error.
df_pred = y_train.to_frame().assign(y_pred=y_pred_train, mse=mse_train)

In [10]:
# Preview the first five rows of the actual-vs-predicted table.
df_pred.head(n=5)

Unnamed: 0,price,y_pred,mse
29816,9.395,9.362512,0.040597
14427,7.21,7.126222,0.040597
39546,9.321,9.216463,0.040597
7814,6.964,7.071258,0.040597
13427,7.671,7.609838,0.040597


In [11]:
mse_test
#display the MSE computed from y_test and y_pred_test (the held-out 20% split)

0.034405338931505096

In [13]:
# 10-fold cross-validation on the full training data.
# The estimator is the same scaler + regression pipeline that is fitted and used
# for predictions elsewhere in this notebook, so the CV scores describe the model
# that is actually used (cross_validate clones the estimator, leaving `pipe` itself
# untouched). Because the scaler is part of the pipeline, it is re-fitted inside
# each training fold rather than on the whole dataset.
# make_scorer(mse) keeps the default sign, so "test_score" holds plain MSE values
# (lower is better).
score_mse = make_scorer(mse)
cross_validate(estimator=pipe, X=X, y=y, scoring=score_mse, cv=10)

{'fit_time': array([0.02593112, 0.01396251, 0.01296544, 0.01200271, 0.01097178,
        0.0119729 , 0.01196527, 0.01299953, 0.01193666, 0.01296639]),
 'score_time': array([0.00398946, 0.00199437, 0.00199795, 0.00298667, 0.00199413,
        0.00199842, 0.00199056, 0.00298667, 0.00199199, 0.00199223]),
 'test_score': array([0.02665487, 0.07198371, 0.03205515, 0.02523109, 0.37110949,
        0.02672819, 0.06010965, 0.02804165, 0.02698133, 0.08245384])}

## Predict on the Test Set

In [14]:
# Refit the same pipeline on all labeled rows (X, y) before predicting df_test.
# This replaces the fit on the training split, so predictions made from `pipe`
# after this cell come from the model trained on the full data.
pipe.fit(X=X, y=y)

Pipeline(steps=[('scaler', StandardScaler()), ('linear', LinearRegression())])

In [15]:
# Preview the first five rows of the test-set features.
df_test.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.51,4,1,2,58.3,61.9,5.19,5.2,3.04
1,1.02,4,3,2,63.0,58.0,6.37,6.43,4.03
2,0.59,3,2,4,61.9,56.0,5.39,5.34,3.32
3,0.9,4,2,4,62.3,56.0,6.14,6.18,3.84
4,2.01,5,7,5,60.2,61.0,8.23,8.16,4.93


In [16]:
# Predict the response for every row of the test set with the fitted pipeline.
test_pred = pipe.predict(X=df_test)

In [20]:
# Hold-out MSE from y_test / y_pred_test. y_pred_test was predicted before the
# pipeline was refit on the full data (X, y), so this is the same quantity as
# mse_test and is NOT a score for the refit model or for df_test.
mse_test_t = mse(y_test, y_pred_test)
# Display the test-set predictions, so that rerunning this cell reproduces the
# array output recorded for it.
test_pred

array([7.09473568, 8.28694673, 7.60491755, ..., 6.71805479, 6.58909091,
       8.04255496])

In [17]:
# Build the output table: the test-set row index as "id" next to the
# predicted "price" for that row.
test = pd.DataFrame({"id": df_test.index, "price": test_pred})

In [18]:
# Preview the first five rows of the output table.
test.head(n=5)

Unnamed: 0,id,price
0,0,7.094736
1,1,8.286947
2,2,7.604918
3,3,8.356289
4,4,9.874331


In [19]:
# Write the id/price table to disk with a header row and without the
# DataFrame index column.
test.to_csv(path_or_buf="linearregression.csv", header=True, index=False)