# **Linear Regression using Scikit-learn library**
Mounting google drive

In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


importing required libraries

In [14]:
import sklearn
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
from sklearn import metrics
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
import seaborn as sns

Load the data

In [3]:
df=pd.read_csv("/content/drive/MyDrive/kc_house_data.csv")
df

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.00,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.7210,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.00,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.00,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.00,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,263000018,20140521T000000,360000.0,3,2.50,1530,1131,3.0,0,0,...,8,1530,0,2009,0,98103,47.6993,-122.346,1530,1509
21609,6600060120,20150223T000000,400000.0,4,2.50,2310,5813,2.0,0,0,...,8,2310,0,2014,0,98146,47.5107,-122.362,1830,7200
21610,1523300141,20140623T000000,402101.0,2,0.75,1020,1350,2.0,0,0,...,7,1020,0,2009,0,98144,47.5944,-122.299,1020,2007
21611,291310100,20150116T000000,400000.0,3,2.50,1600,2388,2.0,0,0,...,8,1600,0,2004,0,98027,47.5345,-122.069,1410,1287


**Clean the data:**
1. Most of the values in 'yr_renovated' column are 'zero' because those houses are not renovated since built. In such cases make yr_renovated = yr_built.
2. create columns 'age' and 'age_renovated' from 'yr_built' and 'yr_renovated' columns by substracting them from current year.
3. Drop all the unimportant columns from the dataframe. 
4. 'sqft_basement' column has a lot of zeros, which does not add much information. so, remove it.  

In [4]:
df['yr_renovated']=df['yr_renovated'].mask(df['yr_renovated'].eq(0),df['yr_built'])

In [5]:
df['age'] = 2022 - df['yr_built']
df['age_renovated'] = 2022 - df['yr_renovated']

In [6]:
df = df.drop(['id', 'date', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'waterfront', 'view'], axis = 1)

In [10]:
# the info() function describes the columns in the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   price          21613 non-null  float64
 1   bedrooms       21613 non-null  int64  
 2   bathrooms      21613 non-null  float64
 3   sqft_living    21613 non-null  int64  
 4   sqft_lot       21613 non-null  int64  
 5   floors         21613 non-null  float64
 6   condition      21613 non-null  int64  
 7   grade          21613 non-null  int64  
 8   sqft_above     21613 non-null  int64  
 9   sqft_living15  21613 non-null  int64  
 10  sqft_lot15     21613 non-null  int64  
 11  age            21613 non-null  int64  
 12  age_renovated  21613 non-null  int64  
dtypes: float64(3), int64(10)
memory usage: 2.1 MB


In [7]:
df = df.drop(['sqft_basement'], axis = 1)
df

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,condition,grade,sqft_above,sqft_living15,sqft_lot15,age,age_renovated
0,221900.0,3,1.00,1180,5650,1.0,3,7,1180,1340,5650,67,67
1,538000.0,3,2.25,2570,7242,2.0,3,7,2170,1690,7639,71,31
2,180000.0,2,1.00,770,10000,1.0,3,6,770,2720,8062,89,89
3,604000.0,4,3.00,1960,5000,1.0,5,7,1050,1360,5000,57,57
4,510000.0,3,2.00,1680,8080,1.0,3,8,1680,1800,7503,35,35
...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,360000.0,3,2.50,1530,1131,3.0,3,8,1530,1530,1509,13,13
21609,400000.0,4,2.50,2310,5813,2.0,3,8,2310,1830,7200,8,8
21610,402101.0,2,0.75,1020,1350,2.0,3,7,1020,1020,2007,13,13
21611,400000.0,3,2.50,1600,2388,2.0,3,8,1600,1410,1287,18,18


Convert the dataframe in to nd-array to perform matrix operations.

In [8]:
data = np.array(df, dtype=float)

In [9]:
data.shape

(21613, 13)

In [11]:
X,Y = data[:,1:], data[:,0]
Y,X

(array([221900., 538000., 180000., ..., 402101., 400000., 325000.]),
 array([[3.000e+00, 1.000e+00, 1.180e+03, ..., 5.650e+03, 6.700e+01,
         6.700e+01],
        [3.000e+00, 2.250e+00, 2.570e+03, ..., 7.639e+03, 7.100e+01,
         3.100e+01],
        [2.000e+00, 1.000e+00, 7.700e+02, ..., 8.062e+03, 8.900e+01,
         8.900e+01],
        ...,
        [2.000e+00, 7.500e-01, 1.020e+03, ..., 2.007e+03, 1.300e+01,
         1.300e+01],
        [3.000e+00, 2.500e+00, 1.600e+03, ..., 1.287e+03, 1.800e+01,
         1.800e+01],
        [2.000e+00, 7.500e-01, 1.020e+03, ..., 1.357e+03, 1.400e+01,
         1.400e+01]]))

Feature scaling using standard scaler

In [12]:
X_std = StandardScaler().fit_transform(X)

In [32]:
Y_std = StandardScaler().fit_transform(Y.reshape((len(Y),1)))

Splittting the dataset in train and test sets

In [33]:
x_train, x_test, y_train, y_test = train_test_split(X_std, Y_std, test_size=0.25, random_state=4)

**Training the model**

In [34]:
# Creating an instance of the logistic regression class
LinearRegressor = LinearRegression()

# model fitting(training)
LinearRegressor.fit(x_train, y_train)

LinearRegression()

**Evaluating the model on Test set**

In [35]:
y_preds = LinearRegressor.predict(x_test)

Mean squared Error:

In [45]:
sklearn.metrics.mean_squared_error(y_test, y_preds)

0.39555884793985535

Accuracy of the model on test set

In [38]:
score = LinearRegressor.score(x_test, y_test)
print(score)

0.6152262191857567
