# Regression Analysis: Exploration of data

### This notebook is to exlore, understand and briefly visualise the original dataset to better understand the project.

### Start off by importing source of data and libraries

In [11]:
#Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)

### Convert csv data to MySQL

In [12]:
data = pd.read_csv("regression_data.csv")


### Making the data set usable by clarrifying the data clasification 

In [13]:
data.columns = ['id', 'date', 'bedrooms', 'bathrooms','sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 
                'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renov', 'zip_code', 'lat', 
                'long', 'sqft_living15', 'sqft_lot15', 'price']
data.head(10)

Unnamed: 0,id,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renov,zip_code,lat,long,sqft_living15,sqft_lot15,price
0,6414100192,12/9/14,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639,538000
1,5631500400,2/25/15,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062,180000
2,2487200875,12/9/14,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000,604000
3,1954400510,2/18/15,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503,510000
4,7237550310,5/12/14,4,4.5,5420,101930,1.0,0,0,3,11,3890,1530,2001,0,98053,47.6561,-122.005,4760,101930,1230000
5,1321400060,6/27/14,3,2.25,1715,6819,2.0,0,0,3,7,1715,0,1995,0,98003,47.3097,-122.327,2238,6819,257500
6,2008000270,1/15/15,3,1.5,1060,9711,1.0,0,0,3,7,1060,0,1963,0,98198,47.4095,-122.315,1650,9711,291850
7,2414600126,4/15/15,3,1.0,1780,7470,1.0,0,0,3,7,1050,730,1960,0,98146,47.5123,-122.337,1780,8113,229500
8,3793500160,3/12/15,3,2.5,1890,6560,2.0,0,0,3,7,1890,0,2003,0,98038,47.3684,-122.031,2390,7570,323000
9,1736800520,4/3/15,3,2.5,3560,9796,1.0,0,0,3,8,1860,1700,1965,0,98007,47.6007,-122.145,2210,8925,662500


In [14]:
data.to_csv(r'regression_data_updated.csv')

#mysqlData = data.to_sql()

### Exploring the dataset to clarify data size and type

In [15]:
data. shape

(21596, 21)

In [16]:
data.dtypes

id                 int64
date              object
bedrooms           int64
bathrooms        float64
sqft_living        int64
sqft_lot           int64
floors           float64
waterfront         int64
view               int64
condition          int64
grade              int64
sqft_above         int64
sqft_basement      int64
yr_built           int64
yr_renov           int64
zip_code           int64
lat              float64
long             float64
sqft_living15      int64
sqft_lot15         int64
price              int64
dtype: object

In [17]:
data.isna().sum() #no NaN values

id               0
date             0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renov         0
zip_code         0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
price            0
dtype: int64

### Exploring the values of individual columns

In [18]:
data['bedrooms'].value_counts()

3     9823
4     6882
2     2760
5     1601
6      272
1      196
7       38
8       13
9        6
10       3
11       1
33       1
Name: bedrooms, dtype: int64

In [19]:
data['waterfront'].value_counts()

0    21433
1      163
Name: waterfront, dtype: int64

In [20]:
data['view'].value_counts()

0    19474
2      961
3      510
1      332
4      319
Name: view, dtype: int64

In [21]:
data['condition'].value_counts() # Overall condition 1 indicates poor condition and 5 excellent.

3    14019
4     5677
5     1701
2      170
1       29
Name: condition, dtype: int64

In [22]:
data['grade'].value_counts()

7     8973
8     6065
9     2615
6     2038
10    1134
11     399
5      242
12      89
4       27
13      13
3        1
Name: grade, dtype: int64

In [23]:
data['yr_built'].value_counts()

2014    559
2006    453
2005    450
2004    433
2003    420
       ... 
1933     30
1901     29
1902     27
1935     24
1934     21
Name: yr_built, Length: 116, dtype: int64

In [24]:
data['price'].head(10)

0     538000
1     180000
2     604000
3     510000
4    1230000
5     257500
6     291850
7     229500
8     323000
9     662500
Name: price, dtype: int64

In [25]:
data['price'].list.sort(reverse=True)

AttributeError: 'Series' object has no attribute 'list'

### Using scatter plots to visualise the data, usefull to identify outlying data and patterns

In [None]:
sns.scatterplot(data=data, x='price', y='sqft_living')

In [None]:
sns.scatterplot(data=data, x='price', y='bedrooms')

In [None]:
sns.scatterplot(data=data, x='price', y='sqft_lot')

In [None]:
sns.scatterplot(data=data, x='price', y='waterfront')

In [None]:
sns.scatterplot(data=data, x='price', y='zip_code')

In [None]:
sns.scatterplot(data=data, x='price', y='condition')