In [None]:
#importing necessary libraries I will be using 
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
plt.style.use('seaborn')
%matplotlib inline 

# Read the house-sales CSV into a pandas DataFrame so the data can be
# manipulated effectively and efficiently.
df = pd.read_csv(filepath_or_buffer='kc_house_data.csv')

# Peek at the first five rows.
df.head(5)

# Column dtypes and non-null counts.
df.info()

# Number of missing values in each column.
df.isnull().sum()

# Replace missing values with 0 in the columns that contain nulls:
#   - 'yr_renovated': 0 is used to clean the column.
#   - 'view':         no record shows the house had any prior views.
#   - 'waterfront':   treated as categorical data, with 0 as the fill category.
for col in ('yr_renovated', 'view', 'waterfront'):
    df[col] = df[col].fillna(0)

# Keep only the rows whose 'sqft_basement' value is not the '?' placeholder.
has_basement_value = df['sqft_basement'].ne('?')
df = df[has_basement_value]
df.head(12)
# Check the result: '?' should no longer appear among the counted values.
df['sqft_basement'].value_counts()

# Show the distinct values in the columns that had many nulls.
for col in ('waterfront', 'view', 'yr_renovated'):
    print(col + ':', df[col].unique())

# Convert data types:
#   - 'sqft_basement' -> float64 (the column previously held the '?' placeholder)
#   - 'waterfront'    -> int64
# A single astype() call with a column->dtype mapping returns a new frame.
# Assigning the columns one by one onto the boolean-filtered frame would
# trigger pandas' SettingWithCopyWarning.
df = df.astype({'sqft_basement': 'float64', 'waterfront': 'int64'})
df.info()

# Remove outlier rows in 'bedrooms': any row reporting more than 15 bedrooms.
too_many_bedrooms = df.index[df['bedrooms'] > 15]
df = df.drop(too_many_bedrooms)
df['bedrooms'].value_counts()

# Check the general distribution of each column with histograms.
df.hist(figsize=(30, 25));

# Pairwise correlations, used to see which predictors are most correlated with
# 'price'. The frame is restricted to numeric/bool columns first because
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0
# (e.g. a text 'date' column -- TODO confirm its dtype). Older pandas
# silently ignored such columns, so the result is unchanged there.
df.select_dtypes(include=['number', 'bool']).corr().head(20)

# Build df_one: remove outliers in two of the top predictors, then drop the
# weakly correlated columns.
#
# Both steps must be applied to the same frame. Restarting from `df` on every
# line would overwrite the previous result and leave the outliers in df_one.

# Rows with an extreme living area or above-ground area.
outlier_rows = df.index[(df['sqft_living'] > 8500) | (df['sqft_above'] > 8000)]

# Predictors identified as having lower than 10% correlation with 'price'.
low_corr_columns = ['sqft_lot', 'condition', 'yr_built', 'zipcode', 'long', 'sqft_lot15']

df_one = df.drop(outlier_rows).drop(low_corr_columns, axis=1)

# Heatmap to visualize the correlations within df_one.

# Set the style of the visualization
sns.set()

# Create the correlation matrix. Only numeric/bool columns are used because
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
corr = df_one.select_dtypes(include=['number', 'bool']).corr()

# Generate a mask the size of the correlation matrix that hides the upper
# triangle (diagonal included), so each pair is drawn only once.
# The builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(12, 150, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio, explicitly on the
# axes created above.
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

# Check the linearity assumption between each predictor and the target:
# one regression joint plot of the predictor against 'price' per column of df_one.
linearity_features = [
    'bedrooms', 'bathrooms', 'sqft_living', 'floors', 'waterfront', 'view',
    'grade', 'sqft_above', 'sqft_basement', 'yr_renovated', 'lat', 'sqft_living15',
]
for feature in linearity_features:
    sns.jointplot(data=df_one, x=feature, y='price', kind='reg', label=feature)
    plt.legend()
    plt.show()



# Modeling 

In [1]:
# Fit an ordinary least squares regression of 'price' on the remaining columns.
# This cell needs `df_one` from the data-preparation cell, so that cell must be
# run first on a fresh kernel.
import statsmodels.api as sm 
from statsmodels.formula.api import ols 

outcome = 'price'
# Every column of df_one except the target and 'date' becomes a predictor.
# NOTE(review): any identifier-like column still in df_one is included too -- confirm that is intended.
predictors = df_one.drop(columns=['price', 'date'])
pred_sum = "+".join(predictors.columns)
# Patsy-style formula of the form 'price~col1+col2+...'.
formula = f"{outcome}~{pred_sum}"

model = ols(formula=formula, data=df_one).fit()
model.summary()

NameError: name 'df_one' is not defined