In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices from the libraries used below.
warnings.filterwarnings("ignore")

In [None]:
# Load the housing data from the working directory and preview the first rows.
Housing = pd.read_csv("USA_Housing.csv")
Housing.head()

In [None]:
# Column dtypes and non-null counts of the loaded frame.
Housing.info()

In [None]:
# Summary statistics of the frame's columns.
Housing.describe()

# Data Preprocessing 
### 1) Handling missing data
### 2) Outlier Treatment
### 3) Feature Scaling

In [None]:
# Number of missing values in each column.
Housing.isnull().sum()

In [None]:
# Percentage of missing values in each column.
Housing.isna().mean() * 100

In [None]:
# Look at the spread of 'Avg. Area Income' before its missing values are filled.
sns.boxplot(data=Housing, y='Avg. Area Income')
plt.show()

In [None]:
# Fill missing incomes with the column median.
income_median = Housing['Avg. Area Income'].median()
Housing['Avg. Area Income'] = Housing['Avg. Area Income'].fillna(income_median)

In [None]:
# Look at the spread of 'Avg. Area Number of Rooms' before its missing values are filled.
sns.boxplot(data=Housing, y='Avg. Area Number of Rooms')
plt.show()

In [None]:
# Fill missing room counts with the column median.
rooms_median = Housing['Avg. Area Number of Rooms'].median()
Housing['Avg. Area Number of Rooms'] = Housing['Avg. Area Number of Rooms'].fillna(rooms_median)

In [None]:
# Look at the spread of 'Avg. Area Number of Bedrooms' before its missing values are filled.
sns.boxplot(data=Housing, y='Avg. Area Number of Bedrooms')
plt.show()

In [None]:
# Fill missing bedroom counts with the column mean (the two columns above use the median).
bedrooms_mean = Housing['Avg. Area Number of Bedrooms'].mean()
Housing['Avg. Area Number of Bedrooms'] = Housing['Avg. Area Number of Bedrooms'].fillna(bedrooms_mean)

In [None]:
# Re-count missing values per column after the fill cells above.
Housing.isnull().sum()

In [None]:
# Re-inspect dtypes and non-null counts after the fill cells above.
Housing.info()

In [None]:
def boxplots(col):
    """Draw a vertical box plot of one column of the global ``Housing`` frame.

    Parameters
    ----------
    col : str
        Label of the ``Housing`` column to plot.
    """
    # Name the arguments explicitly: seaborn's first positional parameter has
    # meant different things across releases, which leaves the orientation
    # ambiguous. This form matches the other box-plot cells in the notebook.
    sns.boxplot(y=col, data=Housing)
    plt.show()

# One box plot per non-object column.
for i in Housing.select_dtypes(exclude=['object']).columns:
    boxplots(i)

In [None]:
# Data type of each column.
Housing.dtypes

In [None]:
# Outliers need to be handled for the following variables:
# Avg. Area Income, Avg. Area House Age, Avg. Area Number of Rooms and Area Population

In [None]:
# IQR fences for 'Avg. Area Income': 1.5 * IQR below Q1 and above Q3.
# lower_limit / upper_limit are read by the capping cell that follows.
income_q1, income_q3 = Housing['Avg. Area Income'].quantile([0.25, 0.75])
income_iqr = income_q3 - income_q1
lower_limit = income_q1 - 1.5 * income_iqr
upper_limit = income_q3 + 1.5 * income_iqr

In [None]:
# Cap incomes that fall outside the IQR fences at the nearest fence value
# (rows are kept, only the extreme values are pulled in).
Housing['Avg. Area Income'] = Housing['Avg. Area Income'].clip(lower=lower_limit, upper=upper_limit)

In [None]:
# Re-plot 'Avg. Area Income' now that the capping cell has run.
sns.boxplot(data=Housing, y='Avg. Area Income')
plt.show()

In [None]:
# IQR fences for 'Avg. Area House Age': 1.5 * IQR below Q1 and above Q3.
# lower_limit / upper_limit are read by the capping cell that follows.
age_q1 = Housing['Avg. Area House Age'].quantile(0.25)
age_q3 = Housing['Avg. Area House Age'].quantile(0.75)
age_iqr = age_q3 - age_q1
lower_limit = age_q1 - 1.5 * age_iqr
upper_limit = age_q3 + 1.5 * age_iqr

In [None]:
# Cap house ages that fall outside the IQR fences at the nearest fence value.
Housing['Avg. Area House Age'] = Housing['Avg. Area House Age'].clip(lower=lower_limit, upper=upper_limit)

In [None]:
# IQR fences for 'Avg. Area Number of Rooms': 1.5 * IQR below Q1 and above Q3.
# lower_limit / upper_limit are read by the capping cell that follows.
rooms_q1 = Housing['Avg. Area Number of Rooms'].quantile(0.25)
rooms_q3 = Housing['Avg. Area Number of Rooms'].quantile(0.75)
rooms_iqr = rooms_q3 - rooms_q1
lower_limit = rooms_q1 - 1.5 * rooms_iqr
upper_limit = rooms_q3 + 1.5 * rooms_iqr

In [None]:
# Cap room counts that fall outside the IQR fences at the nearest fence value.
Housing['Avg. Area Number of Rooms'] = Housing['Avg. Area Number of Rooms'].clip(lower=lower_limit, upper=upper_limit)

In [None]:
# IQR fences for 'Area Population': 1.5 * IQR below Q1 and above Q3.
# lower_limit / upper_limit are read by the capping cell that follows.
population_q1 = Housing['Area Population'].quantile(0.25)
population_q3 = Housing['Area Population'].quantile(0.75)
population_iqr = population_q3 - population_q1
lower_limit = population_q1 - 1.5 * population_iqr
upper_limit = population_q3 + 1.5 * population_iqr

In [None]:
# Cap populations that fall outside the IQR fences at the nearest fence value.
Housing['Area Population'] = Housing['Area Population'].clip(lower=lower_limit, upper=upper_limit)

In [None]:
# Re-draw the box plots now that the capping cells have run.
# boxplots() is already defined in an earlier cell, so it is reused here
# instead of being defined a second time.
for i in Housing.select_dtypes(exclude=['object']).columns:
    boxplots(i)

In [None]:
# Positional split: every column except the last is a feature, the last one is the target.
# NOTE(review): assumes the target is the final column of the CSV - confirm.
x = Housing.iloc[:, :-1]
y = Housing.iloc[:, -1]

In [None]:
# Preview the feature columns.
x.head()

In [None]:
# Preview the target column.
y.head()

In [None]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
sc_x = scaler.fit_transform(x)
# NOTE(review): sc_x is not used by the later cells - the train/test split and the
# model are given the unscaled `x`. The scaler is also fitted on all rows before
# the split; if scaling is meant to be used, fit it on the training rows only.
# Keep the column labels and index so the scaled preview stays readable.
pd.DataFrame(sc_x, columns=x.columns, index=x.index)

# Finding correlation

In [None]:
plt.figure(figsize=(20,15))
# Correlate numeric columns only: DataFrame.corr() raises on string columns
# in pandas >= 2.0, where numeric_only defaults to False.
sns.heatmap(Housing.select_dtypes(include='number').corr(), annot=True, cmap='coolwarm')
plt.show()

In [None]:
# Hold out 20% of the rows for testing; the other 80% are used to fit the model.
# random_state is fixed so the same split is produced on every run.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

In [None]:
# (rows, columns) of the training and testing feature sets, in that order.
for features in (x_train, x_test):
    print(features.shape)

In [None]:
# Shapes of the training and testing targets, in that order.
for target in (y_train, y_test):
    print(target.shape)

In [None]:
# Display the held-out feature rows.
x_test

# Building Model - Linear Regression

### Approach no 1 - basic method

In [None]:
from sklearn.linear_model import LinearRegression
# fit() returns the estimator itself, so it can be bound in one step;
# the bare name on the last line displays the fitted model.
lm = LinearRegression().fit(x_train, y_train)
lm

In [None]:
# Fitted coefficients, one per column of x_train.
print(lm.coef_)

In [None]:
# Fitted intercept term.
print(lm.intercept_)

In [None]:
# Feature names, in the same order as the entries of lm.coef_.
x.columns

In [None]:
# Manual prediction for one hypothetical row, using the fitted model's own
# parameters so the result stays correct whenever the model is re-fitted
# (hand-copied coefficients go stale on any change to the data or the split).
# NOTE(review): the values must follow the column order of `x` - confirm against x.columns.
sample_features = [50000, 5, 3, 3, 50000]
# np.dot raises if the number of values does not match the number of coefficients.
Price = float(np.dot(lm.coef_, sample_features) + lm.intercept_)

In [None]:
# Show the manually computed prediction.
Price

In [None]:
# Display the held-out feature rows that are scored in the next section.
x_test

# Predict test dataset with linear model

In [None]:
# Predict the target for every held-out row and display the predictions.
y_pred = lm.predict(x_test)
y_pred

In [None]:
# Actual target values of the held-out rows, for comparison with y_pred.
y_test

# Evaluation 

In [None]:
from sklearn.metrics import r2_score
# r2_score is the coefficient of determination of a regression, not a
# classification accuracy, so the printed label names the metric correctly.
print("R2 score :", r2_score(y_test, y_pred))