In [67]:
import pandas as pd
import numpy as np
import matplotlib 
from matplotlib import pyplot as plt
%matplotlib inline
matplotlib.rcParams["figure.figsize"] = (20,10)

In [68]:
# Read the raw listings from London.csv (path is relative to the working
# directory) and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="London.csv")
df.head(5)

Unnamed: 0,Property,Price,Type,Area_ft,Bedrooms,Bathrooms,Living_rooms,Location,City,Postalcode
0,Queens Road,1675000,House,2716,5,5,5,Wimbledon,London,SW19 8NY
1,Seward Street,650000,Flat / Apartment,814,2,2,2,Clerkenwell,London,EC1V 3PA
2,Hotham Road,735000,Flat / Apartment,761,2,2,2,Putney,London,SW15 1QL
3,Festing Road,1765000,House,1986,4,4,4,Putney,London,SW15 1LP
4,Spencer Walk,675000,Flat / Apartment,700,2,2,2,Putney,London,SW15 1PL


In [69]:
# Keep only Price, Area_ft, Bedrooms, Bathrooms and Location; the other
# columns are removed here.
df = df.drop(columns=["Property", "City", "Postalcode", "Type", "Living_rooms"])
df.head(5)

Unnamed: 0,Price,Area_ft,Bedrooms,Bathrooms,Location
0,1675000,2716,5,5,Wimbledon
1,650000,814,2,2,Clerkenwell
2,735000,761,2,2,Putney
3,1765000,1986,4,4,Putney
4,675000,700,2,2,Putney


In [70]:
# Count the missing values in each remaining column.
df.isna().sum()

Price          0
Area_ft        0
Bedrooms       0
Bathrooms      0
Location     962
dtype: int64

In [71]:
# Discard every row that has at least one missing value, then confirm that
# no nulls remain.
df = df.dropna(axis="index", how="any")
df.isna().sum()

Price        0
Area_ft      0
Bedrooms     0
Bathrooms    0
Location     0
dtype: int64

In [72]:
# Derived column: asking price divided by floor area.
df = df.assign(Price_per_sqft=df["Price"].div(df["Area_ft"]))
df.head(5)

Unnamed: 0,Price,Area_ft,Bedrooms,Bathrooms,Location,Price_per_sqft
0,1675000,2716,5,5,Wimbledon,616.715758
1,650000,814,2,2,Clerkenwell,798.525799
2,735000,761,2,2,Putney,965.834428
3,1765000,1986,4,4,Putney,888.721047
4,675000,700,2,2,Putney,964.285714


In [73]:
# Number of distinct locations (not displayed: it is not the cell's last
# expression).
len(df["Location"].unique())
# Listings per location, largest count first.
Location_val = df["Location"].value_counts().sort_values(ascending=False)

In [74]:
# How many locations have four listings or fewer (not displayed: it is not
# the cell's last expression).
len(Location_val.loc[Location_val.le(4)])
# The rare locations themselves: index = location name, value = listing count.
location_4less = Location_val.loc[Location_val.le(4)]

In [75]:
# Collapse every location with four listings or fewer into a single "other"
# bucket. location_4less is indexed by location name, so membership is tested
# against its index explicitly; the earlier `x in location_4less` form only
# worked because `in` on a Series checks the index, not the values.
df["Location"] = df["Location"].mask(
    df["Location"].isin(location_4less.index), "other"
)
# Price_per_sqft is removed again before the feature matrix is built.
df = df.drop(columns=["Price_per_sqft"])
df.head()

Unnamed: 0,Price,Area_ft,Bedrooms,Bathrooms,Location
0,1675000,2716,5,5,Wimbledon
1,650000,814,2,2,Clerkenwell
2,735000,761,2,2,Putney
3,1765000,1986,4,4,Putney
4,675000,700,2,2,Putney


In [87]:
# One-hot encode Location. dtype is pinned to int so the indicator columns
# are 0/1 regardless of pandas version (the default is uint8 before pandas 2.0
# and bool from 2.0 onwards).
dummies = pd.get_dummies(df["Location"], dtype=int)

In [89]:
# Append the location indicators to the frame. The "other" indicator is left
# out, so a row with all indicators at 0 represents the "other" bucket.
df = pd.concat([df, dummies.drop(columns=["other"])], axis=1)
df.head(5)

Unnamed: 0,Price,Area_ft,Bedrooms,Bathrooms,Location,Barnes,Battersea,Canary Wharf,Chelsea,Chiswick,...,St. James's,St. John's Wood,Surbiton,Thames Ditton,Walton-on-Thames,Wandsworth,Water Lane,Westminster,Wimbledon,Woodford Green
0,1675000,2716,5,5,Wimbledon,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,650000,814,2,2,Clerkenwell,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,735000,761,2,2,Putney,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1765000,1986,4,4,Putney,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,675000,700,2,2,Putney,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [90]:
# Feature matrix: everything except the target and the raw Location text.
X = df.drop(columns=["Price", "Location"])
# Target: the listing price.
y = df["Price"]

In [91]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [92]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=15
)
# Fit an ordinary least-squares model on the training rows (fit returns the
# estimator itself) and report its score on the held-out rows.
lr_clf = LinearRegression().fit(X_train, y_train)
lr_clf.score(X_test, y_test)


0.5413813448050456

In [93]:
def predict(Location, Area_ft, Bedrooms, Bathrooms, model=None, columns=None):
    """Predict a price for one property from its location and basic features.

    A single feature row is built with the same layout as the training
    matrix: positions 0-2 hold ``Area_ft``, ``Bedrooms`` and ``Bathrooms``,
    and the remaining positions are the one-hot location indicators.

    Parameters
    ----------
    Location : str
        Name of a location indicator column. A name with no matching
        indicator (for example "other", whose indicator is dropped before
        training) leaves every indicator at 0.
    Area_ft, Bedrooms, Bathrooms : number
        Values written to the first three feature positions.
    model : object, optional
        Anything with a ``predict(rows)`` method. Defaults to the
        module-level ``lr_clf``.
    columns : sequence of str, optional
        Feature names in training order. Defaults to ``X.columns``.

    Returns
    -------
    The first element of ``model.predict`` for the single constructed row.
    """
    if columns is None:
        columns = X.columns
    if model is None:
        model = lr_clf

    column_names = list(columns)

    x = np.zeros(len(column_names))
    x[0] = Area_ft
    x[1] = Bedrooms
    x[2] = Bathrooms

    # The previous lookup, np.where(...)[0][0], raised IndexError when nothing
    # matched, so its `>= 0` guard could never take effect. Search only the
    # indicator columns (index 3 onwards) so that a location named like a
    # numeric feature cannot overwrite that feature.
    if Location in column_names[3:]:
        x[column_names.index(Location, 3)] = 1

    return model.predict([x])[0]
    

In [99]:
# Example: a 1000 sq ft, 2-bedroom, 2-bathroom property in Canary Wharf.
predict("Canary Wharf", Area_ft=1000, Bedrooms=2, Bathrooms=2)



1051401.9128350527

In [103]:
import pickle
# Serialise the fitted model to disk in binary mode.
with open('London code.pickle', 'wb') as model_file:
    pickle.dump(lr_clf, model_file)

In [101]:
import json
# Record the feature order next to the model. Names are stored lower-cased.
# NOTE(review): whatever reads this file presumably has to lower-case a
# location before matching it - confirm against the consumer.
columns = {'data_columns': [name.lower() for name in X.columns]}
with open("columns.json", "w") as columns_file:
    json.dump(columns, columns_file)