In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("final_zillow.csv", dtype = str, header = 1)
df.head()

Unnamed: 0,Street Address,City,County,State,Country,Zipcode,Ziilow Id,Property Url,Bedrooms,Bathrooms,...,Listing Type,Agent Name,Agent Phone Number,Agent Email,Agent License Number,Agency,MLS ID,MLS Name,HOA Fee,Pictures
0,829 W 66th St,Los Angeles,Los Angeles County,CA,USA,90044,20934009,https://www.zillow.com/homedetails/829-W-66th-...,3,1.0,...,Unknown Listed By,Lillie Reda,323-747-3640,lilliec1@att.net,DRE #01084559,Excellence Real Estate,DW22007910,CRMLS,,https://photos.zillowstatic.com/fp/a0f5b38d972...
1,9673 Via Torino #154,Burbank,Los Angeles County,CA,USA,91504,20033668,https://www.zillow.com/homedetails/9673-Via-To...,2,2.0,...,For Rent,,,jerry@teamrockproperties.com,,,,Zillow Rental Manager,$439 monthly,https://photos.zillowstatic.com/fp/22c59ef7419...
2,9673 Via Torino,Burbank,Los Angeles County,CA,USA,91504,2088531295,https://www.zillow.com/homedetails/9673-Via-To...,2,1.5,...,Unknown Listed By,,,,,,,,,https://photos.zillowstatic.com/fp/c960fd9b761...
3,13621 Simshaw Ave,Sylmar,Los Angeles County,CA,USA,91342,20102436,https://www.zillow.com/homedetails/13621-Simsh...,3,2.0,...,Unknown Listed By,Audrey Overton,,audrey@audreyoverton.com,DRE #02007589,Century 21 Everest,SR22032355,CRMLS,,https://photos.zillowstatic.com/fp/f41d7ed87a0...
4,4600 Don Lorenzo Dr APT 30,Los Angeles,Los Angeles County,CA,USA,90008,61361676,https://www.zillow.com/homedetails/4600-Don-Lo...,2,3.0,...,Unknown Listed By,Stefanie Mcintyre,818-321-6522,soldbystefanie@kw.com,DRE # 01874509,KW Advisors,22-117173,CLAW,$404 monthly,https://photos.zillowstatic.com/fp/6a763dd674b...


In [3]:
drop_cols = ['Property Url', 'Country', 'Ziilow Id', 'Living Area Unit', 'Cooling', 'Fireplaces', 'Heating',
            'Parking', 'Currency', 'Time On Zillow', 'Latitude', 'Longitude',
            'Listing Type', 'Agent Phone Number', 'Agent Email', 'Agent License Number',
            'MLS ID', 'MLS Name', 'HOA Fee', 'Pictures', 'Street Address']

In [4]:
df = df.drop(drop_cols, axis = 1)

In [5]:
df['Description'] = df['Description'].astype(str)

## Basic Feature Extraction

### Word Count

In [6]:
df['word_count'] = df['Description'].apply(lambda x: len(str(x).split(" ")))

In [7]:
df.head()

Unnamed: 0,City,County,State,Zipcode,Bedrooms,Bathrooms,Property Status,Property Type,Living Area,Zestimate,...,Parking Capacity,Price,Year Built,Description,Days On Zillow,View Count,Favourite Count,Agent Name,Agency,word_count
0,Los Angeles,Los Angeles County,CA,90044,3,1.0,RECENTLY_SOLD,SINGLE_FAMILY,1044,662700,...,1,685000,1921.0,Huge Development or Investment Potential!. Thi...,3,31,0,Lillie Reda,Excellence Real Estate,84
1,Burbank,Los Angeles County,CA,91504,2,2.0,FOR_RENT,TOWNHOUSE,1151,649951,...,2,3500,1975.0,"Fully remodeled, Corner Unit townhome in the C...",3,359,22,,,146
2,Burbank,Los Angeles County,CA,91504,2,1.5,RECENTLY_SOLD,TOWNHOUSE,1151,887400,...,0,685000,,Cabrini Villas.Top of Hill.Private/Quiet/Grass...,3,9,0,,,86
3,Sylmar,Los Angeles County,CA,91342,3,2.0,RECENTLY_SOLD,SINGLE_FAMILY,1274,623000,...,4,688000,1959.0,SOLD BEFORE PROCESSING - Welcome home to this ...,4,23,0,Audrey Overton,Century 21 Everest,73
4,Los Angeles,Los Angeles County,CA,90008,2,3.0,RECENTLY_SOLD,TOWNHOUSE,1151,664700,...,2,700000,2002.0,This tri-level end unit is one of Just 55 unit...,4,28,0,Stefanie Mcintyre,KW Advisors,143


### Average Word Length

In [8]:
def avg_word(sentence):
    words = sentence.split()
    return (sum(len(word) for word in words)/len(words))

In [9]:
df['avg_word'] = df['Description'].apply(lambda x: avg_word(x))
df[['Description','avg_word']].head()

Unnamed: 0,Description,avg_word
0,Huge Development or Investment Potential!. Thi...,4.542169
1,"Fully remodeled, Corner Unit townhome in the C...",4.829932
2,Cabrini Villas.Top of Hill.Private/Quiet/Grass...,6.209302
3,SOLD BEFORE PROCESSING - Welcome home to this ...,4.485714
4,This tri-level end unit is one of Just 55 unit...,5.204225


### Stopwords

In [10]:
import nltk

In [11]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

df['stopwords'] = df['Description'].apply(lambda x: len([x for x in x.split() if x in stop]))
df[['Description','stopwords']].head()

Unnamed: 0,Description,stopwords
0,Huge Development or Investment Potential!. Thi...,31
1,"Fully remodeled, Corner Unit townhome in the C...",47
2,Cabrini Villas.Top of Hill.Private/Quiet/Grass...,14
3,SOLD BEFORE PROCESSING - Welcome home to this ...,24
4,This tri-level end unit is one of Just 55 unit...,37


### Special Characters - Exclamation

In [12]:
df['chars'] = df['Description'].apply(lambda x: len([x for x in x.split() if '!' in x]))
df[['Description','chars']].head()

Unnamed: 0,Description,chars
0,Huge Development or Investment Potential!. Thi...,1
1,"Fully remodeled, Corner Unit townhome in the C...",1
2,Cabrini Villas.Top of Hill.Private/Quiet/Grass...,0
3,SOLD BEFORE PROCESSING - Welcome home to this ...,0
4,This tri-level end unit is one of Just 55 unit...,0


### Numerics

In [13]:
df['numerics'] = df['Description'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
df[['Description','numerics']].head()

Unnamed: 0,Description,numerics
0,Huge Development or Investment Potential!. Thi...,2
1,"Fully remodeled, Corner Unit townhome in the C...",4
2,Cabrini Villas.Top of Hill.Private/Quiet/Grass...,2
3,SOLD BEFORE PROCESSING - Welcome home to this ...,2
4,This tri-level end unit is one of Just 55 unit...,2


### Uppercase

In [14]:
df['upper'] = df['Description'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
df[['Description','upper']].head()

Unnamed: 0,Description,upper
0,Huge Development or Investment Potential!. Thi...,2
1,"Fully remodeled, Corner Unit townhome in the C...",3
2,Cabrini Villas.Top of Hill.Private/Quiet/Grass...,2
3,SOLD BEFORE PROCESSING - Welcome home to this ...,3
4,This tri-level end unit is one of Just 55 unit...,2


## Basic Pre-processing

In [15]:
# Convert to Lowercase
df['description'] = df['Description'].apply(lambda x: " ".join(x.lower() for x in x.split()))

# Remove punctuation
df['description'] = df['description'].str.replace('[^\w\s]','')

### Remove noisy words

In [16]:
# Remove stop words
df['description'] = df['description'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

# Remove common words
freq1 = pd.Series(' '.join(df['description']).split()).value_counts()[:10]
freq1 = list(freq1.index)
df['description'] = df['description'].apply(lambda x: " ".join(x for x in x.split() if x not in freq1))

# Remove rare words
freq2 = pd.Series(' '.join(df['description']).split()).value_counts()[-10:]
freq2 = list(freq2.index)
df['description'] = df['description'].apply(lambda x: " ".join(x for x in x.split() if x not in freq2))

# Analysis

https://smltar.com/mlregression.html#mlregressionfull

## Tokenize

In [17]:
#from nltk import word_tokenize

In [18]:
#df['descrip'] = [word_tokenize(entry) for entry in df['description']]

### Lemmatization (Not yet Done)

In [19]:
from textblob import Word
from nltk.stem import WordNetLemmatizer

In [20]:
# lemmatizer = WordNetLemmatizer()
# #' '.join([wnl.lemmatize(words) for words in df['descrip'][0]])
# # def lemmatize_text(text):
# #     return [lemmatizer.lemmatize(w) for w in text]

# def lemmatize_text(text):
#     return [lemmatizer.lemmatize(text)]

# df['description'] = df['description'].apply(lemmatize_text)

## Train-Test Split

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
X = df['description']
y = df['Price']

NameError: name 'df' is not defined

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2)

## Count Vectorizer & TF-IDF

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [23]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

## Apply Linear SVM