In [1]:
%matplotlib inline

import pandas as pd
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from wordcloud import WordCloud, STOPWORDS

In [2]:
data = pd.read_json("train.json")

In [3]:
low = data[data["interest_level"] == "low"]

In [4]:
low.shape[0]

34284

In [5]:
medium = data[data["interest_level"] == "medium"]

In [6]:
medium.shape[0]

11229

In [7]:
high = data[data["interest_level"] == "high"]

In [8]:
high.shape[0]

3839

In [None]:
def gen_word_cloud_png(df, title=None):
    vect = CountVectorizer(stop_words="english", ngram_range=(1,3),max_features=2000)
    counts = vect.fit_transform(df["description"]).toarray().sum(axis=1)
    cloud = WordCloud()
    f_c = cloud.generate_from_frequencies(zip(vect.get_feature_names(), counts))
    if title:
        f_c.to_file(title + ".png")
    plt.imshow(f_c)
    plt.axis('off')
    plt.show()

In [None]:
gen_word_cloud_png(low)

In [None]:
gen_word_cloud_png(medium)

In [None]:
gen_word_cloud_png(high)

In [3]:
from sklearn.feature_extraction import text 
stop_words = text.ENGLISH_STOP_WORDS.union(
    ["br", "li", "target", "_blank", "href", "div", "apartment", "building", "new"]
)  
tfidf = TfidfVectorizer(stop_words=stop_words, ngram_range=(1, 3), min_df=0.10)
tfidf.fit(data["description"])

TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=0.1,
        ngram_range=(1, 3), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=frozenset(['all', 'show', 'anyway', 'fifty', 'four', 'go', 'mill', 'find', 'seemed', 'whose', 'apartment', 're', 'herself', 'whoever', 'behind', 'should', 'to', 'only', 'under', 'herein', 'do', 'his', 'get', 'very', 'de', 'myself', 'cannot', 'every', 'yourselves', 'him', 'is', 'cry', 'bef... 'eight', 'but', 'nothing', 'why', 'building', 'noone', 'sometimes', 'together', 'serious', 'once']),
        strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [11]:
def tfidf_cloud(df, tfidf, lim=10):
    st = " ".join(df["description"].tolist())
    word_tup = zip(tfidf.get_feature_names(), tfidf.transform([st]).toarray()[0])
#     gen = WordCloud().generate_from_frequencies(word_tup)
#     plt.imshow(gen)
#     plt.axis('off')
#     plt.show()
    return sorted(word_tup, key=lambda x: x[1], reverse=True)[0:lim]

In [20]:
tfidf_cloud(low, tfidf, 20)

[(u'website_redacted', 0.15837406271274532),
 (u'bedroom', 0.15580609027953818),
 (u'room', 0.151932375874682),
 (u'com', 0.15187034737840813),
 (u'kitchen', 0.1473294768294206),
 (u'space', 0.1245100521684487),
 (u'features', 0.11890535019906855),
 (u'large', 0.11855862959896117),
 (u'floors', 0.11825978580002955),
 (u'appliances', 0.11775492246658038),
 (u'great', 0.11543517347565189),
 (u'renovated', 0.11372176286298266),
 (u'unit', 0.11090102586405162),
 (u'stainless', 0.11000741988519977),
 (u'hardwood', 0.10988516323089832),
 (u'kagglemanager', 0.10892825969827945),
 (u'kagglemanager renthop', 0.10892825969827945),
 (u'renthop', 0.10892825969827945),
 (u'kagglemanager renthop com', 0.1089098019740232),
 (u'renthop com', 0.1089098019740232)]

In [21]:
tfidf_cloud(medium, tfidf, 20)

[(u'website_redacted', 0.15476970135345916),
 (u'bedroom', 0.15457188047915024),
 (u'kitchen', 0.15133725813454763),
 (u'room', 0.14639723438453117),
 (u'space', 0.1358821655495723),
 (u'com', 0.13250065096716071),
 (u'large', 0.1311222171999408),
 (u'renovated', 0.12571830622693789),
 (u'kagglemanager', 0.12217365499987751),
 (u'kagglemanager renthop', 0.12217365499987751),
 (u'kagglemanager renthop com', 0.12217365499987751),
 (u'renthop', 0.12217365499987751),
 (u'renthop com', 0.12217365499987751),
 (u'fee', 0.1199953272891076),
 (u'appliances', 0.11818960386129324),
 (u'floors', 0.11741628183180162),
 (u'great', 0.11577712600441403),
 (u'unit', 0.11534259644919928),
 (u'hardwood', 0.11467264476165316),
 (u'located', 0.10940829869087275)]

In [22]:
tfidf_cloud(high, tfidf, 20)

[(u'bedroom', 0.16566446867942994),
 (u'kitchen', 0.1626109662960703),
 (u'large', 0.15455816917431231),
 (u'room', 0.15187911925554132),
 (u'space', 0.14937712802175274),
 (u'website_redacted', 0.14620714624436171),
 (u'renovated', 0.1379478170377286),
 (u'floors', 0.12829956942834125),
 (u'hardwood', 0.12407345096738676),
 (u'com', 0.12056447018150729),
 (u'unit', 0.11734207210217248),
 (u'appliances', 0.11707077727663909),
 (u'located', 0.11655527094575181),
 (u'kagglemanager', 0.11477428529316086),
 (u'kagglemanager renthop', 0.11477428529316086),
 (u'kagglemanager renthop com', 0.11477428529316086),
 (u'renthop', 0.11477428529316086),
 (u'renthop com', 0.11477428529316086),
 (u'great', 0.11471613801861898),
 (u'fee', 0.11454293442012639)]

In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
params = {"n_estimators": range(10, 110, 10), "min_samples_leaf": range(10, 50, 10), "max_features": ["auto", "sqrt"]}
rf = RandomForestClassifier(n_jobs=-1)
grid_search = GridSearchCV(rf, param_grid=params, n_jobs=-1)

In [5]:
vect = CountVectorizer(stop_words=stop_words, ngram_range=(1,3),max_features=2000)
counts = vect.fit_transform(data["description"]).toarray()

In [6]:
c_df = pd.DataFrame(counts, columns=vect.get_feature_names())

In [7]:
def assign_class(x):
    return ["high", "medium", "low"].index(x)

In [8]:
rf.fit(c_df.values, data["interest_level"].map(lambda x: assign_class(x)).values)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=-1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [13]:
sorted(zip(c_df.columns,  rf.feature_importances_), key = lambda x: x[1], reverse=True)

[(u'fee', 0.0057859398977284631),
 (u'bedroom', 0.0057670518570886262),
 (u'studio', 0.005233473716927806),
 (u'website_redacted', 0.004996437248962436),
 (u'large', 0.0040511179120899139),
 (u'kitchen', 0.0040233864096857561),
 (u'renovated', 0.0039281999641304728),
 (u'great', 0.0037498642527657097),
 (u'room', 0.0034659966016879194),
 (u'text', 0.0033593134296311658),
 (u'flex', 0.0033533815474159892),
 (u'unit', 0.0032301991064885766),
 (u'location', 0.0031448330161113038),
 (u'space', 0.0029771554627467055),
 (u'beautiful', 0.0028947464306938739),
 (u'email', 0.002862064357242843),
 (u'located', 0.0028567328849678072),
 (u'east', 0.0028479766453838257),
 (u'spacious', 0.0028037991479135287),
 (u'laundry', 0.0027286265025329788),
 (u'amazing', 0.002663189256765603),
 (u'features', 0.0026459224992249819),
 (u'living', 0.0026246909290816647),
 (u'hardwood', 0.0026189774570525002),
 (u'floors', 0.0026163557135265429),
 (u'closet', 0.0025959365112603961),
 (u'bathroom', 0.0025766322625