Description sentiment and image processing.

### Reading data

In [18]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.feature_selection import f_regression
# import seaborn as sns
from scipy.stats import pearsonr
from PIL import Image

In [2]:
# Load the training listings from JSON into a DataFrame (path is relative to the notebook's working directory).
df = pd.read_json('../input/train.json')
# df.info()

### Description sentiment
Let's use NLTK's VADER sentiment analyzer to score each description. Scoring takes quite a while, so I will first work with a sample of the full dataset.

In [3]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize




In [4]:
import nltk
#nltk.download('all')

In [20]:
# Shared analyzer, created lazily on first use. Building a
# SentimentIntensityAnalyzer for every description is wasted work because
# this function is applied once per listing.
_default_sentiment_analyzer = None


def description_sentiment(sentences, analyzer=None):
    """Average the polarity scores of all sentences in one description.

    Parameters
    ----------
    sentences : iterable of str
        The sentences of a single description.
    analyzer : object, optional
        Any object exposing ``polarity_scores(text) -> dict``. When omitted,
        one shared ``SentimentIntensityAnalyzer`` is created on first use and
        reused by every later call.

    Returns
    -------
    pandas.Series
        Mean of each score key over the sentences. For an empty description
        the Series holds NaN under ``compound``, ``neg``, ``neu`` and ``pos``
        so the caller always gets the same columns.
    """
    global _default_sentiment_analyzer
    if analyzer is None:
        if _default_sentiment_analyzer is None:
            _default_sentiment_analyzer = SentimentIntensityAnalyzer()
        analyzer = _default_sentiment_analyzer

    scores = [analyzer.polarity_scores(sentence) for sentence in sentences]
    if not scores:
        # Nothing to average: return explicit NaNs instead of an index-less Series.
        return pd.Series(float('nan'), index=['compound', 'neg', 'neu', 'pos'])
    return pd.DataFrame(scores).mean()
# Sentiment scoring is slow, so work on a fixed-seed sample of 5000 listings.
sdf = df.sample(n=5000, random_state=11)
# Split every description into sentences, score them, and attach the
# averaged scores to the sample as extra columns.
sdf['description_tokens'] = sdf['description'].apply(sent_tokenize)
sentence_scores = sdf['description_tokens'].apply(description_sentiment)
sdf = pd.concat([sdf, sentence_scores], axis=1)

In [12]:
sdf.head(5)

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,longitude,manager_id,photos,price,street_address,description_tokens,compound,neg,neu,pos
2191,1.0,1,1932a605b95481fb3b0f82c5f1978c1b,2016-06-26 04:07:37,This is the place to be if you want some excit...,Clinton Street,"[Dining Room, Hardwood Floors, Dogs Allowed, C...",medium,40.7197,7219225,-73.9849,d23f9a9e6b9f6003f26017bda6dd1cb2,[https://photos.renthop.com/2/7219225_77a07385...,2090,55 Clinton Street,[This is the place to be if you want some exci...,0.251357,0.0,0.892429,0.107571
78497,1.0,1,2787598123c55dbf45b514958909c79c,2016-05-21 05:28:11,*****AVAILABLE NOW***MUST SEE***PRIME LOCATION...,W 42nd St,"[Swimming Pool, Dining Room, Doorman, Elevator...",low,40.761,7049630,-73.999,54bf50a3e709c0bd499f9fdca5826147,[https://photos.renthop.com/2/7049630_39409c9f...,3590,620 W 42nd St,[*****AVAILABLE NOW***MUST SEE***PRIME LOCATIO...,0.87185,0.0,0.9115,0.0885
17732,1.0,1,0de97434d3c89fde23d42429f5867ae0,2016-06-02 06:59:59,This spacious true 1 bedroom unit features bea...,West Street,"[Doorman, Elevator, Fitness Center, Laundry in...",high,40.7069,7099695,-74.0157,56d5b16222f138a23725efa0baee9889,[https://photos.renthop.com/2/7099695_11aeaf97...,2500,20 West Street,[This spacious true 1 bedroom unit features be...,0.461225,0.006,0.853,0.141
68497,1.0,1,f9fcf771a66999dab0500eaec81f9dd6,2016-05-06 06:01:06,88TH STREET! GORGEOUS 1 BEDROOM WITH GREAT FIN...,East 88th Street,"[Dishwasher, Hardwood Floors, No Fee]",medium,40.7773,6976051,-73.9469,cb87dadbca78fad02b388dc9e8f25a5b,[https://photos.renthop.com/2/6976051_55b8c0c1...,2600,444 East 88th Street,"[88TH STREET!, GORGEOUS 1 BEDROOM WITH GREAT F...",0.205279,0.007643,0.837714,0.154643
107259,1.0,1,0,2016-04-22 01:18:44,,North 10th Street,"[Swimming Pool, Doorman, Fitness Center, Dogs ...",low,40.7176,6909240,-73.9531,f45a2445c177379cd795189950ebba4f,[],3343,250 North 10th Street,[],,,,


In [21]:
# Listings with an empty description have no sentiment scores (NaN); drop them.
# .copy() makes sdf an independent frame, so the column assignment below does
# not write into a slice of the previous frame (SettingWithCopyWarning).
sdf = sdf.dropna(subset=['neg']).copy()

interest_level_map = {'low': 0, 'medium': 1, 'high': 2}
# Encode only while the labels are still text, so re-running this cell is a
# no-op instead of a KeyError on the already-encoded integers.
if not pd.api.types.is_numeric_dtype(sdf['interest_level']):
    encoded_levels = sdf['interest_level'].map(interest_level_map)
    # .map() yields NaN for unknown labels; fail loudly rather than carry NaN targets.
    assert encoded_levels.notnull().all(), "unexpected interest_level label"
    sdf['interest_level'] = encoded_levels

In [22]:
# Predict the encoded interest level from the sentiment proportions.
# NOTE(review): 'pos' is left out, presumably because neg + neu + pos sum to 1
# and the third column would be redundant; confirm.
X = sdf[['neg','neu']]
y = sdf["interest_level"]
# Fixed seed so the split, and therefore the reported R^2, is reproducible
# (same seed as the sampling step).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=11)

model=linear_model.LinearRegression()
model.fit(X_train,y_train)
y_pred=model.predict(X_val)
# Coefficient of determination on the held-out third of the sample.
r2_score(y_val,y_pred)

0.00012812028196351744

In [23]:
# Univariate linear-regression test of each feature against the target;
# the result is a pair of arrays (F-statistics, p-values), one entry per column of X_train.
f_regression(X_train, y_train)

(array([ 4.40256428,  4.64491071]), array([ 0.03596508,  0.03122196]))

In [24]:
# Score the full dataset (slow) and export the sentiment features.
df['description_tokens'] = df['description'].apply(sent_tokenize)
full_scores = df['description_tokens'].apply(description_sentiment)
# Drop score columns left behind by an earlier run of this cell; otherwise a
# re-run appends duplicate 'neg'/'neu'/'pos' columns and the export below
# would write each of them twice.
df = pd.concat([df.drop(list(full_scores.columns), axis=1, errors='ignore'), full_scores], axis=1)
# Rows are written in df order without an id column, so consumers must align by position.
df[['neg','neu','pos']].to_csv("sentimentFeatureAdded.csv", index = False, header = True)

In [None]:
# NOTE: this cell needs seaborn bound to `sns`; that import is currently
# commented out in the imports cell at the top of the notebook.
fig, (ax1, ax2, ax3, ax4) = plt.subplots(nrows=4, sharex=True,figsize=(8,16))

# interest_level in sdf was encoded as 0/1/2 earlier, so the categories must be
# ordered by those codes (the text labels no longer occur in the column).
# The same order is used on every panel because the x axis is shared.
level_codes = [0, 1, 2]
level_names = ['low', 'medium', 'high']
for ax, score in zip((ax1, ax2, ax3, ax4), ("compound", "neg", "pos", "neu")):
    sns.violinplot(x="interest_level", y=score, data=sdf, ax=ax, order=level_codes)
    ax.set_title(score + " by interest level")
# Show readable names instead of the numeric codes on the shared x axis.
ax4.set_xticklabels(level_names);

### Photo properties

Let's also explore possible correlations between interest level and simple high-level properties of the listing images, without resorting to neural networks. We will look at the number of images, image size, brightness and saturation (hue is planned as well, but is not computed yet).

In [None]:
# Get available images
import os
from subprocess import check_output  # still used by later cells

# Every sub-directory of images_sample is named after a listing_id. Listing the
# directory in Python avoids shelling out to `ls`, and skipping non-numeric
# entries avoids a ValueError on an empty directory or a stray file.
images = [int(name) for name in os.listdir("../input/images_sample") if name.isdigit()]

# Read the train set and choose those which have images only.
# .copy() gives an independent frame so adding a column below does not write
# into a slice of the original (SettingWithCopyWarning).
df = df[df.listing_id.isin(images)].copy()
print(df.shape)

# Add number of images (length of each listing's photo list)
df['n_images'] = df['photos'].apply(len)

In [None]:
# this is what we are after: the list of file names stored in the directory of one example listing (6812223)
check_output(["ls", "../input/images_sample/6812223"]).decode("utf8").strip().split('\n')

In [None]:
#function to process one image
def process_image(path):
    """Compute simple size and colour statistics for one listing photo.

    Parameters
    ----------
    path : str
        File name of the image. Its first seven characters name the
        sub-directory of ``../input/images_sample/`` that holds the file.

    Returns
    -------
    tuple
        ``(width, height, mean_brightness, mean_saturation)``. Brightness is
        the per-pixel maximum over the R, G, B channels (0-255). Saturation is
        ``(max - min) / max`` per pixel (0-1) and is 0 for pure black pixels.
    """
    full_path = '../input/images_sample/' + path[0:7] + '/' + path
    # Force three channels: grayscale, palette or RGBA files would otherwise
    # crash or scramble the per-channel reshape below. The context manager
    # closes the file once the pixels are in memory.
    with Image.open(full_path) as img:
        im = np.array(img.convert('RGB'))

    #get dims
    height, width = im.shape[:2]

    #flatten image: one row per pixel, one column per channel
    pixels = im.reshape(-1, 3)

    #brightness is the strongest channel of each pixel
    brg = pixels.max(axis=0 + 1)
    weakest = pixels.min(axis=1)

    #saturation; divide only where brightness is non-zero so black pixels get
    #0 without patching the brightness values themselves (which would bias
    #the mean brightness and report black as fully saturated)
    sat = np.divide(brg - weakest, brg,
                    out=np.zeros(brg.shape, dtype=float),
                    where=brg > 0)

    # TODO: hue is not implemented yet (the earlier arccos-based attempt did not work).

    #return mean values
    return width, height, np.mean(brg), np.mean(sat)

In [None]:
#second helper function - process a row of a dataset
#return mean of each property for all images
def process_row(row):
    """Attach the per-listing mean image statistics to one dataset row.

    Parameters
    ----------
    row : pandas.Series
        A listing row; ``row.listing_id`` names the sub-directory of
        ``../input/images_sample/`` that holds the listing's images.

    Returns
    -------
    pandas.Series
        The same row with ``img_width``, ``img_height``, ``img_brightness``
        and ``img_saturation`` set to the mean over all images of the listing,
        or to NaN when the directory contains no files.
    """
    import os  # local import: the notebook's import cell does not import os

    listing_dir = "../input/images_sample/" + str(row.listing_id)
    # Same listing `ls` would give (sorted, no dot-files), but without spawning
    # a subprocess per row and without the [''] result for an empty directory.
    images = sorted(name for name in os.listdir(listing_dir) if not name.startswith('.'))
    if images:
        res = np.mean(np.array([process_image(x) for x in images]), axis=0)
    else:
        # No images to average: record missing values instead of crashing.
        res = np.full(4, np.nan)
    row['img_width'] = res[0]
    row['img_height'] = res[1]
    row['img_brightness'] = res[2]
    row['img_saturation'] = res[3]
    return row

In [None]:
# Enrich every listing with its averaged image statistics.
df = df.apply(process_row, axis=1)

In [None]:
# Pairwise scatter plots of the image properties, coloured by interest level.
pairplot_columns = [
    'img_width',
    'n_images',
    'img_height',
    'img_brightness',
    'img_saturation',
    'interest_level',
]
d = df.loc[:, pairplot_columns]
sns.pairplot(data=d, hue="interest_level", size=1.5)

Looks like the points are all over the place, so these image properties are unlikely to be good features. They are easy to calculate, though, so they are perhaps still worth a try on the full image dataset.