In [13]:
import pandas as pd

from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import NormalPredictor
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import KNNBaseline
from surprise import SVD
from surprise import BaselineOnly
from surprise import SVDpp
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse
from surprise import accuracy

## Loading Event Data

In [2]:
eventDataSet = pd.read_csv('data/FilteredEvents/capstone_event_2018_07.csv')
eventDataSet = eventDataSet.append(pd.read_csv('data/FilteredEvents/capstone_event_2018_08.csv'))
eventDataSet = eventDataSet.append(pd.read_csv('data/FilteredEvents/capstone_event_2018_09.csv'))
eventDataSet = eventDataSet.append(pd.read_csv('data/FilteredEvents/capstone_event_2018_10.csv'))
eventDataSet = eventDataSet.append(pd.read_csv('data/FilteredEvents/capstone_event_2018_11.csv'))
eventDataSet = eventDataSet.append(pd.read_csv('data/FilteredEvents/capstone_event_2018_12.csv'))
eventDataSet = eventDataSet.append(pd.read_csv('data/FilteredEvents/capstone_event_2019_01.csv'))
eventDataSet = eventDataSet.append(pd.read_csv('data/FilteredEvents/capstone_event_2019_02.csv'))
eventDataSet = eventDataSet.append(pd.read_csv('data/FilteredEvents/capstone_event_2019_03.csv'))
eventDataSet = eventDataSet.append(pd.read_csv('data/FilteredEvents/capstone_event_2019_04.csv'))

## Cleaning the Data

In [None]:
eventDataSet = eventDataSet.replace(to_replace=["Kiralık", "Satılik", "Satılık"], value = ["Kiralik", "Satilik", "Satilik"])

eventDataSet = eventDataSet[(eventDataSet.districtName.notnull())]

salesEventDataSet = eventDataSet[(eventDataSet.listingTypeId == 1)]
rentEventDataSet = eventDataSet[(eventDataSet.listingTypeId == 2)]

In [5]:
salesEventDataSet.head()

Unnamed: 0.1,Unnamed: 0,event_date,event_timestamp,user_pseudo_id,event_name,listingId,listingType,listingTypeId,subPropertyTypeId,propertyType,propertyTypeId,subPropertyType,cityId,cityName,countyId,countyName,districtId,districtName
35,35,20180701,1530478769081000,e83ce22dd4b1a09b3e501a1e3d658f7a,ListingView,1601671.0,Satilik,1.0,,Konut,,Daire,36701.0,Samsun,37367.0,İlkadım,37409.0,Kışla
43,43,20180701,1530478593368000,8067997e67c581182ff88248ce66bec6,ListingView,1553610.0,Satilik,1.0,,Konut,,Daire,29689.0,Manisa,30695.0,Turgutlu,30713.0,Turan
44,44,20180701,1530478643308000,8067997e67c581182ff88248ce66bec6,ListingView,1605030.0,Satilik,1.0,,Konut,,Daire,29689.0,Manisa,30695.0,Turgutlu,30713.0,Turan
45,45,20180701,1530478721787000,8067997e67c581182ff88248ce66bec6,ListingView,1539499.0,Satilik,1.0,,Konut,,Daire,29689.0,Manisa,30695.0,Turgutlu,30699.0,Yedi Eylül
72,72,20180701,1530478077640000,bd5e859d0ca2e93dce3dbd2fe6b596f7,ListingView,1548096.0,Satilik,1.0,,Konut,,Daire,1.0,İstanbul,229.0,Beylikdüzü,232.0,Cumhuriyet


## Preparing Data For Collaborative Filtering

In [6]:
ds = salesEventDataSet[(salesEventDataSet.cityId == 1) & (salesEventDataSet.subPropertyType.notnull()) & (salesEventDataSet.subPropertyType == 'Daire')]
ds["interest"] = ds.cityName + "_" + ds.countyName + "_" + ds.districtName #+ "_" + ds.subPropertyType
ds = ds.filter(['user_pseudo_id', 'interest'])
ds['count'] = 0
ratingDS = ds.groupby(['user_pseudo_id', 'interest'])['count'].count().reset_index()

ratingDS.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,user_pseudo_id,interest,count
0,00001238ac28333afc87b94a992f0319,İstanbul_Tuzla_İçmeler,2
1,0001960a98f9d0d716e75ae49cc0df7a,İstanbul_Kartal_Cevizli,4
2,0001960a98f9d0d716e75ae49cc0df7a,İstanbul_Kartal_Cumhuriyet,8
3,0001960a98f9d0d716e75ae49cc0df7a,İstanbul_Kartal_Esentepe,3
4,0001960a98f9d0d716e75ae49cc0df7a,İstanbul_Kartal_Gümüşpınar,5


## Calculating The Rating

In [7]:
userSumListing = ratingDS.groupby(['user_pseudo_id'])['count'].max().reset_index()
userSumListing.columns = ['user_pseudo_id', 'maxCount']

rating = pd.concat([ratingDS.set_index('user_pseudo_id'), userSumListing.set_index('user_pseudo_id')], axis=1,
                   join='inner').reset_index()
rating['rating'] = round(3 * rating['count'] / rating['maxCount']) + 2

final = rating.filter(['user_pseudo_id', 'interest', 'rating'])

final.head()

Unnamed: 0,user_pseudo_id,interest,rating
0,00001238ac28333afc87b94a992f0319,İstanbul_Tuzla_İçmeler,5.0
1,0001960a98f9d0d716e75ae49cc0df7a,İstanbul_Kartal_Cevizli,3.0
2,0001960a98f9d0d716e75ae49cc0df7a,İstanbul_Kartal_Cumhuriyet,3.0
3,0001960a98f9d0d716e75ae49cc0df7a,İstanbul_Kartal_Esentepe,2.0
4,0001960a98f9d0d716e75ae49cc0df7a,İstanbul_Kartal_Gümüşpınar,3.0


## Praparing Data for Suprise 

In [8]:
reader = Reader()
data = Dataset.load_from_df(final[['user_pseudo_id', 'interest', 'rating']], reader)

## Running Various Algorithms And Listing Their RMSE

In [9]:
def calculateRMSEForVariousAlghoritms(data):
    benchmark = []
    # Iterate over all algorithms
    for algorithm in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(),
                      KNNWithZScore(), BaselineOnly(), CoClustering()]:
        # Perform cross validation
        results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)

        # Get results & append algorithm name
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
        benchmark.append(tmp)

    surprise_results = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')
    print(surprise_results)
    

calculateRMSEForVariousAlghoritms(data)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...


## Generating Model By Using KNN

In [14]:
trainset, testset = train_test_split(data, test_size=0.80)
algo = KNNBasic()
algo.fit(trainset)
predictions = algo.test(testset)
accuracy.rmse(predictions)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.1958


1.1958020920388965

## List the 5 Closest Neighbours of İstabul Kağıthane Nurtepe

In [18]:
id = algo.trainset.to_inner_iid('İstanbul_Kağıthane_Nurtepe')

print("****" + algo.trainset.to_raw_iid(id) + "****")
neighbors = algo.get_neighbors(id, k=5)
neighbors = (algo.trainset.to_raw_iid(id) for id in neighbors)

for n in neighbors:
    print(n)

****İstanbul_Kağıthane_Nurtepe****
İstanbul_Bahçelievler_Bahçelievler
İstanbul_Ümraniye_Armağanevler
İstanbul_Esenler_Namık Kemal
İstanbul_Kartal_Çavuşoğlu
İstanbul_Bakırköy_Şenlikköy
